diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e7ffe72b5..6bb0e5f51f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
         "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+message(STATUS "AR tools: ${CMAKE_AR}")
+
 if(WIN32)
     set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 10b633a4fc..df159a334e 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy(
         ).ReduceStrategy.AllReduce
-    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
 
     avg_loss = train_args[0]
 
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index fc204dc919..ba8b5fc6c8 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index c5754da59b..d96da470b3 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
     # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
     #    checkout and clean other dirs under third_party
     # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
     URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index ae2679db4a..142fce816d 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 SET(TIME_VERSION "2019.0.1.20181227")
 IF(WIN32)
     SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
     SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
@@ -43,7 +43,7 @@ ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
     SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
     SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 11a5b1b455..34c6cbd73d 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index df3497de20..032da0cad8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -10,9 +10,12 @@ paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=No
 paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
 paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
+paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
+paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
+paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'f
 paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2'))
 paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093'))
 paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
@@ -56,8 +59,14 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr
 paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95'))
 paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2'))
 paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
-paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c'))
+paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
 paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
+paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a3fefec8bacd6ce83f49906a9d05e779'))
+paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '7abd9cf7d695bab5bb6cf7ded5903cb2'))
+paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7'))
+paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb'))
+paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d'))
 paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -68,7 +77,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor
 paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee'))
 paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29'))
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e'))
 paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a'))
 paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6'))
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d'))
@@ -91,7 +100,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
 paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
 paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
 paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497'))
-paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d'))
+paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464'))
 paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab'))
 paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb'))
 paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9'))
@@ -109,7 +118,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
-paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f'))
+paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a'))
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
@@ -205,7 +214,7 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None,
 paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35'))
 paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))
@@ -222,6 +231,7 @@ paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label'
 paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
+paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
 paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
@@ -229,7 +239,7 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non
 paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0'))
 paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
 paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
-paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b'))
+paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709'))
 paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b'))
 paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -255,6 +265,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=
 paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
+paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -277,7 +288,7 @@ paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywo
 paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d'))
 paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a'))
 paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0'))
-paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95'))
+paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
 paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
 paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7'))
@@ -293,13 +304,16 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
 paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
-paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
+paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb'))
 paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
 paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
+paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
 paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
 paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
 paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
@@ -327,7 +341,8 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
+paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
@@ -364,7 +379,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init',
 paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
 paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
 paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
-paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
+paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
@@ -372,26 +387,12 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args',
 paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
 paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
-paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0'))
-paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6'))
-paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
-paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
-paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7'))
-paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196'))
-paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8'))
+paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80'))
+paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
+paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
+paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
+paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
+paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
 paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
 paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
@@ -428,48 +429,59 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo
 paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
 paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -490,7 +502,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne
 paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753'))
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62'))
 paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca'))
 paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85'))
 paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -508,17 +520,18 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
+paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
 paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
 paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
 paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
 paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
 paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
 paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
-paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560'))
+paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
 paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69'))
+paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
 paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
 paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
-paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a'))
-paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8'))
+paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2'))
+paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2'))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index b9491c953f..ad19d729eb 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -174,7 +174,7 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
-target_link_libraries(executor garbage_collector)
+target_link_libraries(executor garbage_collector while_op_helper)
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index dc308fd259..7a371af510 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -9,6 +9,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
 cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
+cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
@@ -22,6 +23,8 @@ endif()
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
+    nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda variable_visitor)
     if(WITH_DISTRIBUTE)
         nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
             ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@@ -35,6 +38,8 @@ if(WITH_GPU)
 else()
     cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
              variable_visitor)
+    cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            variable_visitor)
     if(WITH_DISTRIBUTE)
         cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
             ddim selected_rows_functor sendrecvop_rpc)
@@ -46,9 +51,7 @@ else()
     cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
 
-cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 if(WITH_GPU)
 cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@@ -61,14 +64,17 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+
+cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
 
 set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
@@ -97,5 +103,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
         fuse_elewise_add_act_pass multi_batch_merge_pass 
-        fuse_relu_depthwise_conv_pass
-        memory_optimize_pass lock_free_optimize_pass)
+        fuse_relu_depthwise_conv_pass 
+        memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index c1f9c2b60c..fdaff08e53 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -11,9 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
-
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
+
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
new file mode 100644
index 0000000000..fbc8bbf56b
--- /dev/null
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
@@ -0,0 +1,393 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0 KB
+              "fuse_parameter_memory_size is up limited memory size "
+              "of one group parameters' gradient which is the input "
+              "of communication calling(e.g NCCLAllReduce). "
+              "The default value is 0, it means that "
+              "not set group according to memory_size.");
+DEFINE_int32(
+    fuse_parameter_groups_size, 3,
+    "fuse_parameter_groups_size is the size of one group parameters' gradient. "
+    "The default value is a experimental result. If the "
+    "fuse_parameter_groups_size is 1, it means that the groups size is "
+    "the number of parameters' gradient. If the fuse_parameter_groups_size is "
+    "-1, it means that there are only one group. The default value is 3, it is "
+    "an experimental value.");
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static const char kUnKnow[] = "@UNKNOW@";
+static framework::proto::VarType::Type kDefaultDtype =
+    framework::proto::VarType::Type::VarType_Type_BOOL;
+
+class AllocContinuousSpaceForGradPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    ir::Graph &result = *graph;
+
+    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+
+    ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
+    ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
+
+    // NOTE: The operator nodes should be in topology order.
+    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    for (auto &node : topo_nodes) {
+      RecordParamsAndGrads(node, &params_grads);
+    }
+
+    if (params_grads.size() == 0) {
+      VLOG(10) << "Doesn't find gradients";
+      return std::move(graph);
+    }
+
+    std::unordered_map<std::string, ir::Node *> vars;
+    for (ir::Node *node : result.Nodes()) {
+      if (node->IsVar() && node->Var()) {
+        // Note: The graph may have the same name node. For example, parameter
+        // is the input of operator and it also is the output of optimizer;
+        vars.emplace(node->Var()->Name(), node);
+      }
+    }
+
+    auto &group_grads_params =
+        result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
+
+    // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
+    SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
+
+    params_grads.clear();
+    for (auto &group_p_g : group_grads_params) {
+      params_grads.insert(params_grads.begin(), group_p_g.begin(),
+                          group_p_g.end());
+    }
+    for (auto &p_g : params_grads) {
+      std::swap(p_g.first, p_g.second);
+    }
+
+    // Set Gradients as Persistable to prevent this var becoming reusable.
+    auto dtype = kDefaultDtype;
+    for (auto &p_g : params_grads) {
+      // Get gradient var
+      auto iter = vars.find(p_g.second);
+      PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
+      iter->second->Var()->SetPersistable(true);
+
+      PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+
+      // Get Dtype
+      auto ele_dtype = iter->second->Var()->GetDataType();
+      if (dtype == kDefaultDtype) {
+        dtype = ele_dtype;
+        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
+      }
+      PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+    }
+
+    // Create the fused variable name.
+    if (!result.Has(kFusedVars)) {
+      result.Set(kFusedVars, new FusedVars);
+    }
+    const std::string prefix(kFusedVarNamePrefix);
+    // The fused_var_name should be unique.
+    auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
+    auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
+    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
+    fused_var_set.insert(fused_var_name);
+
+    InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
+                                      fused_var_name, params_grads);
+
+    return std::move(graph);
+  }
+
+  template <typename AttrType>
+  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
+    if (graph->Has(attr_name)) {
+      VLOG(10) << attr_name << " is reset.";
+      graph->Erase(attr_name);
+    }
+    graph->Set(attr_name, new AttrType);
+  }
+
+  void SetGroupGradsAndParams(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const {
+    SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
+    SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
+    SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
+  }
+
+  void SetGroupAccordingToLayers(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const {
+    std::unordered_map<std::string, std::vector<int>> layer_params;
+
+    for (size_t i = 0; i < params_grads.size(); ++i) {
+      auto pos = params_grads[i].first.find_first_of(".");
+      if (pos == std::string::npos) {
+        layer_params[std::string(kUnKnow)].emplace_back(i);
+      } else {
+        layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
+      }
+    }
+
+    group_grads_params->reserve(layer_params.size());
+    for (size_t i = 0; i < params_grads.size(); ++i) {
+      auto pos = params_grads[i].first.find_first_of(".");
+      std::string key = kUnKnow;
+      if (pos != std::string::npos) {
+        key = params_grads[i].first.substr(0, pos);
+      }
+      auto iter = layer_params.find(key);
+      if (iter == layer_params.end()) continue;
+
+      group_grads_params->emplace_back();
+      auto &local_group_grads_params = group_grads_params->back();
+      for (auto &idx : iter->second) {
+        local_group_grads_params.emplace_back(
+            std::make_pair(params_grads[idx].second, params_grads[idx].first));
+      }
+      layer_params.erase(iter);
+    }
+
+    VLOG(10) << "SetGroupAccordingToLayers: ";
+    for (size_t i = 0; i < group_grads_params->size(); ++i) {
+      VLOG(10) << "group " << i;
+      std::stringstream out;
+      for (auto &p_g : group_grads_params->at(i)) {
+        out << "(" << p_g.second << ", " << p_g.first << "), ";
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+  void SetGroupAccordingToMemorySize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const {
+    if (FLAGS_fuse_parameter_memory_size == 0) {
+      return;
+    }
+    size_t group_memory_size =
+        static_cast<size_t>(FLAGS_fuse_parameter_memory_size);
+    GroupGradsAndParams local_group_grads_params;
+
+    size_t j = 0;
+    while (j < group_grads_params->size()) {
+      local_group_grads_params.emplace_back();
+      auto &group_p_g = local_group_grads_params.back();
+      size_t local_group_memory_size = 0;
+      while (j < group_grads_params->size()) {
+        std::for_each(
+            group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
+            [&local_group_memory_size,
+             &var_nodes](const std::pair<std::string, std::string> &g_p) {
+              auto iter = var_nodes.find(g_p.second);
+              PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
+                             g_p.second);
+              auto shape = iter->second->Var()->GetShape();
+              size_t size =
+                  framework::SizeOfType(iter->second->Var()->GetDataType());
+              std::for_each(shape.begin(), shape.end(),
+                            [&size](const int64_t &n) { size *= n; });
+              local_group_memory_size += size;
+            });
+        group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
+                         group_grads_params->at(j).end());
+        ++j;
+        if (local_group_memory_size >= group_memory_size) {
+          break;
+        }
+      }
+    }
+
+    std::swap(*group_grads_params, local_group_grads_params);
+
+    VLOG(10) << string::Sprintf(
+        "SetGroupAccordingToMemorySize(memory_size: %d):",
+        FLAGS_fuse_parameter_memory_size);
+    for (size_t i = 0; i < group_grads_params->size(); ++i) {
+      VLOG(10) << "group " << i;
+      std::stringstream out;
+      for (auto &g_p : group_grads_params->at(i)) {
+        auto iter = var_nodes.find(g_p.second);
+        PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
+        auto shape = iter->second->Var()->GetShape();
+        size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
+        std::for_each(shape.begin(), shape.end(),
+                      [&size](const int64_t &n) { size *= n; });
+        out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+  void SetGroupAccordingToGroupSize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const {
+    if (FLAGS_fuse_parameter_groups_size == 1) {
+      return;
+    }
+    size_t group_size = static_cast<size_t>(FLAGS_fuse_parameter_groups_size);
+    if (FLAGS_fuse_parameter_groups_size == -1) {
+      group_size = group_grads_params->size();
+    }
+    PADDLE_ENFORCE_GT(group_size, 1);
+    size_t groups = (group_grads_params->size() + group_size - 1) / group_size;
+    GroupGradsAndParams local_group_grads_params;
+    local_group_grads_params.reserve(groups);
+
+    size_t j = 0;
+    for (size_t i = 0; i < groups; ++i) {
+      local_group_grads_params.emplace_back();
+      auto &group_p_g = local_group_grads_params.back();
+      group_p_g.reserve(group_size);
+      while (j < group_grads_params->size()) {
+        group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
+                         group_grads_params->at(j).end());
+        ++j;
+        if (j % group_size == 0) break;
+      }
+    }
+    std::swap(*group_grads_params, local_group_grads_params);
+
+    VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size
+             << "): ";
+    for (size_t i = 0; i < group_grads_params->size(); ++i) {
+      VLOG(10) << "group " << i;
+      std::stringstream out;
+      for (auto &p_g : group_grads_params->at(i)) {
+        out << "(" << p_g.second << ", " << p_g.first << "), ";
+      }
+      VLOG(10) << out.str();
+    }
+  }
+
+ private:
+  bool IsSupportedVarType(const proto::VarType::Type &type) const {
+    // Current only support LOD_TENSOR.
+    return type == proto::VarType::LOD_TENSOR;
+  }
+
+  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
+                                 const std::vector<std::string> &grads_name,
+                                 const std::string &fused_var_name,
+                                 BlockDesc *global_block) const {
+    auto op_desc = global_block->AppendOp();
+    op_desc->SetType("alloc_continuous_space");
+    op_desc->SetInput("Input", params_name);
+    op_desc->SetOutput("Output", grads_name);
+    op_desc->SetOutput("FusedOutput", {fused_var_name});
+  }
+
+  void RecordParamsAndGrads(ir::Node *node,
+                            ParamsAndGrads *params_grads) const {
+    try {
+      bool is_bk_op =
+          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kBackward));
+      if (!is_bk_op) return;
+
+      // Currently, we assume that once gradient is generated, it can be
+      // broadcast, and each gradient is only broadcast once.
+      auto backward_vars =
+          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
+
+      for (size_t i = 0; i < backward_vars.size(); i += 2) {
+        VLOG(10) << "Trainable parameter: " << backward_vars[i]
+                 << ", gradient: " << backward_vars[i + 1];
+
+        params_grads->emplace_back(std::make_pair(
+            backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
+      }
+    } catch (boost::bad_get e) {
+    }
+  }
+
+  void InitFusedVarsAndAllocSpaceForVars(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::unordered_map<std::string, ir::Node *> &vars,
+      const std::string &fused_var_name,
+      const ParamsAndGrads &params_grads) const {
+    //  Init Gradients and FusedVars
+    VLOG(10) << "Init FusedVars and Gradients.";
+    for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+      auto &scope = *it;
+
+      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                     "%s has existed in scope.", fused_var_name);
+      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+
+      for (auto &p_g : params_grads) {
+        auto iter = vars.find(p_g.second);
+        PADDLE_ENFORCE(iter != vars.end());
+        PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+        PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                          proto::VarType::LOD_TENSOR);
+        scope->Var(p_g.second)->GetMutable<LoDTensor>();
+      }
+    }
+
+    std::vector<std::string> grads_name;
+    std::vector<std::string> params_name;
+    grads_name.reserve(params_grads.size());
+    params_name.reserve(params_grads.size());
+    for (auto &p_g : params_grads) {
+      params_name.emplace_back(p_g.first);
+      grads_name.emplace_back(p_g.second);
+    }
+    framework::ProgramDesc program_desc;
+    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
+                              program_desc.MutableBlock(0));
+
+    // Run Only Once Programs
+    for (size_t i = 0; i < local_scopes.size(); ++i) {
+      for (auto &op_desc : program_desc.Block(0).AllOps()) {
+        auto op = OpRegistry::CreateOp(*op_desc);
+        op->Run(*local_scopes[i], places[i]);
+      }
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(alloc_continuous_space_for_grad_pass,
+              paddle::framework::details::AllocContinuousSpaceForGradPass)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes);
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 0c75e05f86..0b4d335135 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
-  bool IsMultiDeviceTransfer() override { return false; };
+  bool IsMultiDeviceTransfer() override { return true; };
 
  protected:
   void RunImpl() override;
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 2cfc76e47f..5d9db23753 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <memory>
+#include <utility>
 
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
@@ -45,12 +46,27 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
       : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+
     if (strategy_.enable_sequential_execution_) {
+      VLOG(10) << "Add sequential_execution_pass";
       AppendPass("sequential_execution_pass");
     }
 
+    // Add op fusion.
+    if (strategy.sync_batch_norm_) {
+      AppendPass("sync_batch_norm_pass");
+    }
+
     // Add op fusion.
     if (strategy.fuse_relu_depthwise_conv_) {
+      VLOG(10) << "Add fuse_relu_depthwise_conv_pass";
       AppendPass("fuse_relu_depthwise_conv_pass");
     }
 
@@ -62,29 +78,30 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 
     // Add automatically inplace.
     if (strategy_.enable_inplace_) {
+      VLOG(10) << "Add inplace_pass";
       AppendPass("inplace_pass");
     }
 
+    if (strategy.fuse_elewise_add_act_ops_) {
+      VLOG(10) << "Add fuse_elewise_add_act_pass";
+      AppendPass("fuse_elewise_add_act_pass");
+    }
+
+    // for single card training, fuse_all_reduce_ops is unnecessary.
+    // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
+    if (strategy.fuse_all_reduce_ops_) {
+      VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
+      AppendPass("alloc_continuous_space_for_grad_pass");
+    }
+
     // Add a graph viz pass to record a graph.
-    if (!strategy_.debug_graphviz_path_.empty()) {
+    if (!strategy.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
       const std::string graph_path = string::Sprintf(
-          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
       viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
     }
 
-    if (strategy.fuse_elewise_add_act_ops_) {
-      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
-      // Add a graph viz pass to record a graph.
-      if (!strategy.debug_graphviz_path_.empty()) {
-        auto viz_pass = AppendPass("graph_viz_pass");
-        const std::string graph_path = string::Sprintf(
-            "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-        viz_pass->Set<std::string>("graph_viz_path",
-                                   new std::string(graph_path));
-      }
-    }
-
     CollectiveContext *context = CollectiveContext::GetInstance();
     context->endpoints_ = strategy_.trainers_endpoints_;
     context->trainer_id_ = strategy_.trainer_id_;
@@ -102,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // A side-effect of that, memory optimize cannot forsee the fetched vars
     // , so fetchlist should be set persistable before call the Run interface.
     if (strategy.memory_optimize_) {
-      auto memory_optimize_pass = AppendPass("memory_optimize_pass");
+      VLOG(10) << "Add memory_optimize_pass";
+      AppendPass("memory_optimize_pass");
     }
 
     AppendMultiDevPass(strategy);
 
+    if (strategy.fuse_all_reduce_ops_) {
+      // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
+      // first, if the number is zero, fuse_all_reduce_ops will do nothing.
+      VLOG(10) << "Add fuse_all_reduce_op_pass";
+      AppendPass("fuse_all_reduce_op_pass");
+    }
+
     // Add a graph print pass to record a graph with device info.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
@@ -122,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Verify that the graph is correct for multi-device executor.
     AppendPass("multi_devices_check_pass");
 
+    if (VLOG_IS_ON(2)) {
+      AppendPass("all_reduce_deps_pass");
+    }
+
     if (SeqOnlyAllReduceOps(strategy)) {
+      VLOG(10) << "Add all_reduce_deps_pass";
       AppendPass("all_reduce_deps_pass");
     }
 
     if (strategy_.remove_unnecessary_lock_) {
+      VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
   }
 
   // Convert graph to run on multi-devices.
   void AppendMultiDevPass(const BuildStrategy &strategy) {
-    ir::Pass *multi_devices_pass;
+    ir::Pass *multi_devices_pass = nullptr;
     if (strategy_.is_distribution_) {
-      VLOG(3) << "multi device parameter server mode";
+      VLOG(10) << "Add dist_multi_devices_pass";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-        VLOG(3) << "multi devices collective mode with allreduce";
+        VLOG(10) << "Add all_reduce_mode_multi_devices_pass";
         multi_devices_pass =
-            AppendPass("allreduce_mode_multi_devices_pass").get();
+            AppendPass("all_reduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-        VLOG(3) << "multi deivces collective mode with reduce";
+        VLOG(10) << "Add reduce_mode_multi_devices_pass";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
@@ -200,9 +231,26 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase("nccl_ctxs");
-      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+#endif
+    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
+      pass->Erase(kPlaces);
+      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
+      pass->Erase(kLocalScopes);
+      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
+                                                    &local_scopes);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
 #endif
+    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
+      pass->Erase(kPlaces);
+      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
+      pass->Erase(kLocalScopes);
+      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
+                                                    &local_scopes);
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
@@ -227,12 +275,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 }  // namespace framework
 }  // namespace paddle
 
+USE_PASS(sync_batch_norm_pass);
 USE_PASS(fuse_relu_depthwise_conv_pass);
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
 USE_PASS(reduce_mode_multi_devices_pass);
-USE_PASS(allreduce_mode_multi_devices_pass);
+USE_PASS(all_reduce_mode_multi_devices_pass);
 USE_PASS(dist_multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
@@ -242,4 +291,6 @@ USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(inplace_pass);
 USE_PASS(lock_free_optimize_pass);
+USE_PASS(alloc_continuous_space_for_grad_pass);
 USE_PASS(graph_to_program_pass);
+USE_PASS(fuse_all_reduce_op_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index d755a2505a..4b599fb914 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/pass_builder.h"
@@ -75,8 +76,12 @@ struct BuildStrategy {
 
   bool fuse_elewise_add_act_ops_{false};
 
+  bool fuse_all_reduce_ops_{false};
+
   bool fuse_relu_depthwise_conv_{false};
 
+  bool sync_batch_norm_{false};
+
   bool memory_optimize_{true};
   // TODO(dzhwinter):
   // make enable_inplace, memory_optimize_
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index 1e3dbb1e44..e98b16e6b3 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
   ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
                       size_t scope_idx);
 
+  OperatorBase *GetOp() { return op_.get(); }
+
   std::string Name() const override;
 
   const Scope *GetScope() const { return scope_; }
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
deleted file mode 100644
index c9b52b6820..0000000000
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include <algorithm>
-#include "paddle/fluid/framework/details/container_cast.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  if (ctxs) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, ctxs->DevCtx(p));
-    }
-  }
-}
-#else
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-std::string DataBalanceOpHandle::Name() const { return "data balance"; }
-
-std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
-    const std::vector<int> &device_sizes) {
-  int device_num = device_sizes.size();
-  int total_size = 0;
-  int empty_num = 0;
-  std::vector<std::array<int, 2>> size_device_vec;
-  size_device_vec.reserve(device_num);
-  for (int i = 0; i < device_num; ++i) {
-    if (device_sizes[i] == 0) {
-      ++empty_num;
-    }
-    total_size += device_sizes[i];
-    size_device_vec.push_back({{device_sizes[i], i}});
-  }
-  std::vector<std::array<int, 3>> res;
-  if (empty_num == 0) {
-    // No need to do data balance.
-    return res;
-  }
-  if (total_size < device_num) {
-    // No enough data.
-    PADDLE_THROW_EOF();
-  }
-  std::sort(size_device_vec.begin(), size_device_vec.end(),
-            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
-              return a[0] > b[0];
-            });
-  int expected_device_size = total_size / device_num;
-  int src_idx = 0;
-  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
-    if (size_device_vec[src_idx][0] <= expected_device_size) {
-      ++src_idx;
-      PADDLE_ENFORCE_LT(
-          src_idx, device_num - empty_num,
-          "In current srategy an empty tensor should not be copy source.");
-    }
-    size_device_vec[src_idx][0] -= expected_device_size;
-    size_device_vec[dst_idx][0] += expected_device_size;
-    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
-                    expected_device_size}});
-  }
-  return res;
-}
-
-void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1UL,
-                    "Data balance can only be enabled when the number of "
-                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  int data_num = in_var_handles.size() / places_.size();
-  WaitInputVarGenerated();
-  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
-  std::vector<int> device_sizes;
-  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
-    int place_idx = i / data_num;
-    int data_idx = i % data_num;
-    auto *local_scope =
-        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
-    auto *tensor = tensor_var->GetMutable<LoDTensor>();
-    lod_tensors[data_idx].push_back(tensor);
-    int ins_size =
-        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
-    if (data_idx == 0) {
-      device_sizes.emplace_back(ins_size);
-    } else {
-      PADDLE_ENFORCE_EQ(
-          ins_size, device_sizes.at(place_idx),
-          "All data on the same device shall have the same batch size.");
-    }
-  }
-  const auto &balance_plan = GetBalancePlan(device_sizes);
-
-  for (const auto &trans : balance_plan) {
-    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
-      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
-      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
-      int trans_ins_size = trans[2];
-      LoD src_lod = src_tensor->lod();
-      int src_ins_size =
-          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
-      int cut_point = src_ins_size - trans_ins_size;
-      if (!src_lod.empty()) {
-        for (auto &level : src_lod) {
-          cut_point = level[cut_point];
-        }
-      }
-      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
-                     dst_tensor->place(), dst_tensor);
-      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
-      if (!src_lod.empty()) {
-        dst_tensor->set_lod(SliceInLevel(
-            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
-        src_tensor->set_lod(
-            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
-      }
-    }
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
deleted file mode 100644
index 2db18a1a72..0000000000
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct DataBalanceOpHandle : public OpHandleBase {
- public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places,
-                      const platform::NCCLContextMap *ctxs);
-#else
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places);
-#endif
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
- private:
-  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
-  std::vector<std::array<int, 3>> GetBalancePlan(
-      const std::vector<int> &batch_size_per_device);
-
-  const std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> places_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 03fbfd7f24..dbc90737f2 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -12,6 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
+#include <unordered_set>
+#include <utility>
+
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
@@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     }
   }
 #endif
+  PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
 }
 
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
@@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
 
 void EagerDeletionOpHandle::RunImpl() {
-  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  Scope *exec_scope = nullptr;
   std::deque<std::shared_ptr<memory::Allocation>> garbages;
   for (auto &name : var_names_) {
     auto it = ref_cnts_->find(name);
-    // Var not found, not reference count has not decreased to 0
+    // Reference count has not decreased to 0
     if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
       continue;
     }
 
+    if (!exec_scope) {
+      exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    }
+
+    // Var not found
     auto *var = exec_scope->FindVar(name);
     if (var == nullptr) {
       continue;
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc
index 4e42d0b497..377bb915e0 100644
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -12,20 +12,173 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <algorithm>
+#include <functional>
 #include <queue>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/details/eager_deletion_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to its memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
+
 namespace paddle {
 namespace framework {
 namespace details {
 
+// op -> variables which can be deleted after op runs
+using OpToVarNameSetMap =
+    std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
+
+// Check whether the variable is LoDTensor based on static VarDesc info
+static bool IsLoDTensor(VarDesc *var) {
+  return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
+}
+
+// Get memory size of LoDTensor
+static int64_t GetMemorySize(
+    const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
+    const std::string &var_name) {
+  auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
+  PADDLE_ENFORCE_NOT_NULL(var_desc);
+  PADDLE_ENFORCE(IsLoDTensor(var_desc));
+  auto dims = var_desc->GetShape();
+  return SizeOfType(var_desc->GetDataType()) *
+         std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
+                         std::multiplies<int64_t>());
+}
+
+// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
+// SelectedRows, LoDTensorArray)
+// Since partial GC is based on static analysis of memory size of each variable
+// So we should skip SelectedRows and LoDTensorArray here
+static void SplitIntoLoDTensorAndNonLoDTensorVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
+  lod_tensors->clear();
+  other_vars->clear();
+
+  for (auto &op_vars_pair : m) {
+    for (auto &var_name : op_vars_pair.second) {
+      auto *var_desc = TryGetLatestVarDesc(
+          vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
+      if (IsLoDTensor(var_desc)) {
+        (*lod_tensors)[op_vars_pair.first].insert(var_name);
+      } else {
+        (*other_vars)[op_vars_pair.first].insert(var_name);
+      }
+    }
+  }
+}
+
+struct GCVarInfo {
+  GCVarInfo(const std::string &name, int64_t memory_size,
+            ComputationOpHandle *op, size_t scope_idx)
+      : name_(name),
+        memory_size_(memory_size),
+        op_(op),
+        scope_idx_(scope_idx) {}
+
+  std::string name_;         // variable name
+  int64_t memory_size_;      // memory size
+  ComputationOpHandle *op_;  // op after which the variable could be deleted
+  size_t scope_idx_;         // scope index where the variable locates
+
+  int64_t AbsMemorySize() const { return std::abs(memory_size_); }
+};
+
+// Delete delete_lod_tensor_only is not used currently
+static OpToVarNameSetMap ShrinkGCVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    const std::vector<platform::Place> &places, double fraction_of_memory_size,
+    bool delete_lod_tensor_only = false) {
+  // Do not perform gc when fraction_of_memory_size = 0
+  if (fraction_of_memory_size <= 0.0) return {};
+
+  /**
+   * Step 1: Split all variables into LoDTensor and Non-LoDTensor.
+   * We can only calculate memory size of LoDTensors
+   */
+  OpToVarNameSetMap lod_tensors, other_vars;
+  SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
+
+  // Perform complete gc when fraction_of_memory_size >= 1
+  if (fraction_of_memory_size >= 1.0) {
+    return delete_lod_tensor_only ? lod_tensors : m;
+  }
+
+  /**
+   * Step 2: build GCVarInfos, and calculate total memory sizes of each device
+   */
+
+  // place -> variable info (name, memory size, place, scope_idx)
+  std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
+
+  // place -> total memory sizes
+  std::map<platform::Place, int64_t> place_to_size;
+  for (auto &op_vars_pair : lod_tensors) {
+    auto *op = op_vars_pair.first;
+    auto &var_names = op_vars_pair.second;
+    auto scope_idx = op->GetScopeIdx();
+    auto &place = places[scope_idx];
+
+    for (auto &var_name : var_names) {
+      auto var_size = GetMemorySize(vars[scope_idx], var_name);
+      GCVarInfo var_info(var_name, var_size, op, scope_idx);
+      place_to_size[place] += var_info.AbsMemorySize();
+      place_to_vars[place].emplace_back(std::move(var_info));
+    }
+  }
+
+  /**
+   * Step 3: sort GCVarInfos, and only delete the largest variables.
+   */
+  OpToVarNameSetMap partial_vars;
+  for (auto &place_to_var_pair : place_to_vars) {
+    auto &place = place_to_var_pair.first;
+    auto &gc_vars = place_to_var_pair.second;
+    std::sort(gc_vars.begin(), gc_vars.end(),
+              [](const GCVarInfo &var1, const GCVarInfo &var2) {
+                return var1.AbsMemorySize() > var2.AbsMemorySize();
+              });
+
+    int64_t accumulated_size = 0;
+    int64_t size_threshold =
+        static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
+    for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
+         ++i) {
+      partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
+      accumulated_size += gc_vars[i].AbsMemorySize();
+    }
+  }
+
+  /**
+   * Step 4: Combine other vars (SelectedRows, LoDTensorArray)
+   */
+  if (!delete_lod_tensor_only) {
+    for (auto &op_vars_pair : other_vars) {
+      partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
+                                              op_vars_pair.second.end());
+    }
+  }
+
+  return partial_vars;
+}
+
+class EagerDeletionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
 std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts =
@@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
 
   // a reverse map of last_live_ops
   //   i.e., last op --> variable names which can be deleted.
-  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
-      op_vars_map;
-
+  OpToVarNameSetMap op_vars_map;
   for (auto &var_ops_map : last_live_ops) {
     for (auto &var_ops_pair : var_ops_map) {
       const std::string &var_name = var_ops_pair.first;
@@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     }
   }
 
+  op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
+                             FLAGS_memory_fraction_of_eager_deletion);
+
   for (auto &pair : op_vars_map) {
     auto *op = pair.first;
     auto &var_names = pair.second;
@@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     eager_deletion_op->AddOutput(dummy_leaf);
   }
 
+  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
+           << FLAGS_memory_fraction_of_eager_deletion;
   VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
-  return graph;
+
+  auto while_op_eager_deletion_pass =
+      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
+  return while_op_eager_deletion_pass->Apply(std::move(graph));
 }
 
 }  // namespace details
@@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
     .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
     .RequirePassAttr(paddle::framework::details::kAllPlaces)
     .RequirePassAttr(paddle::framework::details::kGarbageCollector);
+
+USE_PASS(while_op_eager_deletion_pass);
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index bbf81e1b8e..232d82a5da 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
+
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 6ce42f92d7..dbb7f4f658 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  bool IsMultiDeviceTransfer() override;
+
  protected:
   void RunImpl() override;
 
diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
new file mode 100644
index 0000000000..f226491c9f
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
@@ -0,0 +1,195 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class FuseAllReduceOpPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    ir::Graph &result = *graph;
+
+    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
+#endif
+
+    std::unordered_set<std::string> grads;
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    size_t num_of_all_reduce = params_grads.size();
+    grads.reserve(num_of_all_reduce);
+    for (auto p_g : params_grads) {
+      grads.insert(p_g.second);
+    }
+
+    size_t num_place = places.size();
+    std::unordered_map<std::string, ir::Node *> all_reduce_ops;
+    all_reduce_ops.reserve(grads.size());
+    for (auto &node : result.Nodes()) {
+      if (node->IsOp()) {
+        PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
+        auto *all_reduce_op_handle =
+            dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
+        if (all_reduce_op_handle) {
+          auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
+          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
+          // The inputs' name should be the same.
+          auto &grad_name = inputs[0]->name();
+          for (size_t i = 1; i < inputs.size(); ++i) {
+            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
+                              "The input name should be the same.");
+          }
+          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
+          all_reduce_ops.emplace(grad_name, node);
+        }
+      }
+    }
+
+    VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
+    if (all_reduce_ops.size() == 0) {
+      return std::move(graph);
+    }
+
+    PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
+                      "The number of all_reduce OpHandle is not equal to the "
+                      "number of grads. Maybe some gradients are sparse type, "
+                      "it is not supported currently.");
+    VLOG(10) << "Insert fused_all_reduce";
+
+    auto &group_grads_params =
+        graph->Get<GroupGradsAndParams>(kGroupGradsAndParams);
+
+    for (auto &group_g_p : group_grads_params) {
+      size_t group_size = group_g_p.size();
+      PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
+      std::vector<ir::Node *> group_all_reduce_ops;
+      group_all_reduce_ops.reserve(group_size);
+      for (auto &g_p : group_g_p) {
+        group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
+      }
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, nccl_ctxs, &result);
+#else
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, &result);
+#endif
+    }
+    return std::move(graph);
+  }
+
+  void InsertFusedAllReduce(const std::vector<platform::Place> &places,
+                            const std::vector<Scope *> &local_scopes,
+                            const size_t num_of_all_reduce,
+                            const std::vector<ir::Node *> &all_reduce_ops,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                            const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                            ir::Graph *result) const {
+    std::vector<VarHandleBase *> inputs;
+    std::vector<VarHandleBase *> outputs;
+    for (auto &op : all_reduce_ops) {
+      auto &op_handle = op->Wrapper<OpHandleBase>();
+      inputs.insert(inputs.end(), op_handle.Inputs().begin(),
+                    op_handle.Inputs().end());
+      // Remove output
+      for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
+               [&op_handle](VarHandleBase *var_handle) {
+                 var_handle->RemoveOutput(&op_handle, op_handle.Node());
+               });
+
+      outputs.insert(outputs.end(), op_handle.Outputs().begin(),
+                     op_handle.Outputs().end());
+      // Remove Input
+      for_each(
+          op_handle.Outputs().begin(), op_handle.Outputs().end(),
+          [](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
+
+      result->RemoveNode(op_handle.Node());
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, nccl_ctxs, result);
+#else
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, result);
+#endif
+  }
+
+ private:
+  void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
+                              const std::vector<VarHandleBase *> &outputs,
+                              const size_t num_of_all_reduce,
+                              const std::vector<platform::Place> &places,
+                              const std::vector<Scope *> &local_scopes,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                              const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                              ir::Graph *result) const {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce, nccl_ctxs);
+#else
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce);
+#endif
+
+    for (auto in : inputs) {
+      op_handle->AddInput(in);
+    }
+
+    for (auto out : outputs) {
+      op_handle->AddOutput(out);
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    if (!nccl_ctxs) {
+      SetCommunicationContext(places, op_handle);
+    }
+#else
+    SetCommunicationContext(places, op_handle);
+#endif
+  }
+
+  void SetCommunicationContext(const std::vector<platform::Place> &places,
+                               FusedAllReduceOpHandle *op_handle) const {
+    for (size_t i = 0; i < places.size(); ++i) {
+      op_handle->SetDeviceContext(
+          places[i], platform::DeviceContextPool::Instance().Get(places[i]));
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_all_reduce_op_pass,
+              paddle::framework::details::FuseAllReduceOpPass);
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
deleted file mode 100644
index 14292c0a5d..0000000000
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-void FuseVarsOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
-  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
-
-  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-  auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name());
-
-  auto out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
-
-  int64_t s = 0;
-  for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name();
-    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
-    auto numel = this->inputs_numel_.at(out_name);
-    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
-    s += numel;
-  }
-  this->RunAndRecordEvent([] {});
-}
-
-std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
deleted file mode 100644
index b40b01df36..0000000000
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct FuseVarsOpHandle : public OpHandleBase {
- public:
-  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
-                   const platform::Place &place,
-                   const std::unordered_map<std::string, int64_t> &inputs_numel,
-                   const proto::VarType::Type var_type)
-      : OpHandleBase(node),
-        local_scope_(local_scope),
-        place_(place),
-        inputs_numel_(inputs_numel),
-        type_(var_type) {
-    total_numel_ = 0;
-    for (auto in_numel : inputs_numel) {
-      PADDLE_ENFORCE_GT(in_numel.second, 0);
-      total_numel_ += in_numel.second;
-    }
-  }
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
- private:
-  Scope *local_scope_;
-  const platform::Place place_;
-  const std::unordered_map<std::string, int64_t> inputs_numel_;
-  const proto::VarType::Type type_;
-  int64_t total_numel_;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
new file mode 100644
index 0000000000..644cd4e150
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -0,0 +1,249 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+#include <algorithm>
+#include <utility>
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(skip_fused_all_reduce_check, false, "");
+namespace paddle {
+namespace framework {
+namespace details {
+
+typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
+    GradientAndLoDTensor;
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
+    const platform::NCCLContextMap *ctxs)
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      num_of_all_reduce_(num_of_all_reduce),
+      nccl_ctxs_(ctxs) {
+  if (nccl_ctxs_) {
+    for (auto &p : places_) {
+      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
+    }
+  }
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+}
+#else
+
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      num_of_all_reduce_(num_of_all_reduce) {
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+}
+
+#endif
+
+void FusedAllReduceOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
+
+  VLOG(4) << this->DebugString();
+
+  WaitInputVarGenerated();
+  // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
+  // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+
+  size_t place_num = places_.size();
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), place_num * num_of_all_reduce_,
+      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+  GradientAndLoDTensor grads_tensor;
+  grads_tensor.resize(place_num);
+
+  int64_t numel = -1;
+  auto dtype = static_cast<framework::proto::VarType::Type>(0);
+  for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
+    auto &g_tensor = grads_tensor.at(scope_idx);
+    g_tensor.reserve(num_of_all_reduce_);
+
+    GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
+
+    int64_t element_num = 0;
+    framework::proto::VarType::Type ele_dtype =
+        static_cast<framework::proto::VarType::Type>(0);
+    GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
+
+    if (numel == -1) {
+      numel = element_num;
+    }
+    if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
+      dtype = ele_dtype;
+      PADDLE_ENFORCE_NE(ele_dtype,
+                        static_cast<framework::proto::VarType::Type>(0));
+    }
+    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+
+    // Check whether the address space is contiguous.
+    std::sort(
+        g_tensor.begin(), g_tensor.end(),
+        [](const std::pair<std::string, const LoDTensor *> &grad1,
+           const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
+          return grad1.second->data<void>() < grad2.second->data<void>();
+        });
+
+    for (size_t k = 1; k < g_tensor.size(); ++k) {
+      const void *cur_address = g_tensor.at(k - 1).second->data<void>();
+      int64_t len = g_tensor.at(k - 1).second->numel();
+      auto offset = len * framework::SizeOfType(dtype);
+      void *infer_next_address = reinterpret_cast<void *>(
+          reinterpret_cast<uintptr_t>(cur_address) + offset);
+      const void *next_address = g_tensor.at(k).second->data<void>();
+
+      VLOG(10) << string::Sprintf(
+          "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
+          "input[%d] address: 0X%02x. The offset: %d",
+          k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
+          next_address, k, infer_next_address, offset);
+      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
+                        "The address is not consistent.");
+    }
+  }
+
+  if (!FLAGS_skip_fused_all_reduce_check) {
+    for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
+      for (size_t j = 1; j < num_of_all_reduce_; ++j) {
+        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
+                          grads_tensor.at(scope_idx).at(j).first);
+      }
+    }
+  }
+
+  std::vector<const void *> lod_tensor_data;
+  for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
+    auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
+    lod_tensor_data.emplace_back(data);
+  }
+
+  if (platform::is_gpu_place(places_[0])) {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    int nccl_dtype = platform::ToNCCLDataType(dtype);
+    std::vector<std::function<void()>> all_reduce_calls;
+    for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      auto &p = places_[i];
+      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
+
+      int dev_id = boost::get<platform::CUDAPlace>(p).device;
+      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+      auto stream = nccl_ctx.stream();
+      auto comm = nccl_ctx.comm_;
+      all_reduce_calls.emplace_back([=] {
+        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+            buffer, buffer, numel, static_cast<ncclDataType_t>(nccl_dtype),
+            ncclSum, comm, stream));
+      });
+    }
+
+    this->RunAndRecordEvent([&] {
+      if (all_reduce_calls.size() == 1UL) {
+        // Do not use NCCLGroup when manage NCCL by per thread per device
+        all_reduce_calls[0]();
+      } else {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      }
+    });
+#else
+    PADDLE_THROW("Not compiled with CUDA");
+#endif
+  } else {
+    // Special handle CPU only Operator's gradient. Like CRF
+    auto grad_name = grads_tensor.at(0).at(0).first;
+    auto &trg = *this->local_scopes_[0]
+                     ->FindVar(kLocalExecScopeName)
+                     ->Get<Scope *>()
+                     ->FindVar(grad_name)
+                     ->GetMutable<framework::LoDTensor>();
+
+    // Reduce All data to trg in CPU
+    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
+    VisitDataType(trg.type(), func);
+
+    for (size_t i = 1; i < local_scopes_.size(); ++i) {
+      auto &scope =
+          *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+      auto &p = places_[i];
+      auto *var = scope.FindVar(grad_name);
+      auto *dev_ctx = dev_ctxes_.at(p);
+      size_t size = numel * SizeOfType(trg.type());
+      RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
+        auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
+        platform::CPUPlace cpu_place;
+        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
+      });
+    }
+  }
+}
+
+void FusedAllReduceOpHandle::GetGradLoDTensor(
+    const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<VarHandle *> &out_var_handles,
+    std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
+  auto *local_scope =
+      local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  size_t place_num = places_.size();
+
+  for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
+    auto var_name = in_var_handles[j]->name();
+    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
+    auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
+    grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
+  }
+}
+
+void FusedAllReduceOpHandle::GetDTypeAndNumel(
+    const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
+    proto::VarType::Type *dtype, int64_t *numel) const {
+  *numel = 0;
+  for (size_t i = 0; i < grad_tensor.size(); ++i) {
+    // Get element number
+    int64_t len = grad_tensor.at(i).second->numel();
+    PADDLE_ENFORCE_GT(len, 0);
+    *numel += len;
+
+    // Get dtype
+    auto ele_type = grad_tensor.at(i).second->type();
+    if (i == 0) {
+      *dtype = ele_type;
+    }
+    PADDLE_ENFORCE_EQ(ele_type, *dtype);
+  }
+}
+
+std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
new file mode 100644
index 0000000000..79772c61f8
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -0,0 +1,76 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FusedAllReduceOpHandle : public OpHandleBase {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce,
+                         const platform::NCCLContextMap *ctxs);
+#else
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce);
+#endif
+  std::string Name() const override;
+
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return true; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  size_t num_of_all_reduce_;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
+
+  // Check the dtype of the input
+  void GetDTypeAndNumel(
+      const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
+      proto::VarType::Type *dtype, int64_t *total_num) const;
+
+  // Get gradient's name and LoDTensor
+  void GetGradLoDTensor(const size_t &scope_idx,
+                        const std::vector<VarHandle *> &in_var_handles,
+                        const std::vector<VarHandle *> &out_var_handles,
+                        std::vector<std::pair<std::string, const LoDTensor *>>
+                            *grad_tensor) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h
index 126959bcd8..d139f84883 100644
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ b/paddle/fluid/framework/details/graph_test_base.h
@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
 
 class DummyVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
-    auto& inputs = op_desc.Input("X");
-    auto type = block->Var(inputs.front())->GetType();
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(type);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& inputs = ctx->Input("X");
+    auto type = ctx->GetType(inputs.front());
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, type);
   }
 };
 
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index c91fc81b2d..8d4717ad19 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <deque>
 #include <iterator>
+#include <memory>
 #include <stack>
 #include <string>
 #include <unordered_map>
@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                           ir::Graph* graph) const {
   VLOG(4) << "Try to inplace op " << op->Name();
+  // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
+  // ProgramDescs.
+  // The operations related to BlockDesc or ProgramDesc should perform on Graph
+  // or Node directly!
   PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
                  "op_desc is nullptr");
   // some pre-requirments need to meet if the op want to inplaced.
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index c89a33fc95..533d3269be 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
   // only these types holds bulk of gpu memory
   if (!(type == proto::VarType::LOD_TENSOR ||
-        type == proto::VarType::SELECTED_ROWS ||
         type == proto::VarType::LOD_TENSOR_ARRAY)) {
     return false;
   }
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index e7284ea644..80720af32d 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -24,6 +24,7 @@
 #include <sstream>
 #include <string>
 #include <type_traits>
+#include <unordered_set>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
@@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
           // immediately to make the subblock variable reuse strategy take
           // effect. Because it is a single op in graph. No need to
           // update the ir nodes.
+          // FIXME(liuwei1031): Graph is not aware of the existence of
+          // BlockDescs and ProgramDescs.
+          // The operations related to BlockDesc or ProgramDesc should perform
+          // on Graph or Node directly!
           sub_op_desc->Rename(var->Name(), cache->Name());
           if (sub_op_desc->Block() != nullptr &&
               sub_op_desc->Block()->HasVar(var->Name())) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 478d2ffbcf..125dbf746c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -11,18 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include <algorithm>
 #include <fstream>
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@@ -134,21 +135,26 @@ void AddOutputToLeafOps(ir::Graph *graph) {
 }
 }  // namespace
 
+void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {}
+
 void MultiDevSSAGraphBuilderBase::Init() const {
   all_vars_.clear();
 
   loss_var_name_ = Get<const std::string>(kLossVarName);
+  VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_;
   places_ = Get<const std::vector<platform::Place>>(kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
   strategy_ = Get<const BuildStrategy>(kStrategy);
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>(kNCCLCtxs);
 #endif
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 }
 
 std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   Init();
+  CheckGraph(*graph);
   std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
 
   auto nodes = graph->ReleaseNodes();
@@ -166,7 +172,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
   result.Set(kGraphOps, new GraphOps);
 
   bool is_forwarding = true;
-  bool insert_collection_ops = NeedCollectiveOps();
 
   for (ir::Node *node : sorted_ops) {
     if (DealWithSpecialOp(&result, node)) {
@@ -185,8 +190,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
         CreateComputationalOps(&result, node, places_.size());
       }
 
-      // Insert collection ops
-      if (!is_forwarding && insert_collection_ops) {
+      // Insert collective ops if nranks > 1
+      if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
         try {
           bool is_bk_op =
               static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -200,13 +205,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
               boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
                   OpProtoAndCheckerMaker::OpRoleVarAttrName()));
           PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
           for (size_t i = 0; i < backward_vars.size(); i += 2) {
             auto &p_name = backward_vars[i];
             auto &g_name = backward_vars[i + 1];
             VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
-
-            InsertCollectiveOp(&result, p_name, g_name);
+            if (NeedCollectiveForGrad(g_name, sorted_ops)) {
+              InsertCollectiveOp(&result, p_name, g_name);
+            }
           }
         } catch (boost::bad_get e) {
         }
@@ -226,6 +231,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
+
   result.Erase(kGraphOps);
   return graph;
 }
@@ -258,6 +264,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
   }
 }
 
+bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result,
+                                                    ir::Node *node) const {
+  return false;
+}
+
 std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
     const ir::Graph &graph) const {
   return ir::TopologySortOperations(graph);
@@ -271,8 +282,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const {
   return use_gpu;
 }
 
-bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const {
-  return Get<size_t>(kNRanks) > 1;
+bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad(
+    const std::string &grad_name, std::vector<ir::Node *> ops) const {
+  // if we have allreduce_op for current gradient variable in the graph,
+  // then we don't need to add allreduce_op_handle for this gradient
+  // NOTE: This is for the case that all gradients should add collective ops
+  for (auto *node : ops) {
+    if (node->Op()->Type() != "allreduce") continue;
+    for (auto in_name : node->Op()->InputArgumentNames()) {
+      if (in_name == grad_name) {
+        return false;
+      }
+    }
+  }
+  return true;
 }
 
 void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
@@ -496,20 +519,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result,
 }
 
 bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
-  return boost::get<int>(
+  return !loss_var_name_.empty() && node->Op() &&
+         boost::get<int>(
              node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
              (static_cast<int>(OpRole::kBackward) |
-              static_cast<int>(OpRole::kLoss)) &&
-         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
+              static_cast<int>(OpRole::kLoss));
 }
 
 bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
     const std::string &og) const {
   PADDLE_ENFORCE(all_vars_.count(og) != 0);
-  if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
-    return true;
-  }
-  return false;
+  return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
 }
 
 void AllReduceSSAGraphBuilder::InsertCollectiveOp(
@@ -995,7 +1015,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
 REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
                             paddle::framework::details::ReduceSSAGraphBuilder);
 REGISTER_MULTI_DEVICES_PASS(
-    allreduce_mode_multi_devices_pass,
+    all_reduce_mode_multi_devices_pass,
     paddle::framework::details::AllReduceSSAGraphBuilder);
 REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
                             paddle::framework::details::DistSSAGraphBuilder);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 6d4386538e..0ee3a06062 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -14,7 +14,10 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -31,12 +34,6 @@ namespace framework {
 class Scope;
 namespace details {
 
-constexpr char kLossVarName[] = "loss_var_name";
-constexpr char kPlaces[] = "places";
-constexpr char kLocalScopes[] = "local_scopes";
-constexpr char kStrategy[] = "strategy";
-constexpr char kNRanks[] = "nranks";
-
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
@@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   virtual void Init() const;
 
+  virtual void CheckGraph(const ir::Graph &graph) const;
+
   virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
 
   virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const = 0;
 
-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
 
   virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
 
   bool UseGPU() const;
 
-  bool NeedCollectiveOps() const;
+  bool NeedCollectiveForGrad(const std::string &grad_name,
+                             std::vector<ir::Node *> ops) const;
 
   bool IsScaleLossOp(ir::Node *node) const;
 
@@ -109,10 +109,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
   virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const;
 
-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
-    return false;
-  }
-
   virtual void InsertPostprocessOps(ir::Graph *result) const {}
 };
 
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 9afbb91005..ab5e099023 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -16,6 +16,9 @@
 
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/details/op_handle_base.h"
@@ -44,6 +47,26 @@ const char kGraphVars[] = "vars";
 typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
 
+constexpr char kNCCLCtxs[] = "nccl_ctxs";
+
+constexpr char kLossVarName[] = "loss_var_name";
+constexpr char kPlaces[] = "places";
+constexpr char kLocalScopes[] = "local_scopes";
+constexpr char kStrategy[] = "strategy";
+constexpr char kNRanks[] = "nranks";
+
+typedef std::unordered_set<std::string> FusedVars;
+constexpr char kFusedVars[] = "fused_vars";
+
+typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
+constexpr char kParamsAndGrads[] = "params_grads";
+
+typedef std::vector<std::vector<std::pair<std::string, std::string>>>
+    GroupGradsAndParams;
+constexpr char kGroupGradsAndParams[] = "group_grads_params";
+
+constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4822627ac3..158da6f606 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include <map>
+#include <unordered_set>
 
 namespace paddle {
 namespace framework {
@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {
 
 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
       PADDLE_ENFORCE(cudaSetDevice(dev_id));
       PADDLE_ENFORCE(
           cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
     }
+    if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          int dev_id =
+              boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                        "%s should have only one dev_ctx.", Name());
+      auto &place = dev_ctxes_.begin()->first;
+      int dev_id = boost::get<platform::CUDAPlace>(place).device;
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          PADDLE_ENFORCE(
+              platform::is_same_place(place, out_var_handle->place()),
+              "The place of input(%s) is not consistent with the "
+              "place of current op(%s).",
+              out_var_handle->Name(), Name());
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    }
   }
 #else
+
   PADDLE_ENFORCE(!use_cuda);
 #endif
 
@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
 void OpHandleBase::WaitInputVarGenerated() {
   for (auto in_var : inputs_) {
     if (NeedWait(in_var)) {
-      for (auto &pair : dev_ctxes_) {
-        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
+      // Dummy Variable is used to represent dependencies between operators, so
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        auto &place = in_var_handle->place();
+        if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream =
+              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
+                  ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
       }
     }
   }
 }
 
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
-  for (auto *in : inputs_) {
-    if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      // Dummy Variable is used to represent dependencies between operators, so
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        if (platform::is_gpu_place(in_var_handle->place())) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream = static_cast<platform::CUDADeviceContext *>(
+                            dev_ctxes_.at(in_var_handle->place()))
+                            ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 0901e59f97..e13ff99f3f 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <string>
 #include <tuple>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
@@ -127,9 +129,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
 template <typename T>
 struct OpInfoFiller<T, kVarTypeInference> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
+    info->infer_var_type_ = [](InferVarTypeContext* context) {
       T inference;
-      inference(fwd_op, block);
+      inference(context);
     };
   }
 };
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 2e5256fbd4..0de8e43651 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -53,6 +53,31 @@ struct ReduceLoDTensor {
   }
 };
 
+struct ReduceBufferData {
+  const std::vector<const void *> &src_data_;
+  void *dst_data_;
+  int64_t numel_;
+
+  ReduceBufferData(const std::vector<const void *> &src, void *dst,
+                   int64_t numel)
+      : src_data_(src), dst_data_(dst), numel_(numel) {}
+
+  template <typename T>
+  void apply() const {
+    T *dst_data = reinterpret_cast<T *>(dst_data_);
+    for (size_t i = 0; i < src_data_.size(); ++i) {
+      auto srd_data = reinterpret_cast<const T *>(src_data_[i]);
+      VLOG(10) << "dst: " << dst_data_ << ", " << srd_data;
+      if (srd_data == dst_data_) {
+        continue;
+      }
+
+      std::transform(srd_data, srd_data + numel_, dst_data, dst_data,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
 inline void GatherLocalSelectedRows(
     const std::vector<const SelectedRows *> &src_selecte_rows_,
     const std::vector<platform::Place> &in_places,
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 13a042d8e6..6092143449 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -12,9 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
 #include <queue>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
@@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
   return shrink_func(computation_op);
 }
 
-static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
-  VarDesc *var_desc = nullptr;
-  std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
-    var_desc = var_handle->Node()->Var();
-    return var_desc != nullptr;
-  });
-  return var_desc;
-}
-
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc
index 89bd08c2d0..94de0e6ab0 100644
--- a/paddle/fluid/framework/details/reference_count_pass_helper.cc
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc
@@ -13,9 +13,22 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
 namespace framework {
-namespace details {}  // namespace details
+namespace details {
+
+VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
+  VarDesc *var_desc = nullptr;
+  std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
+    var_desc = var_handle->Node()->Var();
+    return var_desc != nullptr;
+  });
+  return var_desc;
+}
+
+}  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h
index 1c083dbf00..ce700119c5 100644
--- a/paddle/fluid/framework/details/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.h
@@ -16,6 +16,7 @@
 
 #include <atomic>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -25,6 +26,10 @@
 
 namespace paddle {
 namespace framework {
+
+class VarDesc;
+class VarHandle;
+
 namespace details {
 
 class ComputationOpHandle;
@@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector";
 const char kAllPlaces[] = "all_places";
 
 using LastLiveOpsOfVars =
-    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
 const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
 
+VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars);
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 9ba295a2b0..c4254bbadf 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     : graph_(graph),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
+      prepare_pool_(1),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      running_ops_(0),
-      strategy_(strategy) {}
+      strategy_(strategy) {
+  PrepareOpDeps();
+  CopyOpDeps();
+}
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unique_ptr<platform::RecordEvent> event(
       new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
-  std::unordered_map<OpHandleBase *, size_t> pending_ops;
-  std::unordered_set<VarHandleBase *> pending_vars;
-  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
-  std::unordered_set<OpHandleBase *> ready_ops;
+  std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
+  CopyOpDeps();
+  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
+  std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
+      new BlockingQueue<VarHandleBase *>);
+  auto &pending_ops = op_deps->pending_ops_;
+  auto &pending_vars = op_deps->pending_vars_;
+  auto &ready_ops = op_deps->ready_ops_;
+
   // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
   // streams from multiple GPUs, it's faster to buffer them and schedule
   // together since we currently cannot overlap computation and memcpy streams.
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
 
-  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
-      }
-    }
-  }
-  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var);
-  }
-
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op);
-    } else {
-      InsertPendingOp(&pending_ops, op);
-    }
-  }
-
   // Step 2. Insert FetchOps
   std::vector<FetchOpHandle *> fetch_ops;
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FeedFetchList fetch_data(fetch_tensors.size());
 
-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, ready_vars.get(), &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
+                 &pending_ops, &pending_vars, &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
-      running_ops_++;
       RunOp(ready_vars, op);
     }
     set.clear();
   };
-
+  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
     // Keep loop until all vars are ready.
-    //
-    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
-    // ready_ops have been performed.
-    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
-      run_all_ops(delayed_ops);
-    } else {
-      run_all_ops(ready_ops);
-    }
+    run_all_ops(ready_ops);
 
     // 2. Find ready variable
     bool timeout;
     auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
-
     if (timeout) {
       if (exception_holder_.IsCaught()) {
         for (auto &run_op_future : run_op_futures_) {
@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         continue;
       }
     }
+
     // 3. Remove the dependency of ready_var.
     // Find the ready_ops after the ready_var.
     for (auto ready_var : cur_ready_vars) {
@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
-            delayed_ops.insert(op);
-          } else {
-            ready_ops.insert(op);
-          }
+          run_all_op(op);
         }
       }
     }
@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     const std::vector<std::string> &fetch_tensors,
     std::vector<FetchOpHandle *> *fetch_ops,
     std::unordered_set<VarHandleBase *> *fetch_dependencies,
+    std::unordered_set<OpHandleBase *> *ready_ops,
     std::unordered_map<OpHandleBase *, size_t> *pending_ops,
     std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+    FeedFetchList *fetch_data) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-
+  std::unordered_set<VarHandleBase *> local_ready_vars;
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
+        fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
       }
     }
   }
@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
 
     auto &vars = fetched_var_it->second;
 
@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     auto *fetch_dummy = new DummyVarHandle(fetch_var);
     op->AddOutput(fetch_dummy);
     fetch_dependencies->emplace(fetch_dummy);
-    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
-    this->InsertPendingOp(pending_ops, op);
+
+    this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
+
+    size_t wait_input_num = 0;
+    std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
+    for (auto *var : input_set) {
+      if (pending_vars->count(var)) {
+        ++wait_input_num;
+      }
+    }
+    if (wait_input_num) {
+      pending_ops->insert({op, wait_input_num});
+    } else {
+      ready_ops->insert(static_cast<OpHandleBase *>(op));
+    }
   }
+  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
 }
 
 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp(
 
 void ThreadedSSAGraphExecutor::InsertPendingVar(
     std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+    std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
   pending_vars->insert(var);
   if (var->GeneratedOp() == nullptr) {
-    ready_vars->Push(var);
+    ready_vars->insert(var);
+  }
+}
+
+void ThreadedSSAGraphExecutor::PrepareOpDeps() {
+  op_deps_.reset(new OpDependentData());
+  std::unordered_map<OpHandleBase *, size_t> &pending_ops =
+      op_deps_->pending_ops_;
+  std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
+  std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
+  std::unordered_set<VarHandleBase *> ready_vars;
+
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair);
+      }
+    }
+  }
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
+    InsertPendingVar(&pending_vars, &ready_vars, var);
+  }
+
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op);
+    } else {
+      InsertPendingOp(&pending_ops, op);
+    }
   }
+  for (auto ready_var : ready_vars) {
+    pending_vars.erase(ready_var);
+    for (auto *op : ready_var->PendingOps()) {
+      auto &deps = pending_ops[op];
+      --deps;
+      if (deps == 0) {
+        ready_ops.insert(op);
+      }
+    }
+  }
+}
+
+void ThreadedSSAGraphExecutor::CopyOpDeps() {
+  op_deps_futures_ = prepare_pool_.enqueue([&] {
+    auto *op_deps = new OpDependentData();
+    op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
+                                 op_deps_->pending_ops_.end());
+    op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
+                                  op_deps_->pending_vars_.end());
+    op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
+                               op_deps_->ready_ops_.end());
+    return std::unique_ptr<OpDependentData>(op_deps);
+  });
 }
 
 void ThreadedSSAGraphExecutor::RunOp(
@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp(
         op->Run(strategy_.use_cuda_);
       }
       VLOG(10) << op << " " << op->Name() << " Done ";
-      running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << " Signal posted";
     } catch (...) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 0867f62104..b9bccba8fa 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,18 +15,20 @@
 #pragma once
 
 #include <deque>
+#include <functional>
 #include <list>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
-#include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -36,6 +38,12 @@ class Scope;
 
 namespace details {
 
+struct OpDependentData {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops_;
+  std::unordered_set<VarHandleBase *> pending_vars_;
+  std::unordered_set<OpHandleBase *> ready_ops_;
+};
+
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   ir::Graph *graph_;
   std::unique_ptr<::ThreadPool> pool_;
+  ::ThreadPool prepare_pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
-  std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                        OpHandleBase *op_instance) const;
 
   void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
-                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        std::unordered_set<VarHandleBase *> *ready_vars,
                         VarHandleBase *var) const;
 
   void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
                       std::vector<FetchOpHandle *> *fetch_ops,
                       std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_set<OpHandleBase *> *ready_ops,
                       std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                       std::unordered_set<VarHandleBase *> *pending_vars,
-                      BlockingQueue<VarHandleBase *> *ready_vars,
                       FeedFetchList *fetch_data);
 
+  void PrepareOpDeps();
+  void CopyOpDeps();
+
  private:
+  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
+
   ExecutionStrategy strategy_;
+  std::unique_ptr<OpDependentData> op_deps_;
   // use std::list because clear(), push_back, and for_each are O(1)
   std::list<std::future<void>> run_op_futures_;
 };
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 8321c32f8b..93060ef259 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -43,6 +43,7 @@ struct VarHandleBase {
   virtual ~VarHandleBase();
 
   virtual std::string DebugString() const = 0;
+  virtual const std::string& Name() const = 0;
 
   void AddInput(OpHandleBase* in, ir::Node* node) {
     node_->inputs.clear();
@@ -95,8 +96,6 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
-  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
-
   virtual ~VarHandle();
 
   std::string DebugString() const override;
@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase {
         name_(std::move(name)),
         place_(std::move(place)) {}
 
+#ifdef PADDLE_WITH_CUDA
+  bool HasEvent() { return has_event_; }
+
+  const cudaEvent_t& GetEvent() {
+    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    return event_;
+  }
+
+  void SetGenerateEvent(const cudaEvent_t& event) {
+    has_event_ = true;
+    event_ = event;
+  }
+#endif
+
   // version field currently is not used, however, just store the version to
   // debug easily.
  private:
@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase {
   size_t scope_idx_;
   std::string name_;
   platform::Place place_;
+#ifdef PADDLE_WITH_CUDA
+  // Only when this event is triggered, var is generated.
+  cudaEvent_t event_;
+  bool has_event_{false};
+#endif
 
  public:
   bool IsTheSameVar(const VarHandle& o) const {
@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase {
 
   size_t version() const { return version_; }
   size_t scope_idx() const { return scope_idx_; }
+  const std::string& Name() const override { return name_; }
   const std::string& name() const { return name_; }
   const platform::Place& place() const { return place_; }
 };
@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase {
   virtual ~DummyVarHandle();
 
   std::string DebugString() const override;
+
+ public:
+  const std::string& Name() const override { return name_; }
+  std::string name_{"DummyVar"};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc
new file mode 100644
index 0000000000..fd6b6dd227
--- /dev/null
+++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class WhileOpEagerDeletionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+
+    // Find all while_op and while_grad_op
+    std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
+                                         std::vector<OperatorBase *>>>
+        target_ops;
+    for (auto *op : all_ops) {
+      auto compute_op = dynamic_cast<ComputationOpHandle *>(op);
+      if (compute_op == nullptr) continue;
+
+      if (compute_op->Name() == "while") {
+        target_ops[compute_op->GetScopeIdx()].first.emplace_back(
+            compute_op->GetOp());
+      } else if (compute_op->Name() == "while_grad") {
+        target_ops[compute_op->GetScopeIdx()].second.emplace_back(
+            compute_op->GetOp());
+      }
+    }
+
+    for (auto &ops_pair : target_ops) {
+      auto &while_ops = ops_pair.second.first;
+      auto &while_grad_ops = ops_pair.second.second;
+      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+          while_ops, while_grad_ops);
+    }
+    return graph;
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(while_op_eager_deletion_pass,
+              paddle::framework::details::WhileOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c31d0beec3..99192292b0 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
 #include <deque>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -23,17 +27,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef PADDLE_WITH_NGRAPH
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 #endif
 
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
-DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 
 namespace paddle {
 namespace framework {
@@ -75,11 +80,11 @@ static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
 
 ExecutorPrepareContext::ExecutorPrepareContext(
     const framework::ProgramDesc& prog, size_t block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars)
-    : prog_(prog), block_id_(block_id) {
-  if (GetEagerDeletionThreshold() >= 0) {
-    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
-                                                        skip_ref_cnt_vars);
+    const std::vector<std::string>& keep_vars, bool force_disable_gc)
+    : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
+  if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
+    global_ref_cnts_ =
+        GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
   }
 }
 
@@ -184,13 +189,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 }
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope, bool create_vars) {
+                   bool create_local_scope, bool create_vars,
+                   const std::vector<std::string>& skip_ref_cnt_vars,
+                   bool force_disable_gc) {
   platform::RecordBlock b(block_id);
   if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
-#endif
-  auto ctx = Prepare(pdesc, block_id);
+  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
 
@@ -357,20 +361,27 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars) {
-  std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
+    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
+  std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
+      program, block_id, skip_ref_cnt_vars, force_disable_gc));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) {
+    paddle::operators::NgraphEngine::FuseNgraphOps(
+        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
+  }
+#endif
   return ctx;
 }
 
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
     const ProgramDesc& program, const std::vector<int>& block_ids,
-    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
+    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars,
+    bool force_disable_gc) {
   PADDLE_ENFORCE(
       skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
       "skip_ref_cnt_vars should be either empty or equals to block number %d",
@@ -380,9 +391,11 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
   for (auto& bid : block_ids) {
     ExecutorPrepareContext* ctx;
     if (skip_ref_cnt_vars.empty()) {
-      ctx = new ExecutorPrepareContext(program, bid);
+      ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
+                                       force_disable_gc);
     } else {
-      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
+      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
+                                       force_disable_gc);
     }
     PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
@@ -409,8 +422,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
-  // skip while_op and while_grad_op temporarily
-  if (max_memory_size >= 0 && !keep_kids) {
+  // FIXME(zjl): recurrent_op is rather complex, we would
+  // disable gc forcely in recurrent_op
+  if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
@@ -428,6 +442,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
+    // If gc is enabled and block size > 1
+    if (gc && ctx->prog_.Size() > 1) {
+      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
+                                                                 ctx->ops_);
+    }
   }
 
   for (auto& op : ctx->ops_) {
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 5a040ac641..65cb9e51ab 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <map>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
@@ -30,7 +32,8 @@ namespace framework {
 struct ExecutorPrepareContext {
   ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
                          const std::vector<std::string>& skip_ref_cnt_vars =
-                             std::vector<std::string>());
+                             std::vector<std::string>(),
+                         bool force_disable_gc = false);
 
   ~ExecutorPrepareContext();
 
@@ -38,6 +41,7 @@ struct ExecutorPrepareContext {
 
   const framework::ProgramDesc& prog_;
   size_t block_id_;
+  bool force_disable_gc_;
   std::vector<std::unique_ptr<OperatorBase>> ops_;
 
   std::unordered_map<std::string, size_t> global_ref_cnts_;
@@ -66,7 +70,10 @@ class Executor {
    *  Scope
    */
   void Run(const ProgramDesc& prog, Scope* scope, int block_id,
-           bool create_local_scope = true, bool create_vars = true);
+           bool create_local_scope = true, bool create_vars = true,
+           const std::vector<std::string>& skip_ref_cnt_vars =
+               std::vector<std::string>(),
+           bool force_disable_gc = false);
 
   // This API is very slow.
   void Run(const ProgramDesc& program, Scope* scope,
@@ -79,12 +86,14 @@ class Executor {
   static std::unique_ptr<ExecutorPrepareContext> Prepare(
       const ProgramDesc& program, int block_id,
       const std::vector<std::string>& skip_ref_cnt_vars =
-          std::vector<std::string>());
+          std::vector<std::string>(),
+      bool force_disable_gc = false);
 
   static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
       const ProgramDesc& program, const std::vector<int>& block_ids,
       const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
-          std::vector<std::vector<std::string>>());
+          std::vector<std::vector<std::string>>(),
+      bool force_disable_gc = false);
 
   void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
 
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index 9bccb1a32b..f2f4c53eea 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
@@ -55,11 +57,11 @@ class GradOpDescMakerBase {
                    std::back_inserter(ret_val),
                    [this](const std::string& fwd_var_name) -> std::string {
                      auto g_name = GradVarName(fwd_var_name);
-                     if (no_grad_set_.count(g_name)) {
-                       return kEmptyVarName;
-                     } else {
+                     if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
                        (*this->grad_to_var_)[g_name] = fwd_var_name;
                        return g_name;
+                     } else {
+                       return kEmptyVarName;
                      }
                    });
     if (!drop_empty_grad) {
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ca6b0229e9..a79a53867d 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,6 +46,9 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
+pass_library(cpu_quantize_placement_pass base)
+pass_library(cpu_quantize_pass inference)
+pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -66,6 +69,8 @@ pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
+pass_library(sync_batch_norm_pass base)
+pass_library(runtime_context_cache_pass base)
 
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -100,6 +105,12 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
+cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
+cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
+if(NOT WIN32)
+    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+endif()
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
new file mode 100644
index 0000000000..ed80f9cae3
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -0,0 +1,239 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace {
+
+void UnlinkNodes(ir::Node* a, ir::Node* b) {
+  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
+                   a->outputs.end());
+  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
+                  b->inputs.end());
+}
+
+}  // namespace
+
+enum { U8_MAX = 255, S8_MAX = 127 };
+
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
+using string::PrettyLogDetail;
+
+void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
+                                    std::string input_name, double scale_to_one,
+                                    bool is_unsigned,
+                                    std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  float scale = scale_to_one * max;
+
+  // Create quantize output variable
+  VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+  auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+
+  // create a quantize op node
+  OpDesc q_desc;
+  q_desc.SetType("quantize");
+  q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
+  q_desc.SetOutput("Output",
+                   std::vector<std::string>({quantize_out_node->Name()}));
+  q_desc.SetAttr("Scale", scale);
+  q_desc.SetAttr("is_negative_input", !is_unsigned);
+  auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
+
+  // update op's input
+  op->Op()->SetInput(input_name,
+                     std::vector<std::string>({quantize_out_node->Name()}));
+
+  // link quantize op
+  UnlinkNodes(input, op);
+  IR_NODE_LINK_TO(input, quantize_op);
+  IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+  IR_NODE_LINK_TO(quantize_out_node, op);
+
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
+                                       std::string output_name,
+                                       double scale_to_one, bool is_unsigned,
+                                       std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  float scale = scale_to_one * max;
+
+  // Create dequantize input variable
+  VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+  auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+
+  // create a dequantize op node for output.
+  OpDesc deq_desc;
+  deq_desc.SetType("dequantize");
+  deq_desc.SetInput("Input",
+                    std::vector<std::string>({dequantize_in_node->Name()}));
+  deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
+  deq_desc.SetAttr("Scale", scale);
+  auto dequantize_op = g->CreateOpNode(&deq_desc);  // OpDesc will be copied.
+
+  // update op's output
+  op->Op()->SetOutput(output_name,
+                      std::vector<std::string>({dequantize_in_node->Name()}));
+
+  // link dequantize op
+  UnlinkNodes(op, output);
+  IR_NODE_LINK_TO(op, dequantize_in_node);
+  IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+  IR_NODE_LINK_TO(dequantize_op, output);
+
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::QuantizeConv(Graph* graph,
+                                   bool with_residual_data) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::ConvResidual conv_pattern{pattern, name_scope_};
+  conv_pattern(with_residual_data);
+
+  int quantize_conv_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize conv2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    auto* conv_op_desc = conv_op->Op();
+
+    // skip if should not be quantized
+    if (!conv_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[conv_input->Name()].first;
+    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
+                  is_input_unsigned, "Scale_in");
+
+    auto filter_scale_tensor = scales[conv_filter->Name()].second;
+    EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
+                                     filter_scale_tensor.numel(), 1};
+    eigen_tensor *= static_cast<double>(S8_MAX);
+    std::vector<float> filter_scale{
+        filter_scale_tensor.data<double>(),
+        filter_scale_tensor.data<double>() + filter_scale_tensor.numel()};
+
+    conv_op->Op()->SetAttr("Scale_weights", filter_scale);
+
+    if (with_residual_data) {
+      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
+                                conv_pattern);
+      auto residual_scale =
+          scales[conv_residual_data->Name()].second.data<double>()[0];
+      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
+
+      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
+                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
+    }
+
+    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[conv_output->Name()].first;
+    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
+                     is_output_unsigned, "Scale_out");
+
+    ++quantize_conv_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_conv_count);
+
+  std::stringstream msg_ss;
+  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
+  if (with_residual_data) msg_ss << " with residual connection";
+  PrettyLogDetail(msg_ss.str().c_str());
+}
+
+void CPUQuantizePass::QuantizePool(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Pool pool_pattern{pattern, name_scope_};
+  pool_pattern();
+
+  int quantize_pool_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize pool2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
+    auto* pool_op_desc = pool_op->Op();
+
+    // skip if should not be quantized
+    if (!pool_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[pool_input->Name()].first;
+    QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
+
+    auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[pool_output->Name()].first;
+    DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
+                     is_output_unsigned);
+
+    ++quantize_pool_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_pool_count);
+
+  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
+}
+
+std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Quantizing the graph.";
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+
+  PADDLE_ENFORCE(param_scope());
+
+  QuantizeConv(graph.get(), false /* with_residual_data */);
+  QuantizeConv(graph.get(), true /* with_residual_data */);
+  QuantizePool(graph.get());
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
+    .RequirePassAttr("quant_var_scales");
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.h b/paddle/fluid/framework/ir/cpu_quantize_pass.h
new file mode 100644
index 0000000000..9873bb04e1
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
+ * bool denotes whether quantization of the variable should be done to unsigned
+ * type.
+ */
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
+
+/*
+ * Quantize all supported operators.
+ */
+class CPUQuantizePass : public FusePassBase {
+ public:
+  virtual ~CPUQuantizePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
+
+  void QuantizePool(Graph* graph) const;
+
+  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
+                     double scale_to_one, bool is_unsigned,
+                     std::string scale_attr_name = "") const;
+
+  void DequantizeOutput(Graph* g, Node* op, Node* output,
+                        std::string output_name, double scale_to_one,
+                        bool is_unsigned,
+                        std::string scale_attr_name = "") const;
+
+  const std::string name_scope_{"quantize"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
new file mode 100644
index 0000000000..89601be7d1
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           bool use_quantizer = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  if (type == "conv2d") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2)
+      op->SetInput("Bias", {inputs[2]});
+    else
+      op->SetInput("Bias", {});
+    if (inputs.size() > 3) {
+      op->SetInput("ResidualData", {inputs[3]});
+      op->SetAttr("fuse_residual_connection", true);
+    } else {
+      op->SetInput("ResidualData", {});
+      op->SetAttr("fuse_residual_connection", false);
+    }
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("Scale_in", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
+    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+  } else if (type == "dropout") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+  } else if (type == "fc") {
+    op->SetInput("Input", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
+    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
+    op->SetOutput("Out", {outputs[0]});
+  }
+}
+
+static const std::initializer_list<std::string> variable_names{
+    "a", "w1", "c",  "d", "w2", "e",  "f", "g",
+    "h", "w3", "b1", "i", "j",  "w4", "b2"};
+// (a,w1)->Conv1->c and c->Pool1->d
+//
+// (d,w2)->Conv2->e and e->Pool2->f
+//
+// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
+//
+// (d,w4, b2)->Conv4->i
+ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v.find("w") == 0 || v.find("b") == 0) {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
+        use_quantizer);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
+
+  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
+        use_quantizer);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
+
+  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
+  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
+  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
+        use_quantizer);
+
+  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
+        use_quantizer);
+
+  return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto x = scope->Var(var_name);
+  auto tensor = x->GetMutable<LoDTensor>();
+  tensor->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int quant_count, int dequant_count, int added_nodes_count,
+              float scale) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  // Init scope, as it is used in pass
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  exe.CreateVariables(prog, 0, true, &scope);
+
+  auto* scales = new VarQuantScale();
+
+  for (auto& v : variable_names) {
+    InitTensorHolder(&scope, place, v.c_str());
+    LoDTensor tensor;
+    tensor.Resize({1});
+    auto* ptr = tensor.mutable_data<double>(place);
+    ptr[0] = 2.0;
+
+    (*scales)[v] = std::make_pair(false, std::move(tensor));
+  }
+
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
+  pass->Set("quant_var_scales", scales);
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int conv2d_nodes_count = 0;
+  int pool2d_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        conv2d_nodes_count++;
+        auto op_name = boost::get<std::string>(op->GetAttr("name"));
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
+            << "Scale_in for node '" + op_name + "'.";
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
+            << "Scale_out for node '" + op_name + "'.";
+        EXPECT_EQ(
+            boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
+            scale)
+            << "Scale_weights for node '" + op_name + "'.";
+      } else if (op->Type() == "pool2d") {
+        pool2d_nodes_count++;
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(conv2d_nodes_count, conv_count);
+  EXPECT_EQ(pool2d_nodes_count, pool_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, quantize) {
+  bool use_mkldnn = true;
+  bool use_quantizer = true;
+  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
+  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
+  //
+  // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
+  // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
+  //
+  // d->Dropout1->g and g->Fc1->h and
+  // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
+  //
+  // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
+  // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
+  int added_nodes = 7 + 7 + 6 + 6;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
+           2.0f * 127);
+}
+
+TEST(CpuQuantizePass, do_not_quantize) {
+  bool use_mkldnn = true;
+  bool use_quantizer = false;
+  int added_nodes = 0;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
+           1.0f);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_pass);
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
new file mode 100644
index 0000000000..50bbe4915b
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include <string>
+#include <unordered_set>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Marks operators which are to be quantized.";
+  const auto& excluded_ids_list =
+      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
+                    n->id()) != excluded_ids_list.end())
+        continue;
+      auto* op = n->Op();
+      if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_quantizer", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_quantizer", true);
+        }
+      }
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_placement_pass,
+              paddle::framework::ir::CPUQuantizePlacementPass)
+    // a vector of operator type names to be quantized ("conv2d" etc.)
+    .RequirePassAttr("quantize_enabled_op_types")
+    // a vector of operator ids that are to be excluded from quantization
+    .RequirePassAttr("quantize_excluded_op_ids");
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
new file mode 100644
index 0000000000..ef3861b249
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Specifies which operators should be quantized.
+ */
+class CPUQuantizePlacementPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
new file mode 100644
index 0000000000..5a4d622645
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+
+#include <gtest/gtest.h>
+#include <boost/logic/tribool.hpp>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           boost::tribool use_quantizer) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+
+  if (!boost::indeterminate(use_quantizer))
+    op->SetAttr("use_quantizer", use_quantizer);
+
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+  op->SetOutput("Out", {outputs[0]});
+}
+
+// operator                      use_quantizer
+// ---------------------------------------
+// (a,b)->concat->c              none
+// (c,weights,bias)->conv->f     false
+// f->relu->g                    none
+// g->pool->h                    false
+// (h,weights2,bias2)->conv->k   false
+// k->pool->l                    false
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
+  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
+  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
+
+  return prog;
+}
+
+void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
+              std::initializer_list<int> quantize_excluded_op_ids,
+              unsigned expected_use_quantizer_true_count) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
+  pass->Set("quantize_enabled_op_types",
+            new std::unordered_set<std::string>(quantize_enabled_op_types));
+  pass->Set("quantize_excluded_op_ids",
+            new std::unordered_set<int>(quantize_excluded_op_ids));
+
+  graph = pass->Apply(std::move(graph));
+
+  unsigned use_quantizer_true_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->HasAttr("use_quantizer") &&
+          boost::get<bool>(op->GetAttr("use_quantizer"))) {
+        ++use_quantizer_true_count;
+      }
+    }
+  }
+
+  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+}
+
+TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
+
+TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
+  MainTest({"conv2d"}, {4}, 1);
+}
+
+TEST(QuantizerPlacementPass, excluded_none) {
+  // 2 conv + 2 pool
+  MainTest({}, {}, 4);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_placement_pass);
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
new file mode 100644
index 0000000000..de62a69de4
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file eint8_outcept in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either eint8_outpress or
+// implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void CPUQuantizeSquashPass::FindNodesToKeep(
+    Graph* graph,
+    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
+  GraphPatternDetector gpd;
+  patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"};
+  deq_any_pattern();
+
+  int found_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);
+
+    if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
+      (*nodes_keep_counter)[dequant_out] = 1;
+    else
+      (*nodes_keep_counter)[dequant_out] += 1;
+
+    found_count++;
+  };
+  gpd(graph, handler);
+  AddStatis(found_count);
+}
+
+void CPUQuantizeSquashPass::Squash(
+    Graph* graph,
+    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
+  GraphPatternDetector gpd;
+  patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
+  squash_pattern();
+
+  int found_squash_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "squash requantize-quantize ops pair";
+
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);
+
+    auto* next_op_desc = next_op->Op();
+    float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
+    float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
+    PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
+                   nodes_keep_counter->end());
+
+    // check if dequantize op should be kept or removed, decrease the counter
+    bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
+
+    if (dequant_scale == quant_scale) {
+      // squash dequantize-quantize to nothing
+      auto quant_out_var_name = quant_out->Name();
+      auto next_op_inputs = next_op_desc->InputNames();
+      for (const auto& name : next_op_inputs) {
+        auto var_name = next_op_desc->Input(name)[0];
+        if (var_name.compare(quant_out_var_name) == 0) {
+          next_op_desc->SetInput(
+              name, std::vector<std::string>({dequant_in->Name()}));
+          break;
+        }
+      }
+
+      if (keep_dequant)
+        GraphSafeRemoveNodes(graph, {quant_op, quant_out});
+      else
+        GraphSafeRemoveNodes(graph,
+                             {dequant_op, quant_op, dequant_out, quant_out});
+
+      IR_NODE_LINK_TO(dequant_in, next_op);
+
+      found_squash_count++;
+    } else {
+      // squash dequantize-quantize to requantize op
+      OpDesc desc;
+      desc.SetType("requantize");
+      desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
+      desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
+      desc.SetAttr("Scale_in", dequant_scale);
+      desc.SetAttr("Scale_out", quant_scale);
+
+      auto requant_op = g->CreateOpNode(&desc);
+
+      if (keep_dequant)
+        GraphSafeRemoveNodes(graph, {quant_op});
+      else
+        GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});
+
+      IR_NODE_LINK_TO(dequant_in, requant_op);
+      IR_NODE_LINK_TO(requant_op, quant_out);
+
+      found_squash_count++;
+    }
+  };
+  gpd(graph, handler);
+  AddStatis(found_squash_count);
+  PrettyLogDetail("---    squashed %d dequantize-quantize pairs",
+                  found_squash_count);
+}
+
+std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("cpu_quantize_squash_pass", graph.get());
+
+  std::unordered_map<const Node*, int> nodes_keep_counter;
+  FindNodesToKeep(graph.get(), &nodes_keep_counter);
+  Squash(graph.get(), &nodes_keep_counter);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_squash_pass,
+              paddle::framework::ir::CPUQuantizeSquashPass);
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
new file mode 100644
index 0000000000..b823a2cef3
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Squash dequantize->quantize pair pattern into requantize op
+ */
+class CPUQuantizeSquashPass : public FusePassBase {
+ public:
+  virtual ~CPUQuantizeSquashPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+  /*
+   * For each dequantize's output find the number of operators it is an input to
+   */
+  void FindNodesToKeep(
+      Graph* graph,
+      std::unordered_map<const Node*, int>* nodes_keep_counter) const;
+
+  /*
+   * Squash dequantize-quantize ops pairs into requantize or nothing
+   */
+  void Squash(Graph* graph,
+              std::unordered_map<const Node*, int>* nodes_keep_counter) const;
+
+  const std::string name_scope_{"squash"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
new file mode 100644
index 0000000000..3a3eb53f79
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           float scale = 0) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  if (type == "conv2d") {
+    op->SetInput("Input", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
+    op->SetOutput("Output", {outputs[0]});
+  } else if (type == "quantize") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("Scale", scale);
+  } else if (type == "dequantize") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("Scale", scale);
+  }
+}
+
+// (a,w1,b1)->Conv1->d
+// d->Dequant->e
+// e->Quant->f
+// (f,w2,b2)->Conv2->i
+ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
+  ProgramDesc prog;
+  for (auto& v : std::initializer_list<std::string>(
+           {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v.find("w") == 0 || v.find("b") == 0) {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
+  SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
+  SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
+  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);
+  return prog;
+}
+
+static const std::initializer_list<std::string> variable_names{
+    "a", "b", "c", "d", "e", "f", "g", "h"};
+// a->Conv1->b
+// b->Dequant->c
+//
+// c->Quant1->d and d->Conv2->e
+//
+// c->Conv3->f
+//
+// c->Quant2->g and g->Conv4->h
+//
+ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
+                              float scale3) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+
+  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
+  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
+
+  SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
+  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);
+
+  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);
+
+  SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
+  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);
+
+  return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto x = scope->Var(var_name);
+  auto tensor = x->GetMutable<LoDTensor>();
+  tensor->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  // Init scope, as it is used in pass
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  exe.CreateVariables(prog, 0, true, &scope);
+
+  for (auto& v : variable_names) {
+    InitTensorHolder(&scope, place, v.c_str());
+  }
+
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
+}
+
+TEST(CpuQuantizeSquashPass, equal_scales) {
+  auto scale = 1.2345f;
+  auto use_mkldnn = true;
+  // Remove 4 nodes: Dequant, Quant, e, f
+  auto remove_nodes = 4;
+  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
+
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
+}
+
+TEST(CpuQuantizeSquashPass, inequal_scales) {
+  auto scale1 = 1.2345f;
+  auto scale2 = 21.0f;
+  auto use_mkldnn = true;
+  // Remove 3 nodes: Dequant, Quant, e
+  // Insert 1 node: requantize
+  auto remove_nodes = 2;
+  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
+
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
+}
+
+TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
+  // Delete both quantize ops,
+  // bypass dequantize in both branches,
+  // insert requantize on one branch
+  auto scale = 1.2345f;
+  auto scale2 = 21.0f;
+  auto use_mkldnn = true;
+  // Remove 3 nodes: Quant1, Quant2, g
+  // Insert 1 node: requantize
+  auto remove_nodes = 2;
+  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
+
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_squash_pass);
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 5e954fa9c4..6a9340b870 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
-#include <unordered_set>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -152,6 +152,39 @@ void Graph::ResolveHazard(
   }
 }
 
+std::shared_ptr<Graph> Graph::Clone() {
+  auto cloned_graph = std::make_shared<Graph>(this->program_);
+  cloned_graph->ReleaseNodes();
+  cloned_graph->num_node_created_ = 0;
+  std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
+  for (auto *n : this->node_set_) {
+    ir::Node *cloned_node = nullptr;
+    if (n->IsCtrlVar()) {
+      cloned_node = cloned_graph->CreateControlDepVar();
+    } else if (!n->var_desc_ && !n->op_desc_) {  // empty node
+      cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
+    } else if (n->IsVar()) {
+      cloned_node = cloned_graph->CreateVarNode(n->Var());
+    } else if (n->IsOp()) {
+      cloned_node = cloned_graph->CreateOpNode(n->Op());
+    }
+    if (cloned_node) {
+      origin_to_cloned[n] = cloned_node;
+    } else {
+      PADDLE_THROW("The cloned node's type is not supported!");
+    }
+  }
+  for (auto *n : this->node_set_) {
+    for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
+      origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
+    }
+    for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
+      origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
+    }
+  }
+  return cloned_graph;
+}
+
 bool IsControlDepVar(const ir::Node &var) {
   return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
 }
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index cfd974e4bd..fff015d4a6 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/node.h"
@@ -199,7 +200,12 @@ class Graph {
   // WARN: After a series of passes, the current graph can be quite
   // different from OriginProgram. Caller shouldn't assume much from
   // the returned OriginProgram.
-  const ProgramDesc &OriginProgram() const { return program_; }
+  const ProgramDesc &OriginProgram() const {
+    LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
+                    "quite different from OriginProgram. So, please avoid "
+                    "using the `OriginProgram()` method!";
+    return program_;
+  }
 
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
@@ -212,6 +218,10 @@ class Graph {
   void ResolveHazard(
       const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
 
+  // Create a new and duplicated graph.
+  // WARN: The method only clones the graph structure, not its attributes.
+  std::shared_ptr<Graph> Clone();
+
  private:
   std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
       const ProgramDesc &program);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index c0c34d186b..d0d72127f0 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph,
   ValidateByNodeRole(&subgraphs);
 
   if (subgraphs.empty()) return;
-  PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
+  PrettyLogEndl(Style::detail(), "---  detected %d subgraphs",
+                subgraphs.size());
   int id = 0;
   for (auto &g : subgraphs) {
     VLOG(3) << "optimizing #" << id++ << " subgraph";
@@ -598,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) {
 bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
   PADDLE_ENFORCE(var->IsVar());
   PADDLE_ENFORCE(op->IsOp());
-  if (op->Op()->Input(argument).size() <= nth) return false;
+  if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth)
+    return false;
   return var->Name() == op->Op()->Input(argument)[nth];
 }
 
+bool HasInput(Node *op, const std::string &argument) {
+  PADDLE_ENFORCE(op->IsOp());
+  auto const &names = op->Op()->InputNames();
+  if (std::find(names.begin(), names.end(), argument) == names.end())
+    return false;
+  return true;
+}
+
 bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
   PADDLE_ENFORCE(var->IsVar());
   PADDLE_ENFORCE(op->IsOp());
@@ -1074,9 +1084,60 @@ PDNode *patterns::Conv::operator()() {
                         ->AsOutput()
                         ->assert_is_op_output("conv2d", "Output");
 
-  conv_op->LinksFrom({input_var, filter_var});
-  conv_op->LinksTo({output_var});
+  conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
+  return output_var;
+}
+
+PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+
+  if (!with_residual_data) {
+    conv_op->assert_more([&](Node *x) {
+      auto node_names = x->Op()->InputNames();
+      if (!HasInput(x, "ResidualData") ||
+          x->Op()->Input("ResidualData").size() == 0)
+        return true;
+      return false;
+    });
+  }
+
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+
+  std::vector<PDNode *> links_from{input_var, filter_var};
 
+  if (with_residual_data) {
+    auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
+                            ->AsInput()
+                            ->assert_is_op_input("conv2d", "ResidualData");
+    links_from.push_back(res_conn_var);
+  }
+
+  conv_op->LinksFrom(links_from).LinksTo({output_var});
+  return output_var;
+}
+
+PDNode *patterns::Pool::operator()() {
+  auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
+
+  auto input_var = pattern->NewNode(pool_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("pool2d", "X");
+
+  auto output_var = pattern->NewNode(pool_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("pool2d", "Out");
+
+  pool_op->LinksFrom({input_var}).LinksTo({output_var});
   return output_var;
 }
 
@@ -1301,6 +1362,51 @@ PDNode *patterns::ConvAffineChannel::operator()(
   return ac_out_var;
 }
 
+PDNode *patterns::DequantQuantAny::operator()() {
+  auto *dequant_in = pattern->NewNode(dequant_in_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("dequantize", "Input");
+
+  auto *dequant_op =
+      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+
+  auto *dequant_out = pattern->NewNode(dequant_out_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("dequantize", "Output");
+
+  auto *quant_op = pattern->NewNode(quant_op_repr())
+                       ->assert_is_op("quantize")
+                       ->AsIntermediate();
+
+  auto *quant_out = pattern->NewNode(quant_out_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("quantize");
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+
+  dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
+  quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
+  next_op->LinksFrom({quant_out});
+
+  return quant_out;
+}
+
+PDNode *patterns::DequantAny::operator()() {
+  auto *dequant_op =
+      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+
+  auto *dequant_out = pattern->NewNode(dequant_out_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("dequantize", "Output");
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+
+  dequant_op->LinksTo({dequant_out});
+  next_op->LinksFrom({dequant_out});
+
+  return dequant_out;
+}
+
 // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
 // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
 // ...
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index c8be586f54..bac23b6513 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -18,8 +18,11 @@
 #include <gtest/gtest_prod.h>
 #endif
 
+#include <memory>
 #include <numeric>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
@@ -302,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type);
 // Check whether a var node is a op node's nth input.
 bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
 
+// Check whether the op node has input of given name.
+bool HasInput(Node* op, const std::string& argument);
+
 // Tell whether a var node is a op node's nth output.
 bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
 
@@ -656,6 +662,35 @@ struct Conv : public PatternBase {
   PATTERN_DECL_NODE(conv_output);
 };
 
+// Convolution op with residual data
+struct ConvResidual : public PatternBase {
+  ConvResidual(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_residual") {}
+
+  PDNode* operator()(bool with_residual_data);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+
+// Pool op
+// Forward pass for pooling.
+// pool_input is the input.
+// pool_output is a result of the operator.
+struct Pool : public PatternBase {
+  Pool(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "pooling") {}
+
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(pool_op);
+  PATTERN_DECL_NODE(pool_input);
+  PATTERN_DECL_NODE(pool_output);
+};
+
 // ElementwiseAdd used in residual connections.
 // y_var is used and convolution output.
 // The operator is removed, when residual
@@ -766,6 +801,34 @@ struct ConvAffineChannel : public PatternBase {
   PATTERN_DECL_NODE(ac_out);  // Out
 };
 
+// Dequantize + Quantize + anyOP
+// This pattern is used for squashing the dequantize-quantize pairs.
+struct DequantQuantAny : public PatternBase {
+  DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_quant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_in);
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(quant_op);
+  PATTERN_DECL_NODE(quant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+// Dequantize + anyOP
+// This quantize is used for getting number of ops the Dequantize's
+// output is an input to.
+struct DequantAny : public PatternBase {
+  DequantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
 struct TransposeFlattenConcat : public PatternBase {
   TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 7ed2f96eb2..a95588a57b 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
 
@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker {
 
 class DummyOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
index 3d4dc9e2b6..c071d9aed2 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -14,12 +14,16 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+/*
+ * Specifies which operators should use MKLDNN.
+ */
 class MKLDNNPlacementPass : public Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 9eade9eaa8..72fb876d98 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <typeindex>
 #include <typeinfo>
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
new file mode 100644
index 0000000000..67b29512c4
--- /dev/null
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Applies Runtime Context Cache strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(runtime_context_cache_pass,
+              paddle::framework::ir::RuntimeContextCachePass);
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
new file mode 100644
index 0000000000..a6cf1a9ae5
--- /dev/null
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class RuntimeContextCachePass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
new file mode 100644
index 0000000000..b370039915
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Use synchronous batch norm";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->Type() == "batch_norm") {
+        op->SetType("sync_batch_norm");
+      }
+      if (op->Type() == "batch_norm_grad") {
+        op->SetType("sync_batch_norm_grad");
+      }
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
new file mode 100644
index 0000000000..51cce3dca6
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class SyncBatchNormPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
new file mode 100644
index 0000000000..9c94c1746a
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("name", name);
+  op->SetInput("X", inputs);
+  op->SetOutput("Out", outputs);
+}
+
+// (a, conv_w)->conv2d->b
+// (b, bn_scale, bn_bias, mean, var)->batch_norm
+//     ->(c, mean, var, save_mean, save_inv_var)
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
+                                           "bn_bias", "mean", "var", "c",
+                                           "save_mean", "save_inv_var"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
+        v == "var") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "batch_norm", "bn",
+        std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
+        std::vector<std::string>(
+            {"c", "mean", "var", "save_mean", "save_inv_var"}));
+  return prog;
+}
+
+TEST(IsTestPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
+
+  graph = pass->Apply(std::move(graph));
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "bn") {
+        ASSERT_EQ(op->Type(), "sync_batch_norm");
+      }
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(sync_batch_norm_pass);
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 0e7b0cbeb9..8f9c6cb5e9 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
 namespace framework {
@@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const {
   // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
-    info.infer_var_type_(*this, block);
+    InferVarTypeContext context(this, block);
+    info.infer_var_type_(&context);
   }
 }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index df1689764d..1ba2bed886 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
     if (Attrs().count("sub_block") != 0) {
-      throw;
+      throw std::move(exception);
     }
 
     auto& callstack = Attr<std::vector<std::string>>(
         OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
 
     if (callstack.empty()) {
-      throw;
+      throw std::move(exception);
     }
     std::ostringstream sout;
     sout << "Invoke operator " << Type() << " error.\n";
@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     sout << "C++ Callstacks: \n";
     sout << exception.err_str_;
     exception.err_str_ = sout.str();
-    throw;
+    throw std::move(exception);
   } catch (...) {
     std::rethrow_exception(std::current_exception());
   }
@@ -876,7 +876,22 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeContext ctx(Inputs(), Outputs(), scope);
+  if (!HasAttr(kEnableCacheRuntimeContext)) {
+    RuntimeContext ctx(Inputs(), Outputs(), scope);
+    RunImpl(scope, place, &ctx);
+  } else {
+    const Scope* cur_scope = &scope;
+    if (!runtime_ctx_ || pre_scope_ != cur_scope) {
+      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+      pre_scope_ = cur_scope;
+    }
+    RunImpl(scope, place, runtime_ctx_.get());
+  }
+}
+
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place,
+                                 RuntimeContext* runtime_ctx) const {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -891,7 +906,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   OpKernelMap& kernels = kernels_iter->second;
 
   auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -915,8 +930,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
+  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
+                                     &transfered_inplace_vars, runtime_ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
@@ -926,12 +941,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
-  this->InferShape(&infer_shape_ctx);
+  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
+    this->InferShape(&infer_shape_ctx);
+  }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(
-      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
+                                       *runtime_ctx, kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 55629636a8..684960c235 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -62,6 +62,23 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
+/// RuntimeContext is used to relate input/output names of Operator with
+/// the corresponding variables in name scope.
+/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
+/// name scope, since the input/output names of this Op do not change in the
+/// execution, RuntimeContext could be created only at the first iteration of
+/// this Op's execution to save the elapsed time.
+constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
+
+/// If an Op has this attribute, all its kernels should calculate output
+/// variable's shape in the corresponding Compute() function. And
+/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
+/// function in its runtime for speedup.
+/// TODO(luotao): Note that this temporal attribute would be deleted after all
+/// ops contain it.
+constexpr char kAllKernelsMustComputeRuntimeShape[] =
+    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
@@ -447,6 +464,8 @@ class OperatorWithKernel : public OperatorBase {
   // same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
+  void RunImpl(const Scope& scope, const platform::Place& place,
+               RuntimeContext* runtime_ctx) const;
 
   /**
    * Transfer data from scope to a transfered scope. If there is no data need to
@@ -465,6 +484,8 @@ class OperatorWithKernel : public OperatorBase {
 
  protected:
   mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 3e1d61813c..20a8c47d5d 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <tuple>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
@@ -181,13 +183,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
-ParallelExecutor::ParallelExecutor(
-    const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope,
-    const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    ir::Graph *graph)
+ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+                                   const std::vector<std::string> &bcast_vars,
+                                   const std::string &loss_var_name,
+                                   Scope *scope,
+                                   const std::vector<Scope *> &local_scopes,
+                                   const ExecutionStrategy &exec_strategy,
+                                   const BuildStrategy &build_strategy,
+                                   ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -250,13 +253,52 @@ ParallelExecutor::ParallelExecutor(
     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
         member_->places_, nccl_id, build_strategy.num_trainers_,
         build_strategy.trainer_id_));
+
+    // Initialize device context's nccl comm, will be used by normal
+    // Operators like sync_batch_norm, and collective ops.
+    // NOTE: more than one ParallelExecutor with same place, the nccl comm will
+    // be rewrite and there will be some problem.
+    // NOTE: NCCL group-calls and non-group-calls can not use the same
+    // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
+    // same communicators.
+    std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
+    if (nccl_id == nullptr) {
+      dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
+    }
+    for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          pool.Get(member_->places_[dev_id]));
+      if (nccl_id != nullptr) {
+        auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]);
+        dev_ctx->set_nccl_comm(nccl_ctx.comm());
+      } else {
+        auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]);
+        dev_ctx->set_nccl_comm(nccl_ctx.comm());
+      }
+    }
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   }
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_tariners would be grater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor hold multiple
+      // devices.
+      return true;
+    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
+
 // Startup Program has been run. All local scopes has correct parameters.
 
 // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
@@ -338,7 +380,7 @@ ParallelExecutor::ParallelExecutor(
 }
 
 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
   // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +404,7 @@ void ParallelExecutor::BCastParamsToDevices(
         auto place = member_->places_[i];
         void *buffer;
 
-        if (i == 0) {
+        if (i == 0 && trainer_id == 0) {
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ddf60b3946..d4658b9623 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/details/build_strategy.h"
@@ -45,7 +47,7 @@ class ParallelExecutor {
 
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &bcast_vars,
+                            const std::vector<std::string> &bcast_vars,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
@@ -70,7 +72,10 @@ class ParallelExecutor {
            const std::string &fetched_var_name);
 
  private:
-  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+  // broadcast the parameters from the 0th device.
+  // trainer_id the trainer index in nccl distributed training.
+  void BCastParamsToDevices(const std::vector<std::string> &vars,
+                            int trainer_id = 0) const;
   bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 87f0f307d3..d79bf25518 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,7 +34,7 @@ DEFINE_double(
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
 
-DEFINE_bool(fast_eager_deletion_mode, false,
+DEFINE_bool(fast_eager_deletion_mode, true,
             "Fast eager deletion mode. If enabled, memory would release "
             "immediately without waiting GPU kernel ends.");
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index a7f09df491..5f21dae605 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
               << dst_place;
       return;
     }
+#ifdef PADDLE_WITH_MKLDNN
+    if (src.layout() == DataLayout::kMKLDNN) {
+      dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
+    }
+#endif
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index d02c699b97..f55520901c 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -27,6 +27,7 @@ namespace framework {
 class OperatorBase;
 class OpDesc;
 class InferShapeContext;
+class InferVarTypeContext;
 class BlockDesc;
 class Variable;
 
@@ -53,7 +54,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
     const std::vector<BlockDesc*>& grad_block)>;
 
 using InferVarTypeFN =
-    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+    std::function<void(framework::InferVarTypeContext* /*context*/)>;
 
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 64236b78d2..2e9c64d3e6 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -21,26 +23,123 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OpDesc;
+class BlockDesc;
+// default infer var type context
+class InferVarTypeContext {
+ public:
+  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
+      : op_(op), block_(block) {}
+
+  virtual ~InferVarTypeContext() {}
+
+  virtual Attribute GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->GetAttr(name);
+  }
+
+  virtual bool HasVar(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindVarRecursive(name) != nullptr;
+  }
+
+  virtual bool HasInput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Inputs().count(name) > 0;
+  }
+
+  virtual bool HasOutput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Outputs().count(name) > 0;
+  }
+
+  virtual const std::vector<std::string>& Input(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Input(name);
+  }
+
+  virtual const std::vector<std::string>& Output(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Output(name);
+  }
+
+  virtual proto::VarType::Type GetType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetType();
+  }
+
+  virtual void SetType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetType(type);
+  }
+
+  virtual proto::VarType::Type GetDataType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataType();
+  }
+
+  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataType(type);
+  }
+
+  virtual std::vector<proto::VarType::Type> GetDataTypes(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataTypes();
+  }
+
+  virtual void SetDataTypes(
+      const std::string& name,
+      const std::vector<proto::VarType::Type>& multiple_data_type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type);
+  }
+
+  virtual std::vector<int64_t> GetShape(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetShape();
+  }
+
+  virtual void SetShape(const std::string& name,
+                        const std::vector<int64_t>& dims) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetShape(dims);
+  }
+
+  virtual int32_t GetLoDLevel(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetLoDLevel();
+  }
+
+  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level);
+  }
+
+ protected:
+  const OpDesc* op_;
+  BlockDesc* block_;
+};
+
 class VarTypeInference {
  public:
   virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
+  virtual void operator()(InferVarTypeContext* context) const = 0;  // NOLINT
 };
 
 class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const final {
+  void operator()(framework::InferVarTypeContext* ctx) const final {  // NOLINT
     auto in_out_var_names = this->GetInputOutputWithSameType();
 
     for (auto& i_o_n : in_out_var_names) {
-      auto& x_name = op_desc.Input(i_o_n.first).at(0);
-      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+      auto& x_name = ctx->Input(i_o_n.first).at(0);
+      auto& out_name = ctx->Output(i_o_n.second).at(0);
 
-      auto& x = block->FindRecursiveOrCreateVar(x_name);
-      auto& out = block->FindRecursiveOrCreateVar(out_name);
-      out.SetType(x.GetType());
-      out.SetDataType(x.GetDataType());
+      ctx->SetType(out_name, ctx->GetType(x_name));
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
     }
   }
 
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 2a75394fca..6bbb25a573 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
 }  // namespace framework
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index ec8dedd605..0d116a6495 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_PYTHON)
 cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
+cc_library(imperative_profiler SRCS profiler.cc)
 endif()
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 012dfc1c7f..036d2a50a4 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -159,10 +159,9 @@ class Autograd {
       for (auto it : candidate->pre_ops_) {
         for (OpBase* pre_op : it.second) {
           if (!pre_op) continue;
-          VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id "
+          VLOG(5) << "op dep " << candidate->Type() << " trace id "
                   << candidate->trace_id_ << " <---- " << it.first << " <---- "
-                  << pre_op->op_desc_->Type() << " trace id "
-                  << pre_op->trace_id_;
+                  << pre_op->Type() << " trace id " << pre_op->trace_id_;
           if (visited.find(pre_op) == visited.end()) {
             visited.insert(pre_op);
             queue.push_back(pre_op);
@@ -180,10 +179,12 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   PADDLE_ENFORCE(var_->IsInitialized(),
                  "Variable must be initialized when getting numpy tensor");
 
-  std::unique_ptr<VarBase> new_var(new VarBase());
+  // TODO(minqiyang): change this after move unique_name generator to CXX
+  const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
+  std::unique_ptr<VarBase> new_var(new VarBase(
+      "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
   framework::LoDTensor* tensor =
       new_var->var_->GetMutable<framework::LoDTensor>();
-  tensor->Resize(var_->Get<framework::LoDTensor>().dims());
   tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
 
   if (blocking) {
@@ -199,59 +200,103 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   }
 
   if (platform::is_gpu_place(dst_place)) {
-    VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
+    VLOG(3) << "copy tensor " << Name() << " from gpu";
   }
 
   return new_var;
 }
 
 framework::LoDTensor& VarBase::GradValue() {
-  VLOG(3) << "get var grad " << var_desc_->Name();
+  VLOG(3) << "get var grad " << Name();
+  PADDLE_ENFORCE_NOT_NULL(grads_,
+                          "Could not get grad value from no grad variable");
   return *(grads_->var_->GetMutable<framework::LoDTensor>());
 }
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    VLOG(3) << "op with no grad: " << op_desc_->Type();
-    return {};
-  }
+  PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
+                 "%s has no backward implementation", Type());
 
-  VLOG(3) << "apply op grad: " << op_desc_->Type();
-  std::vector<framework::VariableValueMap> grad_outputs;
+  VLOG(3) << "apply op grad: " << Type();
+  std::vector<VarBasePtrMap> tmp_grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
-    grad_outputs.resize(1);
-    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+    tmp_grad_outputs.resize(1);
+    tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
         PyLayer::ApplyGrad(
             backward_id_,
             grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
   } else {
-    grad_outputs.resize(grad_op_descs_.size());
-    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+    const size_t grad_op_count = grad_op_descs_.size();
+
+    tmp_grad_outputs.resize(grad_op_count);
+    for (size_t k = 0; k < grad_op_count; ++k) {
       framework::OpDesc* grad_op_desc = grad_op_descs_[k];
-      VLOG(3) << "op grad " << grad_op_desc->Type();
-      for (auto it : grad_output_vars_[k]) {
-        auto& outputs = grad_outputs[k][it.first];
+      auto& grad_output_variable_map = grad_output_vars_[k];
+
+      VLOG(3) << "apply grad op " << grad_op_desc->Type();
+
+      // Allocate tmp grad output variable
+      for (const auto& it : grad_output_variable_map) {
+        auto& outputs = tmp_grad_outputs[k][it.first];
+        outputs.reserve(it.second.size());
         for (size_t i = 0; i < it.second.size(); ++i) {
+          VarBase* origin_grad_var_base = it.second[i];
+
           // Allocate a new variable
-          Variable* tmp_var = new framework::Variable();
-          tmp_var->GetMutable<framework::LoDTensor>();
-          outputs.push_back(tmp_var);
+          VarBase* tmp_grad_var_base = new VarBase(
+              string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
+              origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
+              place_, true, false);
+          outputs.emplace_back(tmp_grad_var_base);
         }
       }
 
-      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
-
       // No need to do compile time infer shape here.
       // grad_op_desc_->InferShape(*block_);
-      grad_op_desc->InferVarType(block_);
+      // grad_op_desc->InferVarType(block_);
 
       std::unique_ptr<framework::OperatorBase> opbase =
           framework::OpRegistry::CreateOp(*grad_op_desc);
+
+      auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
+      if (info.infer_var_type_) {
+        RuntimeInferVarTypeContext infer_var_type_ctx(
+            &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_);
+        info.infer_var_type_(&infer_var_type_ctx);
+      }
+
       framework::OperatorWithKernel* op_kernel =
           dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
       PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
 
+      // Run grad op
+      framework::VariableValueMap grad_invars_map;
+      framework::VariableValueMap grad_outvars_map;
+
+      for (const auto& it : grad_input_vars_[k]) {
+        auto& grad_invars = grad_invars_map[it.first];
+        grad_invars.reserve(it.second.size());
+        for (const VarBase* grad_inp : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
+                                  grad_op_desc->Type(), grad_inp->Name());
+
+          grad_invars.emplace_back(grad_inp->var_);
+        }
+      }
+
+      for (const auto& it : tmp_grad_outputs[k]) {
+        auto& grad_outvars = grad_outvars_map[it.first];
+        grad_outvars.reserve(it.second.size());
+        for (VarBase* grad_out : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
+                                  grad_op_desc->Type(), grad_out->Name());
+
+          grad_outvars.emplace_back(grad_out->var_);
+        }
+      }
+
+      framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
       framework::Scope scope;
       PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
       p.op.RuntimeInferShape(scope, place_, ctx);
@@ -260,15 +305,19 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
     }
   }
 
+  // Add tmp grad outputs to original grad vars
   for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (auto it : grad_output_vars_[k]) {
-      auto& outputs = grad_outputs[k][it.first];
-      auto& origin_outputs = it.second;
+    for (const auto& it : grad_output_vars_[k]) {
+      auto& outputs = tmp_grad_outputs[k][it.first];
+      const auto& origin_outputs = it.second;
       PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
 
       for (size_t i = 0; i < outputs.size(); ++i) {
-        framework::Variable* grad = outputs[i];
-        framework::Variable* orig_grad = origin_outputs[i];
+        framework::Variable* grad = outputs[i]->var_;
+        framework::Variable* orig_grad = origin_outputs[i]->var_;
+        VLOG(3) << "AddTo Called with orig_grad is: "
+                << origin_outputs[i]->name_ << " Grad to be added is "
+                << outputs[i]->name_;
         AddTo(grad, orig_grad, place_);
         delete grad;
       }
@@ -316,33 +365,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
 
 int PyLayer::NumFuncs() { return py_funcs_.size(); }
 
-std::vector<VarBase*> PyLayer::Apply(int func_id,
-                                     const std::vector<VarBase*>& inputs) {
-  std::vector<framework::Variable*> invars;
-  for (const VarBase* in : inputs) {
-    invars.push_back(in->var_);
-  }
+std::vector<framework::Variable*> PyLayer::Apply(
+    int func_id, const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
-  std::vector<VarBase*> ret;
-  for (Variable* v : outvars) {
-    ret.push_back(new VarBase(v, new VarBase(true)));
-  }
-  return ret;
+  return CallPythonFunc(py_funcs_[func_id], inputs);
 }
 
-std::vector<Variable*> PyLayer::ApplyGrad(
-    int func_id, const std::vector<framework::Variable*>& inputs) {
+std::vector<VarBase*> PyLayer::ApplyGrad(int func_id,
+                                         const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], inputs);
+  auto rets = CallPythonFunc(py_funcs_[func_id], inputs);
+
+  std::vector<VarBase*> outs;
+  outs.reserve(rets.size());
+  for (size_t i = 0U; i != rets.size(); ++i) {
+    outs.emplace_back(new VarBase(
+        string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut),
+                        i),
+        rets[i], nullptr, true));
+  }
+
+  return outs;
 }
 
 std::vector<framework::Variable*> PyLayer::CallPythonFunc(
-    const py::object& callable, const std::vector<framework::Variable*>& ins) {
+    const py::object& callable, const std::vector<VarBase*>& ins) {
   py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
-    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
+    const framework::LoDTensor& t = ins[i]->var_->Get<framework::LoDTensor>();
     in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
   }
   VLOG(3) << "pyfunc in " << py::len(in_args);
@@ -352,6 +403,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
   auto ret_tuple = py::cast<py::tuple>(ret);
   size_t ret_num = py::len(ret_tuple);
   std::vector<framework::Variable*> outs;
+  outs.reserve(ret_num);
   VLOG(3) << "pyfunc out " << ret_num;
   for (size_t i = 0; i < ret_num; ++i) {
     try {
@@ -362,7 +414,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
       auto* tensor = var->GetMutable<framework::LoDTensor>();
       tensor->ShareDataWith(*py_out_tensor);
       tensor->set_lod(py_out_tensor->lod());
-      outs.push_back(var);
+      outs.emplace_back(var);
     } catch (py::cast_error&) {
       PADDLE_THROW("The %d-th output must be LoDTensor", i);
     }
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 7a9f33dc1e..72c548d5e9 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -18,14 +18,16 @@
 #include "paddle/fluid/framework/python_headers.h"
 // clang-format on
 
-#include <map>     // NOLINT
-#include <string>  // NOLINT
-#include <vector>  // NOLINT
-#include <memory>  // NOLINT
+#include <map>            // NOLINT
+#include <string>         // NOLINT
+#include <vector>         // NOLINT
+#include <memory>         // NOLINT
+#include <unordered_map>  // NOLINT
 
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -112,31 +114,55 @@ class OpBase;
  */
 class VarBase {
  public:
-  VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
-
-  explicit VarBase(bool stop_gradient)
-      : VarBase(new framework::Variable(),
-                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
-
-  VarBase(framework::Variable* var, VarBase* grad)
-      : VarBase(var, grad, false) {}
+  // Internal interface, create VarBase from exist variable
+  VarBase(const std::string& name, framework::Variable* var, VarBase* grad,
+          bool stop_gradient)
+      : VarBase(name, var->Get<framework::LoDTensor>().type(),
+                var->Get<framework::LoDTensor>().dims(),
+                var->Get<framework::LoDTensor>().place(), var, grad,
+                stop_gradient, false) {}
+
+  // Python interface
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const std::vector<int64_t>& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
+                persistable) {}
+
+  // Internal interface, create VarBase from with ddim
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
+                persistable) {}
 
  private:
-  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
-      : name_(),
-        var_desc_(nullptr),
+  // TODO(minqiyang): need support SelectedRows
+  VarBase(const std::string& name, framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          framework::Variable* var, VarBase* grad, bool stop_gradient,
+          bool persistable)
+      : name_(name),
+        type_(framework::proto::VarType::LOD_TENSOR),
         var_(var),
         grads_(grad),
-        block_(nullptr),
-        persistable_(false),
         stop_gradient_(stop_gradient),
+        persistable_(persistable),
         pre_op_(nullptr),
         pre_op_out_name_(),
-        pre_op_out_idx_(-1) {}
+        pre_op_out_idx_(-1) {
+    if (!var_) {
+      var_ = new framework::Variable();
+    }
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->Resize(shape);
+    tensor->mutable_data(place, dtype);
+    VLOG(10) << "create varbase: " << name_ << " type: " << dtype
+             << " place: " << place;
+  }
 
  public:
   virtual ~VarBase() {
-    // TODO(minqiyang): remove var desc from block desc
     if (var_) {
       delete var_;
       var_ = nullptr;
@@ -151,14 +177,46 @@ class VarBase {
     pre_op_out_idx_ = -1;
   }
 
-  inline OpBase* PreOp() const { return pre_op_; }
-  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline void SetName(const std::string& name) { name_ = name; }
+  inline std::string Name() const { return name_; }
+
+  inline std::vector<int64_t> Shape() const {
+    if (var_->IsInitialized()) {
+      return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
+    } else {
+      return {};
+    }
+  }
+
+  inline framework::DDim Dims() const {
+    return var_->Get<framework::LoDTensor>().dims();
+  }
+
+  // data type. e.g.. FP32
+  inline void SetDataType(framework::proto::VarType::Type type) {
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(tensor->place(), type);
+  }
+  inline framework::proto::VarType::Type DataType() const {
+    auto tensor = var_->Get<framework::LoDTensor>();
+    return tensor.type();
+  }
+
+  // tensor type. e.g.. LoDTensor
+  inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
+  inline framework::proto::VarType::Type Type() const { return type_; }
 
   inline void SetStopGradient(bool stop_gradient) {
     stop_gradient_ = stop_gradient;
   }
   inline bool IsStopGradient() const { return stop_gradient_; }
 
+  inline void SetPersistable(bool persistable) { persistable_ = persistable; }
+  inline bool IsPersistable() const { return persistable_; }
+
+  inline OpBase* PreOp() const { return pre_op_; }
+  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
+
   void RunBackward();
 
   inline void ResetPreOp(OpBase* op) {
@@ -180,7 +238,7 @@ class VarBase {
   }
 
   void ClearGradient() {
-    VLOG(1) << "clear gradient of " << var_desc_->Name();
+    VLOG(1) << "clear gradient of " << Name();
     if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) {
       auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
       operators::math::set_constant(
@@ -196,23 +254,20 @@ class VarBase {
                                       const bool blocking) const;
 
   inline std::string GradName() const {
-    PADDLE_ENFORCE(
-        var_desc_,
-        "Couldn't get gradient variable's name, please call backward() first");
-    return string::Sprintf("%s@IGrad", var_desc_->Name());
+    return string::Sprintf("%s@IGrad", Name());
   }
 
   std::string name_;
-  framework::VarDesc* var_desc_;
+  framework::proto::VarType::Type type_;
+  platform::Place place_;
 
   framework::Variable* var_;
   VarBase* grads_;
 
-  framework::BlockDesc* block_;
-  bool persistable_;
-
  private:
   bool stop_gradient_;
+  bool persistable_;
+
   OpBase* pre_op_;
   std::string pre_op_out_name_;
   int pre_op_out_idx_;
@@ -223,11 +278,11 @@ class VarBase {
  */
 class PYBIND11_HIDDEN OpBase {
  public:
-  OpBase()
-      : op_desc_(nullptr),
+  OpBase(const std::string& type)
+      : type_(type),
+        trace_id_(-1),
         forward_id_(-1),
         backward_id_(-1),
-        trace_id_(-1),
         place_(platform::CPUPlace()),
         backward_hooks_() {}
 
@@ -249,13 +304,40 @@ class PYBIND11_HIDDEN OpBase {
 
   std::map<std::string, std::vector<VarBase*>> ApplyGrad();
 
+  inline std::string Type() const { return type_; }
+  inline std::string GradOpType(size_t index) const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]);
+    return grad_op_descs_[index]->Type();
+  }
+
   void RegisterBackwardHooks(const py::object& callable);
 
   void InvokeBackwardHooks();
 
-  // One of `op_desc_` or `forward_id_` is set, not both.
-  // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
-  framework::OpDesc* op_desc_;
+  void TrackPreOp(const std::string& inp_name,
+                  const std::vector<VarBase*>& inputs) {
+    auto& pre_ops_list = pre_ops_[inp_name];
+    pre_ops_list.reserve(inputs.size());
+    auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
+    for (VarBase* inp_var : inputs) {
+      if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
+        VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
+                << inp_name;
+        pre_ops_list.emplace_back(inp_var->PreOp());
+        pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
+      } else {
+        VLOG(3) << "no pre op in slot " << inp_name
+                << " input var stop_gradient: " << inp_var->IsStopGradient();
+        pre_ops_list.emplace_back(nullptr);
+        // pre_ops_out_idx_list.push_back(-1);
+      }
+    }
+  }
+
+  std::string type_;
+  // One of `trace_id_` or `forward_id_` is set, not both.
+  // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_.
+  int trace_id_;
   int forward_id_;
 
   // When has backward, one of `grad_op_descs_` or `backward_id_` is set,
@@ -263,7 +345,6 @@ class PYBIND11_HIDDEN OpBase {
   // Note: each fwd op corresponds to a vector of bwd ops.
   std::vector<framework::OpDesc*> grad_op_descs_;
   int backward_id_;
-  int trace_id_;
 
   platform::Place place_;
 
@@ -273,13 +354,13 @@ class PYBIND11_HIDDEN OpBase {
   std::map<std::string, std::vector<int>> pre_ops_out_idx_;
 
   // Inputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_input_vars_;
+  std::vector<VarBasePtrMap> grad_input_vars_;
   // Outputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_output_vars_;
-
-  framework::BlockDesc* block_;
+  std::vector<VarBasePtrMap> grad_output_vars_;
 
   std::vector<py::object> backward_hooks_;
+
+  framework::AttributeMap attrs_;
 };
 
 class Layer {
@@ -303,15 +384,134 @@ class PyLayer {
 
   static int NumFuncs();
 
-  static std::vector<VarBase*> Apply(int func_id,
-                                     const std::vector<VarBase*>& inputs);
+  static std::vector<framework::Variable*> Apply(
+      int func_id, const std::vector<VarBase*>& inputs);
 
-  static std::vector<framework::Variable*> ApplyGrad(
-      int func_id, const std::vector<framework::Variable*>& inputs);
+  static std::vector<VarBase*> ApplyGrad(int func_id,
+                                         const std::vector<VarBase*>& inputs);
 
  private:
   static std::vector<framework::Variable*> CallPythonFunc(
-      const py::object& callable, const std::vector<framework::Variable*>& ins);
+      const py::object& callable, const std::vector<VarBase*>& ins);
+};
+
+// infer var type context for imperative mode
+class PYBIND11_HIDDEN RuntimeInferVarTypeContext
+    : public framework::InferVarTypeContext {
+ public:
+  RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs,
+                             imperative::VarBasePtrMap* outputs,
+                             const framework::AttributeMap* attrs_map)
+      : InferVarTypeContext(nullptr, nullptr),
+        inputs_(inputs),
+        outputs_(outputs),
+        attrs_(attrs_map),
+        input_names_(),
+        output_names_(),
+        var_set_() {
+    input_names_.reserve(inputs_->size());
+    for (auto& it : *inputs_) {
+      for (imperative::VarBase* var : it.second) {
+        input_names_[it.first].emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+
+    output_names_.reserve(outputs_->size());
+    for (auto& it : *outputs_) {
+      for (imperative::VarBase* var : it.second) {
+        output_names_[it.first].emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+  }
+
+  virtual ~RuntimeInferVarTypeContext() {}
+
+  framework::Attribute GetAttr(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(attrs_);
+    return attrs_->at(name);
+  }
+
+  bool HasVar(const std::string& name) const override {
+    return var_set_.count(name) > 0;
+  }
+
+  bool HasInput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(inputs_);
+    return inputs_->count(name) > 0;
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(outputs_);
+    return outputs_->count(name) > 0;
+  }
+
+  const std::vector<std::string>& Input(
+      const std::string& name) const override {
+    return input_names_.at(name);
+  }
+
+  const std::vector<std::string>& Output(
+      const std::string& name) const override {
+    return output_names_.at(name);
+  }
+
+  framework::proto::VarType::Type GetType(
+      const std::string& name) const override {
+    return var_set_.at(name)->Type();
+  }
+
+  void SetType(const std::string& name,
+               framework::proto::VarType::Type type) override {
+    var_set_[name]->SetType(type);
+  }
+
+  framework::proto::VarType::Type GetDataType(
+      const std::string& name) const override {
+    return var_set_.at(name)->DataType();
+  }
+
+  void SetDataType(const std::string& name,
+                   framework::proto::VarType::Type type) override {
+    var_set_[name]->SetDataType(type);
+  }
+
+  std::vector<framework::proto::VarType::Type> GetDataTypes(
+      const std::string& name) const override {
+    PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType");
+  }
+
+  void SetDataTypes(const std::string& name,
+                    const std::vector<framework::proto::VarType::Type>&
+                        multiple_data_type) override {
+    PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType");
+  }
+
+  std::vector<int64_t> GetShape(const std::string& name) const override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+
+  void SetShape(const std::string& name,
+                const std::vector<int64_t>& dims) override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+
+  int32_t GetLoDLevel(const std::string& name) const override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+
+  void SetLoDLevel(const std::string& name, int32_t lod_level) override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+
+ private:
+  const imperative::VarBasePtrMap* inputs_;
+  imperative::VarBasePtrMap* outputs_;
+  const framework::AttributeMap* attrs_;
+  std::unordered_map<std::string, std::vector<std::string>> input_names_;
+  std::unordered_map<std::string, std::vector<std::string>> output_names_;
+  std::unordered_map<std::string, imperative::VarBase*> var_set_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
new file mode 100644
index 0000000000..34570b3a60
--- /dev/null
+++ b/paddle/fluid/imperative/profiler.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/profiler.h"
+
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
+
+DEFINE_string(
+    tracer_profile_fname, "xxgperf",
+    "Profiler filename for imperative tracer, which generated by gperftools."
+    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
+
+namespace paddle {
+namespace imperative {
+
+static std::once_flag gTracerProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gTracerProfilerStarted = false;
+#endif
+
+void StartProfile() {
+  if (!FLAGS_tracer_profile_fname.empty()) {
+    std::call_once(gTracerProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
+      gTracerProfilerStarted = true;
+#else
+      LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                      "FLAGS_tracer_profile_fname will be ignored";
+#endif
+    });
+  }
+}
+
+void StopProfile() {
+#ifdef WITH_GPERFTOOLS
+  ProfilerFlush();
+#else
+  LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                  "FLAGS_tracer_profile_fname will be ignored";
+#endif
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/imperative/profiler.h
similarity index 67%
rename from paddle/fluid/framework/details/eager_deletion_pass.h
rename to paddle/fluid/imperative/profiler.h
index d7a7a9709d..d52aeed4e8 100644
--- a/paddle/fluid/framework/details/eager_deletion_pass.h
+++ b/paddle/fluid/imperative/profiler.h
@@ -14,19 +14,12 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
 namespace paddle {
-namespace framework {
-namespace details {
+namespace imperative {
+
+extern void StartProfile();
 
-class EagerDeletionPass : public ir::Pass {
- protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
-};
+extern void StopProfile();
 
-}  // namespace details
-}  // namespace framework
+}  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 0cb1676372..7c9d0af3ec 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -19,52 +19,44 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-
-DEFINE_string(
-    tracer_profile_fname, "",
-    "Profiler filename for imperative tracer, which generated by gperftools."
-    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
-
 namespace paddle {
 namespace imperative {
 
-static std::once_flag gTracerProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gTracerProfilerStarted = false;
-#endif
-
 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
                   std::vector<framework::OpDesc*>* grad_op_descs,
                   std::unordered_map<std::string, std::string>* grad_to_var) {
   PADDLE_ENFORCE(grad_op_descs->empty());
-  std::vector<std::unique_ptr<framework::OpDesc>> descs =
-      framework::OpInfoMap::Instance()
-          .Get(op_desc.Type())
-          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  const framework::OpInfo& op_info =
+      framework::OpInfoMap::Instance().Get(op_desc.Type());
+  if (!op_info.grad_op_maker_) return;
 
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
+      op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
   for (auto& desc : descs) {
     grad_op_descs->emplace_back(desc.release());
   }
 }
 
-void InitVar(framework::Variable* var, framework::Variable* grad_var,
-             platform::DeviceContext* dev_ctx) {
+void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base");
   PADDLE_ENFORCE_NOT_NULL(dev_ctx,
                           "Could not get valid device from forward op");
-  auto& var_t = var->Get<framework::LoDTensor>();
-  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-      var_t.dims(), dev_ctx->GetPlace());
-  operators::math::set_constant(
-      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
+
+  if (var->grads_ == nullptr) {
+    auto& var_t = var->var_->Get<framework::LoDTensor>();
+    var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32,
+                              framework::vectorize(var_t.dims()),
+                              dev_ctx->GetPlace(), true, false);
+    auto grad_t = var->grads_->var_->GetMutable<framework::LoDTensor>();
+    operators::math::set_constant(*dev_ctx, grad_t, 0.0);
+  }
 }
 
 platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
@@ -85,92 +77,135 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
   return result;
 }
 
-Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
-  if (!FLAGS_tracer_profile_fname.empty()) {
-    std::call_once(gTracerProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
-      gTracerProfilerStarted = true;
-#else
-      LOG(WARNING) << "Paddle is not compiled with gperftools. "
-                      "FLAGS_tracer_profile_fname will be ignored";
-#endif
-    });
+framework::VariableNameMap CreateInputVarNameMap(
+    const OpBase* op, const VarBasePtrMap& varbase_map) {
+  framework::VariableNameMap result;
+
+  auto& info_map = framework::OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(op->Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) {
+    return result;
   }
-}
 
-std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
-                                    const VarBasePtrMap& outputs,
-                                    framework::BlockDesc* block,
-                                    const platform::Place expected_place,
-                                    const bool stop_gradient) {
-#ifdef WITH_GPERFTOOLS
-  if (gTracerProfilerStarted) {
-    ProfilerFlush();
+  for (auto& in : op_info->Proto().inputs()) {
+    auto it = varbase_map.find(in.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE(in.dispensable());
+      result[in.name()] = {};
+    } else {
+      auto var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (VarBase* var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[in.name()] = args;
+    }
   }
-#endif
+  return result;
+}
 
-  std::map<std::string, VarBase*> vars;
+framework::VariableNameMap CreateOutputVarNameMap(
+    const OpBase* op, const VarBasePtrMap& varbase_map) {
+  framework::VariableNameMap result;
 
-  framework::OpDesc* op_desc = op->op_desc_;
-  VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id "
-          << op->trace_id_;
-  op_desc->InferShape(*block);
-  op_desc->InferVarType(block);
+  auto& info_map = framework::OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(op->Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) {
+    return result;
+  }
 
-  std::unique_ptr<framework::OperatorBase> op_base =
-      framework::OpRegistry::CreateOp(*op_desc);
+  for (auto& out : op_info->Proto().outputs()) {
+    auto it = varbase_map.find(out.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE(out.dispensable());
+      result[out.name()] = {};
+    } else {
+      auto var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (VarBase* var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[out.name()] = args;
+    }
+  }
+  return result;
+}
+
+Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
 
+std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
+                                    VarBasePtrMap* outputs,
+                                    framework::AttributeMap attrs_map,
+                                    const platform::Place expected_place,
+                                    const bool stop_gradient) {
   framework::VariableValueMap invars_map;
   framework::VariableValueMap outvars_map;
 
+  // Construct input_vars_map and output_vars_map
+  std::map<std::string, VarBase*> current_vars_map;
   op->input_vars_ = inputs;
   for (auto it : op->input_vars_) {
     auto& invars = invars_map[it.first];
     invars.reserve(it.second.size());
     for (VarBase* inp : it.second) {
-      PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
-                              op->op_desc_->Type(), inp->var_desc_->Name());
+      PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(),
+                              inp->Name());
 
       invars.emplace_back(inp->var_);
-      vars[inp->var_desc_->Name()] = inp;
-      if (inp->PreOp() && !inp->IsStopGradient()) {
-        op->pre_ops_[it.first].push_back(inp->PreOp());
-        op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
-        VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type();
-      } else {
-        op->pre_ops_[it.first].push_back(nullptr);
+      if (!stop_gradient) {
+        current_vars_map[inp->Name()] = inp;
       }
-      VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
-              << inp->var_->IsInitialized() << " stop_gradient "
-              << inp->IsStopGradient();
+      VLOG(3) << "input var name: " << inp->Name()
+              << " inited: " << inp->var_->IsInitialized()
+              << " stop_grad: " << inp->IsStopGradient();
     }
+    op->TrackPreOp(it.first, it.second);
   }
 
-  op->output_vars_ = outputs;
+  op->output_vars_ = *outputs;
   for (auto it : op->output_vars_) {
     auto& outvars = outvars_map[it.first];
     const std::vector<VarBase*>& outputs = it.second;
     outvars.reserve(outputs.size());
-    for (size_t i = 0; i < outputs.size(); ++i) {
+    for (size_t i = 0U; i < outputs.size(); ++i) {
       VarBase* out = outputs[i];
       outvars.emplace_back(out->var_);
-      vars[out->var_desc_->Name()] = out;
-
-      framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
-      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-        out->var_->GetMutable<framework::LoDTensor>();
-      } else {
-        LOG(ERROR) << "tracer doesn't support yet";
-      }
       out->TrackPreOp(op, it.first, i, stop_gradient);
+      if (!stop_gradient) {
+        current_vars_map[out->Name()] = out;
+      }
 
-      VLOG(3) << "output vname " << out->var_desc_->Name() << " "
-              << out->var_->IsInitialized();
+      VLOG(3) << "input var name: " << out->Name()
+              << " inited: " << out->var_->IsInitialized()
+              << " stop_grad: " << out->IsStopGradient();
     }
   }
 
-  VLOG(3) << "tracer running " << op_desc->Type();
+  // Check attrs and create op
+  framework::VariableNameMap invars_name_map =
+      CreateInputVarNameMap(op, inputs);
+  framework::VariableNameMap outvars_name_map =
+      CreateOutputVarNameMap(op, *outputs);
+
+  auto& info = framework::OpInfoMap::Instance().Get(op->Type());
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(&attrs_map);
+  }
+
+  std::unique_ptr<framework::OperatorBase> op_base =
+      framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
+                                      outvars_name_map, attrs_map);
+
+  if (info.infer_var_type_) {
+    RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map);
+    info.infer_var_type_(&infer_var_type_ctx);
+  }
+
+  // TODO(minqiyang): Support infer var type in imperative mode
+  // Run forward op
+  VLOG(3) << "tracer running " << op->Type();
   framework::RuntimeContext ctx(invars_map, outvars_map);
 
   // TODO(panyx0718): Cache p.
@@ -186,36 +221,45 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
                                   prepared_op.ctx, prepared_op.kernel_configs));
 
+  // construct backward op
   std::set<std::string> vars_saved_for_backward;
-
   if (!stop_gradient) {
+    VLOG(5) << "start construct backward op";
+
+    // construct grad op descs
+    op->attrs_ = attrs_map;
+    std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
+        op->Type(), invars_name_map, outvars_name_map, attrs_map));
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
         new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
+    // NOTE(minqiyang): We don't support control flow op in imperative now
+    // Add grad_block_ when we want to support it
+    CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get());
+
+    VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type();
 
-    op->grad_input_vars_.resize(op->grad_op_descs_.size());
-    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    const size_t grad_op_count = op->grad_op_descs_.size();
 
-    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+    op->grad_input_vars_.resize(grad_op_count);
+    op->grad_output_vars_.resize(grad_op_count);
+
+    for (size_t i = 0; i < grad_op_count; ++i) {
       framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
       for (auto it : grad_op_desc->Inputs()) {
         auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        grad_in_vars.reserve(it.second.size());
         for (const std::string& grad_invar : it.second) {
-          block->FindRecursiveOrCreateVar(grad_invar);
           auto var_it = grad_to_var->find(grad_invar);
           if (var_it == grad_to_var->end()) {
-            auto fwd_var_it = vars.find(grad_invar);
-            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            auto fwd_var_it = current_vars_map.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
             // Forward inputs or outputs.
-            grad_in_vars.push_back(fwd_var_it->second->var_);
+            grad_in_vars.emplace_back(fwd_var_it->second);
           } else {
-            VarBase* var = vars[var_it->second];
-            if (!var->grads_->var_->IsInitialized()) {
-              InitVar(var->var_, var->grads_->var_,
-                      prepared_op.GetDeviceContext());
-            }
+            VarBase* var = current_vars_map[var_it->second];
+            InitGrad(var, prepared_op.GetDeviceContext());
             // Douts.
-            grad_in_vars.push_back(var->grads_->var_);
+            grad_in_vars.emplace_back(var->grads_);
           }
 
           vars_saved_for_backward.insert(it.first);
@@ -225,48 +269,47 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       for (auto it : grad_op_desc->Outputs()) {
         auto& grad_out_vars = op->grad_output_vars_[i][it.first];
         for (const std::string& grad_outvar : it.second) {
-          block->FindRecursiveOrCreateVar(grad_outvar);
           auto var_it = grad_to_var->find(grad_outvar);
           PADDLE_ENFORCE(var_it != grad_to_var->end(),
                          "Could not found the grad op output var, should this "
                          "operator %s's stop gradient be True",
-                         op_desc->Type());
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_,
-                    prepared_op.GetDeviceContext());
-          }
-          grad_out_vars.push_back(var->grads_->var_);
+                         op->Type());
+          VarBase* var = current_vars_map[var_it->second];
+          InitGrad(var, prepared_op.GetDeviceContext());
+          grad_out_vars.push_back(var->grads_);
+          VLOG(3) << "grads output var name: " << var->name_;
         }
       }
     }
   }
 
-  op->block_ = block;
   return vars_saved_for_backward;
 }
 
 std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
                                       const std::vector<VarBase*>& inputs,
                                       bool stop_gradient) {
-  VLOG(3) << "py_trace";
+  VLOG(3) << "py_trace " << op->Type();
+
   op->input_vars_[PyLayer::kFwdInp] = inputs;
-  op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
-  for (VarBase* inp : inputs) {
-    if (inp->PreOp() && !inp->IsStopGradient()) {
-      op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
-      op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
-    } else {
-      op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr);
-    }
-  }
 
-  auto& outputs = op->output_vars_[PyLayer::kFwdOut];
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    VarBase* out = outputs[i];
+  std::vector<framework::Variable*> ret_vars =
+      PyLayer::Apply(op->forward_id_, inputs);
+
+  op->TrackPreOp(PyLayer::kFwdInp, inputs);
+
+  std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
+  outputs.reserve(ret_vars.size());
+  for (size_t i = 0U; i != ret_vars.size(); ++i) {
+    framework::Variable* v = ret_vars[i];
+    VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v,
+                               nullptr, stop_gradient);
+    outputs.emplace_back(out);
     out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }
+
   if (!stop_gradient) {
+    VLOG(5) << "start construct backward op";
     op->grad_input_vars_.resize(1);
     op->grad_output_vars_.resize(1);
     auto& grad_input_vars =
@@ -274,30 +317,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     auto& grad_output_vars =
         op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
 
-    for (const VarBase* inp : inputs) {
-      grad_input_vars.push_back(inp->var_);
+    for (VarBase* inp : inputs) {
+      grad_input_vars.push_back(inp);
     }
     for (VarBase* out : outputs) {
-      grad_input_vars.push_back(out->var_);
+      grad_input_vars.push_back(out);
     }
 
+    // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
     platform::CPUPlace place;
     for (VarBase* out : outputs) {
-      grad_input_vars.push_back(out->grads_->var_);
-      if (!grad_input_vars.back()->IsInitialized()) {
-        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
-        InitVar(out->var_, grad_input_vars.back(),
-                platform::DeviceContextPool::Instance().Get(place));
-      }
+      InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
+      grad_input_vars.push_back(out->grads_);
     }
 
-    for (const VarBase* inp : inputs) {
-      grad_output_vars.push_back(inp->grads_->var_);
-      if (!grad_output_vars.back()->IsInitialized()) {
-        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
-        InitVar(inp->var_, grad_output_vars.back(),
-                platform::DeviceContextPool::Instance().Get(place));
-      }
+    for (VarBase* inp : inputs) {
+      InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
+      grad_output_vars.push_back(inp->grads_);
     }
   }
   return outputs;
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 8a0267c37f..a87f3b8009 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -17,6 +17,8 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/op_desc.h"
@@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc,
                   framework::OpDesc** grad_op_desc,
                   std::unordered_map<std::string, std::string>* grad_to_var);
 
-void InitVar(framework::Variable* var, framework::Variable* grad_var);
+void InitVar(const VarBase* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx);
 
 platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
 
@@ -45,8 +48,8 @@ class Tracer {
   virtual ~Tracer() {}
 
   std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
-                              const VarBasePtrMap& outputs,
-                              framework::BlockDesc* block,
+                              VarBasePtrMap* outputs,  // NOLINT
+                              framework::AttributeMap attrs_map,
                               const platform::Place expected_place,
                               const bool stop_gradient = false);
 
diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h
index fc9e42f8d0..c51ce931de 100644
--- a/paddle/fluid/imperative/type_defs.h
+++ b/paddle/fluid/imperative/type_defs.h
@@ -25,6 +25,7 @@ class VarBase;
 class OpBase;
 
 typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
+typedef std::map<std::string, std::vector<const VarBase*>> ConstVarBasePtrMap;
 typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
 
 }  // namespace imperative
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 762640d6d1..d27ef8fe3c 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -91,5 +91,5 @@ if(WITH_TESTING)
   add_subdirectory(tests/book)
   if(WITH_INFERENCE_API_TEST)
     add_subdirectory(tests/api)
-  endif()  
+  endif()
 endif()
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 89e934ae27..997f3575f4 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -27,6 +27,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
@@ -38,7 +39,10 @@
 namespace paddle {
 namespace inference {
 namespace analysis {
+
 using framework::ir::Graph;
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
 
 /*
  * The argument definition of both Pass and PassManagers.
@@ -128,6 +132,17 @@ struct Argument {
   DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                       std::unordered_set<std::string>);
 
+  // A set of op types to enable their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
+                      std::unordered_set<std::string>);
+
+  // A set of op IDs to exclude from enabling their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
+                      std::unordered_set<int>);
+
+  // Scales for variables to be quantized
+  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
+
   // Passed from config.
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 1cdb4881fb..1556caa464 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -55,14 +57,21 @@ void IRPassManager::CreatePasses(Argument *argument,
                                   ".dot";
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
-    }
-    if (pass_name == "mkldnn_placement_pass") {
+    } else if (pass_name == "mkldnn_placement_pass") {
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(
                     argument->mkldnn_enabled_op_types()));
-    }
-
-    if (pass_name == "tensorrt_subgraph_pass") {
+    } else if (pass_name == "cpu_quantize_placement_pass") {
+      pass->Set("quantize_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->quantize_enabled_op_types()));
+      pass->Set(
+          "quantize_excluded_op_ids",
+          new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
+    } else if (pass_name == "cpu_quantize_pass") {
+      pass->Set("quant_var_scales",
+                new VarQuantScale(argument->quant_var_scales()));
+    } else if (pass_name == "tensorrt_subgraph_pass") {
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7741111222..1be25de497 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -202,6 +202,7 @@ void AnalysisConfig::Update() {
       // Append after the Affine_channel_conv_fuse pass.
       pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
+    pass_builder()->DeletePass("runtime_context_cache_pass");
   }
 
   if (use_mkldnn_) {
@@ -219,7 +220,14 @@ void AnalysisConfig::Update() {
   }
 
   if (enable_memory_optim_) {
-    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+    auto analysis_passes = pass_builder()->AnalysisPasses();
+    auto memory_opti_pass_name = "memory_optimize_pass";
+    bool already_exists =
+        std::find(analysis_passes.begin(), analysis_passes.end(),
+                  memory_opti_pass_name) != analysis_passes.end();
+    if (!already_exists) {
+      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
+    }
   }
 
   if (ir_debug_) {
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 963986f245..bf2e3593c2 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
 fi
 
 PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
+URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
 # download vis_demo data
 function download() {
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index cf02901d96..9a40cf4b60 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 }
 template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
 template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data);
 template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
 template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
+template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data);
 
 template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                             int *size) const;
 template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
                                                 int *size) const;
+template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place,
+                                                int *size) const;
 template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
 template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place);
 
 void *ZeroCopyTensor::FindTensor() const {
   PADDLE_ENFORCE(!name_.empty(),
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 1ce3fe5af7..258a79fa4e 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -139,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor,
 }
 
 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                    const std::vector<std::vector<T>> &data) {
-  int size{0};
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const std::vector<std::vector<T>> &data) {
   auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
   int c = 0;
   for (const auto &f : data) {
@@ -149,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
       ptr[c++] = v;
     }
   }
-  return size;
+}
+
+template <typename T>
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const PaddleBuf &data) {
+  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  for (size_t i = 0; i < data.length() / sizeof(T); i++) {
+    ptr[i] = *(reinterpret_cast<T *>(data.data()) + i);
+  }
 }
 
 static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 92c24647e8..d413a418c8 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -80,6 +80,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
         "conv_elementwise_add_act_fuse_pass",   //
         "conv_elementwise_add2_act_fuse_pass",  //
         "conv_elementwise_add_fuse_pass",       //
+        "runtime_context_cache_pass",           //
 #endif
   });
 
@@ -90,6 +91,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
 
+void GpuPassStrategy::EnableQuantizer() {
+  LOG(ERROR) << "GPU not support quantization yet";
+}
+
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
   analysis_passes_.push_back(pass);
 }
@@ -115,6 +120,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
       "conv_eltwiseadd_bn_fuse_pass",  //
       "is_test_pass",                  //
       "identity_scale_op_clean_pass",  //
+      "runtime_context_cache_pass",    //
   });
   use_gpu_ = false;
 }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 2524d89fcd..84645fef01 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -84,6 +84,10 @@ class PassStrategy : public PaddlePassBuilder {
    */
   virtual void EnableMKLDNN() {}
 
+  /** Enable quantize optimization
+   */
+  virtual void EnableQuantizer() {}
+
   bool use_gpu() const { return use_gpu_; }
 
   virtual ~PassStrategy() = default;
@@ -124,6 +128,16 @@ class CpuPassStrategy : public PassStrategy {
     use_mkldnn_ = false;
 #endif
   }
+
+  void EnableQuantizer() override {
+    if (!use_quantizer_) {
+      passes_.push_back("cpu_quantize_placement_pass");
+    }
+    use_quantizer_ = true;
+  }
+
+ protected:
+  bool use_quantizer_{false};
 };
 
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -138,6 +152,7 @@ class GpuPassStrategy : public PassStrategy {
   }
 
   void EnableMKLDNN() override;
+  void EnableQuantizer() override;
 
   virtual ~GpuPassStrategy() = default;
 };
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 8f7b6f31de..2f17a44e0c 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -110,19 +110,19 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8)
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL)
 
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
 if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
-    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 3f6c933f2b..5157bd280d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) {
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);
 
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
     PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
     size_t size = GetSize(outputs[0]);
     PADDLE_ENFORCE_GT(size, 0);
@@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("cos_sim_2.tmp_0");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
+}
+
 // Compare Deterministic result
 TEST(Analyzer_Pyramid_DNN, compare_determine) {
   AnalysisConfig cfg;
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 36282b3efe..dcf4b38ce8 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) {
                  input_slots_all, &outputs, 2 /* multi_thread */);
 }
 
-// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
-// on the complex RNN1 model.
-TEST(Analyzer_rnn1, ZeroCopy) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  PaddlePlace place;
-
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-  config.SwitchUseFeedFetchOps(true);
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-
-  config.SwitchUseFeedFetchOps(
-      true);  // the analysis predictor needs feed/fetch.
-  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-  NEW_TENSOR(data_lod_attention);
-  NEW_TENSOR(cell_init);
-  NEW_TENSOR(data);
-  NEW_TENSOR(week);
-  NEW_TENSOR(minute);
-  NEW_TENSOR(hidden_init);
-
-  // Prepare data for AnalysisPredictor
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
-                        data_tensor.get(), hidden_init_tensor.get(),
-                        week_tensor.get(), minute_tensor.get(), &data,
-                        FLAGS_batch_size);
-
-  // Prepare data for NativePredictor
-  std::vector<std::vector<PaddleTensor>> native_inputs;
-  SetInput(&native_inputs);
-  std::vector<PaddleTensor> native_outputs;
-  std::vector<PaddleTensor> analysis_outputs;
-
-  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
-  // Run analysis predictor
-
-  int num_ops;
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
-  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-  ASSERT_EQ(num_ops,
-            13);  // After graph optimization, only 13 operators exists.
-
-  Timer timer;
-  double total_time{0};
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    timer.tic();
-    predictor->ZeroCopyRun();
-    total_time += timer.toc();
-  }
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-
-  ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
-  LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
-
-  int output_size{0};  // this is the number of elements not memory size
-  auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (int i = 0; i < output_size; i++) {
-    EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
-  }
-}
-
-TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      NEW_TENSOR(data_lod_attention);
-      NEW_TENSOR(cell_init);
-      NEW_TENSOR(data);
-      NEW_TENSOR(week);
-      NEW_TENSOR(minute);
-      NEW_TENSOR(hidden_init);
-
-      // Prepare data for AnalysisPredictor
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      Timer timer;
-      double total_time{0};
-
-      for (int i = 0; i < FLAGS_repeat; i++) {
-        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
-                              cell_init_tensor.get(), data_tensor.get(),
-                              hidden_init_tensor.get(), week_tensor.get(),
-                              minute_tensor.get(), &data, FLAGS_batch_size);
-
-        timer.tic();
-        predictor->ZeroCopyRun();
-        total_time += timer.toc();
-      }
-
-      total_time_of_threads += total_time;
-
-      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_rnn1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
 
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("final_output.tmp_1");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index cca2ab1ee1..19fa5528da 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
   }
@@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) {
                        input_slots_all);
 }
 
-void analysis_fuse_statis(bool use_zerocopy) {
+// Check the fuse status
+TEST(Analyzer_seq_pool1, fuse_statis) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg.SwitchUseFeedFetchOps(!use_zerocopy);
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
@@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) {
   EXPECT_EQ(num_ops, 171);
 }
 
-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
-
-void PrepareZeroCopyInputs(
-    const std::unique_ptr<PaddlePredictor> &predictor,
-    std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  // only feed one batch
-  const auto &one_batch = data.NextBatch();
-  inputs->clear();
-  for (size_t i = 0; i < one_batch.size(); ++i) {
-    auto &slot = one_batch[i];
-    auto tensor = predictor->GetInputTensor(slot.name + "_embed");
-    tensor->Reshape(slot.shape);
-    tensor->SetLoD({slot.lod});
-    ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
-    inputs->emplace_back(std::move(tensor));
-  }
-}
-
-// return the output values
-std::vector<float> zerocopy_profile(int repeat_times) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-  PrepareZeroCopyInputs(predictor, &inputs);
-  auto output_tensor = predictor->GetOutputTensor(out_var_name);
-  Timer timer;
-  LOG(INFO) << "Warm up run...";
-  timer.tic();
-  predictor->ZeroCopyRun();
-  PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
-  if (FLAGS_profile) {
-    paddle::platform::ResetProfiler();
-  }
-  LOG(INFO) << "Run " << repeat_times << " times...";
-  timer.tic();
-  for (int i = 0; i < repeat_times; i++) {
-    predictor->ZeroCopyRun();
-  }
-  PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
-            1);
-
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-  PaddlePlace place;
-  int output_size{0};
-  auto *pdata = output_tensor->data<float>(&place, &output_size);
-  std::vector<float> res(output_size);
-  for (int i = 0; i < output_size; ++i) {
-    res[i] = pdata[i];
-  }
-  return res;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
-
-TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-      PrepareZeroCopyInputs(predictor, &inputs);
-      auto output_tensor = predictor->GetOutputTensor(out_var_name);
-      Timer timer;
-      double total_time{0};
-
-      LOG(INFO) << "Warm up run...";
-      timer.tic();
-      predictor->ZeroCopyRun();
-      PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
-      if (FLAGS_profile) {
-        paddle::platform::ResetProfiler();
-      }
-      int repeat_times = FLAGS_repeat;
-      LOG(INFO) << "Run " << repeat_times << " times...";
-      timer.tic();
-
-      for (int i = 0; i < repeat_times; i++) {
-        predictor->ZeroCopyRun();
-      }
-      total_time += timer.toc();
-      total_time_of_threads += total_time;
-
-      LOG(INFO) << "thread time: " << total_time / repeat_times;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
-
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_seq_pool1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
 
-TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(true);
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  std::vector<PaddleTensor> native_outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
-  EXPECT_EQ(native_outputs.size(), 1UL);
-
-  auto zerocopy_output = zerocopy_profile(1);
-  EXPECT_EQ(zerocopy_output.size() * sizeof(float),
-            native_outputs.front().data.length());
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (size_t i = 0; i < zerocopy_output.size(); ++i) {
-    EXPECT_LT(
-        std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]),
-        1e-3);
-  }
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back(out_var_name);
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
index 9d17f38ab7..f765f55611 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -183,10 +183,13 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_Transformer, profile) {
+void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   std::vector<PaddleTensor> outputs;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 }
 
+TEST(Analyzer_Transformer, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_Transformer, fuse_statis) {
   AnalysisConfig cfg;
@@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Transformer, compare) {
+void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 41daff83c4..a4881afe58 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
 DEFINE_bool(record_benchmark, false,
             "Record benchmark after profiling the model");
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
+DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
 
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -67,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   LOG(INFO) << analysis_config->ToNativeConfig();
 }
 
+// Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
   EXPECT_GT(outputs.size(), 0UL);
@@ -108,6 +110,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
   }
 }
 
+// Compare result between a PaddleTensor and a ZeroCopyTensor
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<ZeroCopyTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0UL);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = VecReduceToInt(out.shape);
+    EXPECT_GT(size, 0UL);
+    int ref_size = 0;  // this is the number of elements not memory size
+    PaddlePlace place;
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = ref_out.data<float>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
+        }
+        break;
+      }
+      case PaddleDType::INT32: {
+        int32_t *pdata = static_cast<int32_t *>(out.data.data());
+        int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+    }
+  }
+}
+
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const PaddlePredictor::Config *config, bool use_analysis = true) {
   const auto *analysis_config =
@@ -205,61 +251,106 @@ void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
   }
 }
 
-void TestOneThreadPrediction(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-  auto predictor = CreateTestPredictor(config, use_analysis);
+void ConvertPaddleTensorToZeroCopyTensor(
+    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto input = inputs[i];
+    auto tensor = predictor->GetInputTensor(input.name);
+    tensor->Reshape(input.shape);
+    tensor->SetLoD({input.lod});
+    if (input.dtype == PaddleDType::INT64) {
+      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::FLOAT32) {
+      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::INT32) {
+      ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
+    } else {
+      LOG(ERROR) << "unsupported feed type " << input.dtype;
+    }
+  }
+}
 
-  // warmup run
-  LOG(INFO) << "Warm up run...";
-  {
-    Timer warmup_timer;
-    warmup_timer.tic();
+void PredictionWarmUp(PaddlePredictor *predictor,
+                      const std::vector<std::vector<PaddleTensor>> &inputs,
+                      std::vector<PaddleTensor> *outputs, int num_threads,
+                      int tid) {
+  int batch_size = FLAGS_batch_size;
+  LOG(INFO) << "Running thread " << tid << ", warm up run...";
+  if (FLAGS_zero_copy) {
+    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
+  }
+  Timer warmup_timer;
+  warmup_timer.tic();
+  if (!FLAGS_zero_copy) {
     predictor->Run(inputs[0], outputs, batch_size);
-    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-    if (FLAGS_profile) {
-      paddle::platform::ResetProfiler();
-    }
+  } else {
+    predictor->ZeroCopyRun();
+  }
+  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+  if (FLAGS_profile) {
+    paddle::platform::ResetProfiler();
   }
+}
 
-  LOG(INFO) << "Run " << num_times << " times...";
-  {
-    Timer run_timer;
-    run_timer.tic();
+void PredictionRun(PaddlePredictor *predictor,
+                   const std::vector<std::vector<PaddleTensor>> &inputs,
+                   std::vector<PaddleTensor> *outputs, int num_threads,
+                   int tid) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+  Timer run_timer;
+  double elapsed_time = 0;
 #ifdef WITH_GPERFTOOLS
-    ProfilerStart("paddle_inference.prof");
+  ProfilerStart("paddle_inference.prof");
 #endif
-    for (int i = 0; i < num_times; i++) {
-      for (size_t j = 0; j < inputs.size(); j++) {
-        predictor->Run(inputs[j], outputs, batch_size);
+  if (!FLAGS_zero_copy) {
+    run_timer.tic();
+    for (size_t i = 0; i < inputs.size(); i++) {
+      for (int j = 0; j < num_times; j++) {
+        predictor->Run(inputs[i], outputs, batch_size);
+      }
+    }
+    elapsed_time = run_timer.toc();
+  } else {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
+      run_timer.tic();
+      for (int j = 0; j < num_times; j++) {
+        predictor->ZeroCopyRun();
       }
+      elapsed_time += run_timer.toc();
     }
+  }
 #ifdef WITH_GPERFTOOLS
-    ProfilerStop();
+  ProfilerStop();
 #endif
 
-    double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
-    PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
-    if (FLAGS_record_benchmark) {
-      Benchmark benchmark;
-      benchmark.SetName(FLAGS_model_name);
-      benchmark.SetBatchSize(batch_size);
-      benchmark.SetLatency(latency);
-      benchmark.PersistToFile("benchmark_record.txt");
-    }
+  PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
+            inputs.size());
+  if (FLAGS_record_benchmark) {
+    Benchmark benchmark;
+    benchmark.SetName(FLAGS_model_name);
+    benchmark.SetBatchSize(batch_size);
+    benchmark.SetLatency(elapsed_time / num_times);
+    benchmark.PersistToFile("benchmark_record.txt");
   }
 }
 
+void TestOneThreadPrediction(
+    const PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+  auto predictor = CreateTestPredictor(config, use_analysis);
+  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
+}
+
 void TestMultiThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, int num_threads,
     bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
   std::vector<std::unique_ptr<PaddlePredictor>> predictors;
   predictors.emplace_back(CreateTestPredictor(config, use_analysis));
@@ -267,7 +358,6 @@ void TestMultiThreadPrediction(
     predictors.emplace_back(predictors.front()->Clone());
   }
 
-  size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
       // Each thread should have local inputs and outputs.
@@ -280,34 +370,8 @@ void TestMultiThreadPrediction(
             ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
-
-      // warmup run
-      LOG(INFO) << "Running thread " << tid << ", warm up run...";
-      {
-        Timer warmup_timer;
-        warmup_timer.tic();
-        predictor->Run(inputs[0], outputs, batch_size);
-        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-        if (FLAGS_profile) {
-          paddle::platform::ResetProfiler();
-        }
-      }
-
-      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
-      {
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          for (const auto &input : inputs) {
-            ASSERT_TRUE(predictor->Run(input, &outputs_tid));
-          }
-        }
-
-        auto time = timer.toc();
-        total_time += time;
-        PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
-                  inputs.size());
-      }
+      PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
+      PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
     });
   }
   for (int i = 0; i < num_threads; ++i) {
@@ -367,6 +431,31 @@ void CompareNativeAndAnalysis(
   CompareResult(analysis_outputs, native_outputs);
 }
 
+void CompareAnalysisAndZeroCopy(
+    PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    const std::vector<std::string> &outputs_name) {
+  int batch_size = FLAGS_batch_size;
+  // analysis
+  std::vector<PaddleTensor> analysis_outputs;
+  auto predictor = CreateTestPredictor(config, true);
+  predictor->Run(inputs[0], &analysis_outputs, batch_size);
+  // analysis + zero_copy
+  std::vector<ZeroCopyTensor> zerocopy_outputs;
+  reinterpret_cast<AnalysisConfig *>(config)->SwitchUseFeedFetchOps(false);
+  predictor = CreateTestPredictor(config, true);
+  ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
+  predictor->ZeroCopyRun();
+  for (size_t i = 0; i < outputs_name.size(); i++) {
+    ZeroCopyTensor zerocopy_output =
+        *predictor->GetOutputTensor(outputs_name[i]).get();
+    zerocopy_outputs.emplace_back(zerocopy_output);
+    LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
+  }
+  // compare
+  CompareResult(analysis_outputs, zerocopy_outputs);
+}
+
 template <typename T>
 std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
   std::stringstream ss;
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 6c5fe043ff..df7af71d9b 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,5 +1,5 @@
 include(ExternalProject)
-set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
 
@@ -30,19 +30,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
       ${EXTERNAL_PROJECT_NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${INSTALL_DIR}
-      URL                   ${URL}/${FILENAME}
+      DOWNLOAD_COMMAND      wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
+                            ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
       DOWNLOAD_DIR          ${INSTALL_DIR}
       DOWNLOAD_NO_PROGRESS  1
       CONFIGURE_COMMAND     ""
       BUILD_COMMAND         ""
       UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
+      INSTALL_COMMAND       ""
   )
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
-if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
-    inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
+if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
+  inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
 
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index c43eaf7f98..2104e4ac72 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,4 +1,2 @@
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_binary(visualizer SRCS visualizer.cc DEPS analysis
-    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc
deleted file mode 100644
index 7c0dd64dea..0000000000
--- a/paddle/fluid/inference/utils/visualizer.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/utils/visualizer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <fstream>
-#include <memory>
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include "paddle/fluid/platform/init.h"
-
-DEFINE_string(model_dir, "", "model directory");
-DEFINE_string(model_program_path, "", "model program path");
-DEFINE_string(model_params_path, "", "model params path");
-
-using paddle::inference::analysis::Argument;
-
-namespace paddle {
-namespace inference {
-namespace utils {
-
-void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
-
-bool Visualizer::Run() {
-  paddle::framework::InitDevices(false);
-  paddle::inference::analysis::Analyzer().Run(argument_);
-  return true;
-}
-
-}  // namespace utils
-}  // namespace inference
-}  // namespace paddle
-
-// Generate a dot file describing the structure of graph.
-// To use this tool, run command: ./visualizer [options...]
-// Options:
-//     --model_dir: the directory of model
-//     --model_program_path: the path of program
-//     --model_params_path: the path of params
-int main(int argc, char *argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  google::InitGoogleLogging(argv[0]);
-
-  paddle::inference::analysis::Argument argument;
-  argument.SetUseGPU(false);
-  argument.SetUseTensorRT(false);
-
-  if (FLAGS_model_dir.empty()) {
-    if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
-      LOG(ERROR) << "Please set model_dir"
-                    " or model_program_path and model_params_path";
-      return -1;
-    } else {
-      argument.SetModelProgramPath(FLAGS_model_program_path);
-      argument.SetModelParamsPath(FLAGS_model_params_path);
-    }
-  } else {
-    argument.SetModelDir(FLAGS_model_dir);
-  }
-
-  // Only 1 pass, default filename is 0_ir_origin.dot
-  // For more details, looking for paddle::inference::analysis::IRPassManager
-  argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
-
-  std::unique_ptr<paddle::framework::Scope> scope{
-      new paddle::framework::Scope()};
-  argument.SetScopeNotOwned(
-      const_cast<paddle::framework::Scope *>(scope.get()));
-
-  paddle::inference::utils::Visualizer visualizer;
-  visualizer.SetArgument(&argument);
-  visualizer.Run();
-
-  return 0;
-}
-
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(graph_to_program_pass);
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4b7b9064dc..ac77c3d2a5 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
+cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -61,4 +61,6 @@ nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocat
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 
-cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
+
+cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
new file mode 100644
index 0000000000..67905973ff
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+//! Run allocate test cases for different places
+void AllocateTestCases() {
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
+}
+
+TEST(Allocator, SpecifyGpuMemory) {
+#ifdef PADDLE_WITH_CUDA
+  // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
+  // FLAGS_reallocate_gpu_memory_in_mb
+  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
+  // 512 MB
+  FLAGS_initial_gpu_memory_in_mb = 512;
+  // 4 MB
+  FLAGS_reallocate_gpu_memory_in_mb = 4;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
similarity index 92%
rename from paddle/fluid/memory/allocation/allocator_facade_test.cc
rename to paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
index 802d79e15d..decdc62f13 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
@@ -19,6 +19,8 @@
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_int64(gpu_allocator_retry_time);
 #endif
 
@@ -26,13 +28,8 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
+//! Run allocate test cases for different places
+void AllocateTestCases() {
   auto &instance = AllocatorFacade::Instance();
   platform::Place place;
   size_t size = 1024;
@@ -82,6 +79,16 @@ TEST(allocator, allocator) {
 #endif
 }
 
+TEST(Allocator, Allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 1936f9d4cd..514ac7883a 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/legacy_allocator.h"
-
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -23,9 +22,11 @@
 #endif
 
 #include "glog/logging.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
 
@@ -36,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
@@ -152,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
                                     platform::GpuMinChunkSize(),
                                     platform::GpuMaxChunkSize());
 
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
+      VLOG(10) << "\n\nNOTE:\n"
+               << "You can set GFlags environment variable "
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
+               << "or 'FLAGS_initial_gpu_memory_in_mb' "
+               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
+               << FLAGS_initial_gpu_memory_in_mb
+               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
+               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
     }
   });
 
@@ -328,18 +337,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
 }  // namespace legacy
 
 namespace allocation {
-
 LegacyMemMonitor GPUMemMonitor;
 
 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
-  return new Allocation(ptr, size, place_);
+  auto *tmp_alloc = new Allocation(ptr, size, place_);
+  platform::MemEvenRecorder::Instance().PushMemRecord(
+      static_cast<void *>(tmp_alloc), place_, size);
+  return tmp_alloc;
 }
 
 void LegacyAllocator::Free(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
+  platform::MemEvenRecorder::Instance().PopMemRecord(
+      static_cast<void *>(allocation), place_);
   delete allocation;
 }
 
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index c725dba5e9..a555b6b299 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -9,3 +9,5 @@ endif(${WITH_GPU})
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 
 cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
+
+cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 26ef27c3ca..edd6ea4ade 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
+
+#include <algorithm>
+#include <utility>
+
 #include "glog/logging.h"
 
 DEFINE_bool(free_idle_memory, false,
@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
               "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
+             << ")";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
 
   // refill the pool if failure
   if (it == pool_.end()) {
-    it = RefillPool();
+    it = RefillPool(size);
     // if still failure, fail fatally
     if (it == pool_.end()) {
       return nullptr;
@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   return static_cast<MemoryBlock*>(p)->data();
 }
 
-BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
+    size_t request_bytes) {
+  size_t allocate_bytes = max_chunk_size_;
+  size_t index = 0;
+
 #ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
+      // Compute the allocation size for gpu for the first allocation.
+      allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
+    } else {
+      // Reallocation size
+      if (realloc_size_ == 0) {
+        realloc_size_ = platform::GpuReallocSize();
+      }
+      allocate_bytes = std::max(realloc_size_, request_bytes);
     }
   }
 #endif
 
-  // Allocate a new maximum sized block
-  size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  // Allocate a new block
+  void* p = system_allocator_->Alloc(&index, allocate_bytes);
 
   if (p == nullptr) return pool_.end();
 
@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
            << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     allocate_bytes, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += max_chunk_size_;
+  total_free_ += allocate_bytes;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
 
     VLOG(10) << "Return block " << block << " to fallback allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block->size(cache_);
     fallback_alloc_count_--;
 
     // If no fall allocation exists, return directly
@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
 
     VLOG(10) << "Return block " << block << " to base allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block->size(cache_);
 
     if (!shall_free_alloc()) return;
   }
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 3f86a51f0d..bdc8cca4b5 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -60,7 +60,7 @@ class BuddyAllocator {
   void* SystemAlloc(size_t size);
 
   /*! \brief If existing chunks are not suitable, refill pool */
-  PoolSet::iterator RefillPool();
+  PoolSet::iterator RefillPool(size_t request_bytes);
 
   /**
    *  \brief   Find the suitable chunk from existing pool and split
@@ -89,6 +89,8 @@ class BuddyAllocator {
   size_t min_chunk_size_;  // the minimum size of each chunk
   size_t max_chunk_size_;  // the maximum size of each chunk
 
+  size_t realloc_size_ = 0;  // the size of re-allocated chunk
+
  private:
   /**
    * \brief A list of free allocation
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
new file mode 100644
index 0000000000..1edc9f2034
--- /dev/null
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+
+#include <memory>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+constexpr static int test_gpu_id = 0;
+
+void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+  bool freed = false;
+  size_t used_bytes = allocator->Used();
+
+  if (size_bytes > 0) {
+    void* p = allocator->Alloc(size_bytes);
+
+    EXPECT_NE(p, nullptr);
+#ifdef PADDLE_WITH_CUDA
+    if (size_bytes < platform::GpuMaxChunkSize()) {
+#else
+    if (size_bytes < platform::CpuMaxChunkSize()) {
+#endif
+      // Not allocate from SystemAllocator
+      EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
+    } else {
+      // Allocate from SystemAllocator doesn't count in Used()
+      EXPECT_EQ(allocator->Used(), used_bytes);
+    }
+
+    int* intp = static_cast<int*>(p);
+    std::shared_ptr<int> ptr(intp, [&](void* p) {
+      allocator->Free(intp);
+      freed = true;
+    });
+  } else {
+    freed = true;
+  }
+
+  EXPECT_EQ(used_bytes, allocator->Used());
+  EXPECT_TRUE(freed);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(BuddyAllocator, GpuFraction) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+
+TEST(BuddyAllocator, InitRealloc) {
+  FLAGS_initial_gpu_memory_in_mb = 100;
+  FLAGS_reallocate_gpu_memory_in_mb = 50;
+
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  // Less then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Less then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 40 << 20);
+  // Greater then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Greater then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+
+TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
+  FLAGS_initial_gpu_memory_in_mb = 5;
+  FLAGS_reallocate_gpu_memory_in_mb = 10;
+
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  // Less then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 1 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less then initial size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
+  // MB)
+  TestBuddyAllocator(&buddy_allocator, 7 << 20);
+  // Less then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 8 << 20);
+  // Greater then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+#endif
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 197d1c2f21..41d79c5beb 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -32,6 +32,9 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING)
-        << "Cannot malloc " << size / 1024.0 / 1024.0
-        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
-           "environment variable to a lower value. Current value is "
-        << FLAGS_fraction_of_gpu_memory_to_use;
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB GPU memory. Please shrink "
+                    "FLAGS_fraction_of_gpu_memory_to_use or "
+                    "FLAGS_initial_gpu_memory_in_mb or "
+                    "FLAGS_reallocate_gpu_memory_in_mb"
+                    "environment variable to a lower value. "
+                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
+                 << FLAGS_fraction_of_gpu_memory_to_use
+                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
+                 << FLAGS_initial_gpu_memory_in_mb
+                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
+                 << FLAGS_reallocate_gpu_memory_in_mb;
     return nullptr;
   }
 }
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index a3f2a69aef..651c5e6e75 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE)
     SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
 endif()
 
-register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
 
-# warpctc_op needs cudnn 7 above
 if (WITH_GPU)
+    # warpctc_op needs cudnn 7 above
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()
@@ -58,6 +58,10 @@ if (WITH_GPU)
         op_library(conv_fusion_op)
         file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
     endif()
+    if (NOT WIN32)
+        op_library(sync_batch_norm_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    endif()
 else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 2feb8e4c47..c87e4b22b3 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 #ifdef PADDLE_WITH_CUDA
@@ -74,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_CUDA
-  auto it1 = oper.Attrs().find("use_cudnn");
-  if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kCUDNN;
-  }
-#endif
+// FIXME(liuwei1031) temporarily disable the code to unblock users
+// TODO(liuwei1031) figure out the reason behind
+// https://github.com/PaddlePaddle/Paddle/issues/16096
+// and re-enable this in the future
+// #ifdef PADDLE_WITH_CUDA
+//   auto it1 = oper.Attrs().find("use_cudnn");
+//   if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
+//     library = framework::LibraryType::kCUDNN;
+//   }
+// #endif
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");
   if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
@@ -186,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 UNUSED constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.
 
+Please make sure legal input, when input a negative value closed to zero,
+you should add a small epsilon(1e-12) to avoid negative number caused by numerical errors.
+
 $out = \sqrt{x}$
 
 )DOC";
@@ -269,6 +278,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
 
 )DOC";
 
+class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of acos operator");
+    AddOutput("Out", "Output of acos operator");
+    AddComment(R"DOC(
+Arccosine Activation Operator.
+
+$$out = \cos^{-1}(x)$$
+
+)DOC");
+  }
+};
+
+class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of asin operator");
+    AddOutput("Out", "Output of asin operator");
+    AddComment(R"DOC(
+Arcsine Activation Operator.
+
+$$out = \sin^{-1}(x)$$
+
+)DOC");
+  }
+};
+
+class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of atan operator");
+    AddOutput("Out", "Output of atan operator");
+    AddComment(R"DOC(
+Arctanh Activation Operator.
+
+$$out = \tanh^{-1}(x)$$
+
+)DOC");
+  }
+};
+
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -543,7 +594,10 @@ namespace ops = paddle::operators;
   __macro(SoftShrink, softshrink);   \
   __macro(Abs, abs);                 \
   __macro(Cos, cos);                 \
+  __macro(Acos, acos);               \
   __macro(Sin, sin);                 \
+  __macro(Asin, asin);               \
+  __macro(Atan, atan);               \
   __macro(Round, round);             \
   __macro(Log, log);                 \
   __macro(Square, square);           \
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 1f5ae7fb5c..ff7e623f6f 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -39,9 +39,8 @@ namespace operators {
    Please refer to the layer_helper.py and get the details.
  */
 static std::unordered_set<std::string> InplaceOpSet = {
-    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
-    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
-};
+    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",        "ceil",
+    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
 
 static bool IsInplace(const std::string& op) {
   bool inplace = InplaceOpSet.count(op);
@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct Acos {
+  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
+};
+
+template <>
+struct Acos<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(acos(static_cast<float>(val)));
+  }
+};
+
+// Acos(x) = acos(x)
+template <typename T>
+struct AcosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Acos<T>());
+  }
+};
+
+// acos'(x) = -1/sqrt(1-x^2)
+template <typename T>
+struct AcosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+};
+
+template <typename T>
+struct Asin {
+  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
+};
+
+template <>
+struct Asin<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(asin(static_cast<float>(val)));
+  }
+};
+
+// Asin(x) = asin(x)
+template <typename T>
+struct AsinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Asin<T>());
+  }
+};
+
+// asin'(x) = 1/sqrt(1-x^2)
+template <typename T>
+struct AsinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+};
+
+template <typename T>
+struct Atan {
+  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
+};
+
+template <>
+struct Atan<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(atan(static_cast<float>(val)));
+  }
+};
+
+// Atan(x) = atan(x)
+template <typename T>
+struct AtanFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Atan<T>());
+  }
+};
+
+// atan'(x) =  1 / (1 + x^2)
+template <typename T>
+struct AtanGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
+  }
+};
+
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(relu, ReluFunctor, ReluGradFunctor);                       \
   __macro(gelu, GeluFunctor, GeluGradFunctor);                       \
   __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
+  __macro(atan, AtanFunctor, AtanGradFunctor);                       \
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
   __macro(abs, AbsFunctor, AbsGradFunctor);                          \
   __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
   __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
   __macro(cos, CosFunctor, CosGradFunctor);                          \
+  __macro(acos, AcosFunctor, AcosGradFunctor);                       \
   __macro(sin, SinFunctor, SinGradFunctor);                          \
+  __macro(asin, AsinFunctor, AsinGradFunctor);                       \
   __macro(round, RoundFunctor, ZeroGradFunctor);                     \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
   __macro(log, LogFunctor, LogGradFunctor);                          \
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
index 8944a74967..268a5b894a 100644
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel {
                    "Input(Bias) of AffineChannelOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of AffineChannelOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto scale_dims = ctx->GetInputDim("Scale");
+    auto b_dims = ctx->GetInputDim("Bias");
+    const framework::DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+
+    const int64_t C = (data_layout == framework::DataLayout::kNCHW
+                           ? x_dims[1]
+                           : x_dims[x_dims.size() - 1]);
+
+    PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale_dims[0], C);
+    PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(b_dims[0], C);
+
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", "Out");
   }
@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("affine_channel_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Scale", Input("Scale"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 template <typename T>
 using EigenArrayMap =
     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
@@ -244,8 +281,7 @@ namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
-                  ops::AffineChannelOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
 REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
 
 REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index feac412538..c0ad959309 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "paddle/fluid/framework/data_layout.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -22,147 +24,150 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class BatchNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Mean"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
-
-    // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
-    PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
-                      "Mean and MeanOut should share the same memory");
-    PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0],
-                      ctx->Outputs("VarianceOut")[0],
-                      "Variance and VarianceOut should share the same memory");
-
-    const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "Input X must have 2 to 5 dimensions.");
-
-    const int64_t C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
-
-    ctx->SetOutputDim("Y", x_dims);
-    ctx->SetOutputDim("MeanOut", {C});
-    ctx->SetOutputDim("VarianceOut", {C});
-    ctx->SetOutputDim("SavedMean", {C});
-    ctx->SetOutputDim("SavedVariance", {C});
-    ctx->ShareLoD("X", "Y");
+void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                 "Input(Scale) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                 "Input(Bias) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                 "Input(Mean) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                 "Input(Variance) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                 "Output(Y) of ConvOp should not be null.");
+  bool is_test = ctx->Attrs().Get<bool>("is_test");
+  if (!is_test) {
+    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"),
+                   "Output(MeanOut) of ConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"),
+                   "Output(VarianceOut) of ConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"),
+                   "Output(SavedMean) of ConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"),
+                   "Output(SavedVariance) of ConvOp should not be null.");
   }
 
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("X")->type();
-    // By default, the type of the scale, bias, mean,
-    // and var tensors should both be float. (For float or float16 input tensor)
-    // or double (For double input tensor).
-    auto bn_param_type = framework::proto::VarType::FP32;
-    if (input_data_type == framework::proto::VarType::FP64) {
-      bn_param_type = framework::proto::VarType::FP64;
-    }
-    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
-                      "Scale input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
-                      "Bias input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
-                      "Mean input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
-                      "Variance input should be of float type");
-
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
+  PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
+                    "Mean and MeanOut should share the same memory");
+  PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
+                    "Variance and VarianceOut should share the same memory");
+
+  const auto x_dims = ctx->GetInputDim("X");
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                 "Input X must have 2 to 5 dimensions.");
+
+  const int64_t C =
+      (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                        : x_dims[x_dims.size() - 1]);
+
+  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
+  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
+
+  ctx->SetOutputDim("Y", x_dims);
+  ctx->SetOutputDim("MeanOut", {C});
+  ctx->SetOutputDim("VarianceOut", {C});
+  ctx->SetOutputDim("SavedMean", {C});
+  ctx->SetOutputDim("SavedVariance", {C});
+  ctx->ShareLoD("X", "Y");
+}
+
+framework::OpKernelType BatchNormOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  auto input_data_type = ctx.Input<Tensor>("X")->type();
+  // By default, the type of the scale, bias, mean,
+  // and var tensors should both be float. (For float or float16 input tensor)
+  // or double (For double input tensor).
+  auto bn_param_type = framework::proto::VarType::FP32;
+  if (input_data_type == framework::proto::VarType::FP64) {
+    bn_param_type = framework::proto::VarType::FP64;
+  }
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
+                    "Scale input should be of float type");
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
+                    "Bias input should be of float type");
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
+                    "Mean input should be of float type");
+  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
+                    "Variance input should be of float type");
+
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library);
+  if (library == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
-};
+#endif
 
-class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddAttr<float>("momentum", "").SetDefault(0.9);
-    AddAttr<float>("epsilon", "")
-        .SetDefault(1e-5)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
-                         "'epsilon' should be between 0.0 and 0.001.");
-        });
-    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
-    AddInput("X", "The input tensor");
-    AddInput("Scale",
-             "Scale is a 1-dimensional tensor of size C "
-             "that is applied to the output");
-    AddInput("Bias",
-             "Bias is a 1-dimensional tensor of size C "
-             "that is applied to the output");
-    AddInput("Mean",
-             "The global mean (for training) or "
-             "estimated mean (for testing)");
-    AddInput("Variance",
-             "The global variance (for training) "
-             "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization");
-    AddOutput("MeanOut",
-              "Share memory with Mean. "
-              "Store the global mean when training");
-    AddOutput("VarianceOut",
-              "Share memory with Variance. "
-              "Store the global Variance when training");
-    AddOutput("SavedMean",
-              "Mean of the current mini batch, "
-              "will apply to output when training")
-        .AsIntermediate();
-    AddOutput("SavedVariance",
-              "Variance of the current mini batch, "
-              "will apply to output when training")
-        .AsIntermediate();
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>("fuse_with_relu",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>("use_global_stats",
-                  "(bool, default false) Whether to use global mean and "
-                  "variance. In inference or test mode, set use_global_stats "
-                  "to true or is_test true. the behavior is equivalent. "
-                  "In train mode, when setting use_global_stats True, the "
-                  "global mean and variance are also used during train time, "
-                  "the BN acts as scaling and shiffting.")
-        .SetDefault(false);
-    AddComment(R"DOC(
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
+}
+
+void BatchNormOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
+  AddAttr<float>("momentum", "").SetDefault(0.9);
+  AddAttr<float>("epsilon", "")
+      .SetDefault(1e-5)
+      .AddCustomChecker([](const float &epsilon) {
+        PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                       "'epsilon' should be between 0.0 and 0.001.");
+      });
+  AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
+  AddInput("X", "The input tensor");
+  AddInput("Scale",
+           "Scale is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Bias",
+           "Bias is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Mean",
+           "The global mean (for training) or "
+           "estimated mean (for testing)");
+  AddInput("Variance",
+           "The global variance (for training) "
+           "or estimated Variance (for testing)");
+  AddOutput("Y", "result after normalization");
+  AddOutput("MeanOut",
+            "Share memory with Mean. "
+            "Store the global mean when training");
+  AddOutput("VarianceOut",
+            "Share memory with Variance. "
+            "Store the global Variance when training");
+  AddOutput("SavedMean",
+            "Mean of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("SavedVariance",
+            "Variance of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_with_relu",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("use_global_stats",
+                "(bool, default false) Whether to use global mean and "
+                "variance. In inference or test mode, set use_global_stats "
+                "to true or is_test true. the behavior is equivalent. "
+                "In train mode, when setting use_global_stats True, the "
+                "global mean and variance are also used during train time, "
+                "the BN acts as scaling and shiffting.")
+      .SetDefault(false);
+  AddComment(R"DOC(
 Batch Normalization.
 
 Batch Norm has been implemented as discussed in the paper:
@@ -173,17 +178,7 @@ The required data format for this layer is one of the following:
 2. NCHW `[batch, in_channels, in_height, in_width]`
 
 )DOC");
-  }
-};
-
-class BatchNormOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
-  }
-};
+}
 
 template <typename T>
 class BatchNormKernel<platform::CPUDeviceContext, T>
@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
   }
 };
 
-class BatchNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // check input
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
-                   "Input(SavedMean) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
-                   "Input(SavedVariance) should not be null");
-
-    // check output
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
-                     "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
-                     "null at same time");
-    }
-    const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
-    if (use_global_stats) {
-      PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
-                     "Using global stats during training is not supported "
-                     "in gradient op kernel of batch_norm_mkldnn_op now.");
-    }
+void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
+  // check input
+  PADDLE_ENFORCE(ctx->HasInput("X"));
+  PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                 "Input(Y@GRAD) should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
+                 "Input(SavedMean) should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
+                 "Input(SavedVariance) should not be null");
+
+  // check output
+  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
+  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                   "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
+                   "null at same time");
+  }
+  const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
+  if (use_global_stats) {
+    PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
+                   "Using global stats during training is not supported "
+                   "in gradient op kernel of batch_norm_mkldnn_op now.");
+  }
 
-    const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
+  const auto x_dims = ctx->GetInputDim("X");
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                                  : x_dims[x_dims.size() - 1]);
 
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
-      ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
-    }
+  ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
   }
+}
 
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-    if (var == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
-    } else if (var->IsType<LoDTensor>()) {
-      t = &var->Get<LoDTensor>();
-    }
-    if (t == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
+framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+  if (var == nullptr) {
+    PADDLE_THROW("can't find Y@GRAD");
+  }
+  const Tensor *t = nullptr;
+  if (var->IsType<Tensor>()) {
+    t = &var->Get<Tensor>();
+  } else if (var->IsType<LoDTensor>()) {
+    t = &var->Get<LoDTensor>();
+  }
+  if (t == nullptr) {
+    PADDLE_THROW("can't find Y@GRAD");
+  }
 
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 
 #ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
+  if (library == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
 #endif
 
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(), layout, library);
-  }
-};
+  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
+                                 layout, library);
+}
 
 template <typename T>
 class BatchNormGradKernel<platform::CPUDeviceContext, T>
@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
   }
 };
 
-class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op = new framework::OpDesc();
-    op->SetType("batch_norm_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-
-    op->SetInput("Scale", Input("Scale"));
-    op->SetInput("Bias", Input("Bias"));
-    op->SetInput("SavedMean", Output("SavedMean"));
-    op->SetInput("SavedVariance", Output("SavedVariance"));
-
-    // used when setting use_global_stats True during training
-    if (boost::get<bool>(GetAttr("use_global_stats"))) {
-      op->SetInput("Mean", Output("MeanOut"));
-      op->SetInput("Variance", Output("VarianceOut"));
-    }
+std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
+  auto *op = new framework::OpDesc();
+  op->SetType(GradOpType());
+  op->SetInput("X", Input("X"));
+  op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+
+  op->SetInput("Scale", Input("Scale"));
+  op->SetInput("Bias", Input("Bias"));
+  op->SetInput("SavedMean", Output("SavedMean"));
+  op->SetInput("SavedVariance", Output("SavedVariance"));
+
+  // used when setting use_global_stats True during training
+  if (boost::get<bool>(GetAttr("use_global_stats"))) {
+    op->SetInput("Mean", Output("MeanOut"));
+    op->SetInput("Variance", Output("VarianceOut"));
+  }
 
-    op->SetAttrMap(Attrs());
+  op->SetAttrMap(Attrs());
 
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+  op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+  op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+  op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
 
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
+  return std::unique_ptr<framework::OpDesc>(op);
+}
 
 class BatchNormInplaceInToOut : public framework::InplaceInToOut {
  public:
@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker,
-                  ops::BatchNormInplaceInToOut);
-REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
-                  ops::BatchNormGradInplaceInToOut);
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker)
+// ops::BatchNormInplaceInToOut);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp)
+//                  ops::BatchNormGradInplaceInToOut);
 
 REGISTER_OP_CPU_KERNEL(
     batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index 1c45746a92..36d297ec55 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
-void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
-                  int *N, int *C, int *H, int *W, int *D) {
-  *N = dims[0];
-  if (dims.size() == 2) {
-    *C = dims[1];
-    *H = 1;
-    *W = 1;
-    *D = 1;
-  } else {
-    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
-    *W = dims.size() > 3
-             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
-             : 1;
-    *D = dims.size() > 4
-             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
-             : 1;
-  }
-}
-
 template <typename T>
 class BatchNormKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
   }
 };
 
-template <typename T, framework::DataLayout layout>
-static __global__ void KeBNBackwardData(const T *dy,
-                                        const BatchNormParamType<T> *scale,
-                                        const BatchNormParamType<T> *variance,
-                                        const double epsilon, const int C,
-                                        const int HxW, const int num, T *dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
-    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
-    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
-                           scale[c] * inv_var);
-  }
-}
-
 template <typename T, int BlockDim, framework::DataLayout layout>
 static __global__ void KeBNBackwardScaleBias(
     const T *dy, const T *x, const BatchNormParamType<T> *mean,
@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias(
   }
 }
 
+template <typename T, framework::DataLayout layout>
+static __global__ void KeBNBackwardData(const T *dy,
+                                        const BatchNormParamType<T> *scale,
+                                        const BatchNormParamType<T> *variance,
+                                        const double epsilon, const int C,
+                                        const int HxW, const int num, T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
+    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
+                           scale[c] * inv_var);
+  }
+}
+
 template <typename T>
 class BatchNormGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
           KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
               grid2, block, 0, dev_ctx.stream()>>>(
               d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
               d_bias->data<BatchNormParamType<T>>());
         }
       } else {
@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
               running_var_data, epsilon, C, H * W, num, d_x->data<T>());
         }
         if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
+          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
               grid2, block, 0, dev_ctx.stream()>>>(
               d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
               d_bias->data<BatchNormParamType<T>>());
         }
       }
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index 5e3d630d68..6e89d73eb2 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -35,17 +38,84 @@ template <typename T>
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
 
+class BatchNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override;
+};
+
+class BatchNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override;
+};
+
+class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override;
+
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+
+class BatchNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
+
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
+  void Compute(const framework::ExecutionContext &ctx) const override;
 };
 
 template <typename DeviceContext, typename T>
 class BatchNormGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
+  void Compute(const framework::ExecutionContext &ctx) const override;
 };
 
+inline void ExtractNCWHD(const framework::DDim &dims,
+                         const DataLayout &data_layout, int *N, int *C, int *H,
+                         int *W, int *D) {
+  *N = dims[0];
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
+             : 1;
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index cf78c83297..4cef49280d 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for
 each source sentence by walking back along the LoDTensorArray Input(ids)
 whose lods can be used to restore the path in the beam search tree.
 
-The Output(SentenceIds) and Output(SentenceScores) separately contain the 
-generated id sequences and the corresponding scores. The shapes and lods of the 
-two LodTensor are same. The lod level is 2 and the two levels separately 
-indicate how many hypotheses each source sentence has and how many ids each 
+The Output(SentenceIds) and Output(SentenceScores) separately contain the
+generated id sequences and the corresponding scores. The shapes and lods of the
+two LodTensor are same. The lod level is 2 and the two levels separately
+indicate how many hypotheses each source sentence has and how many ids each
 hypothesis has.
 )DOC");
   }
@@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
 
 class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    for (auto& o : op_desc.Output("SentenceIds")) {
-      auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
-      sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    for (auto& o : ctx->Output("SentenceIds")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
-    for (auto& o : op_desc.Output("SentenceScores")) {
-      auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
-      sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
+    for (auto& o : ctx->Output("SentenceScores")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index fa6b09b4e7..a6aa35e056 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
 
     AddComment(R"DOC(
-This operator does the search in beams for one time step. 
+This operator does the search in beams for one time step.
 Specifically, it selects the top-K candidate word ids of current step from
 Input(ids) according to their Input(scores) for all source sentences,
 where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
@@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel {
 
 class BeamSearchInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o : op_desc.Output("selected_ids")) {
-      auto &selected_ids = block->FindRecursiveOrCreateVar(o);
-      selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o : ctx->Output("selected_ids")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
-    for (auto &o : op_desc.Output("selected_scores")) {
-      auto &selected_scores = block->FindRecursiveOrCreateVar(o);
-      selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
+    for (auto &o : ctx->Output("selected_scores")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 194f9cf503..6e3c9f2864 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -50,9 +50,19 @@ class ConcatOp : public framework::OperatorWithKernel {
         if (j == axis) {
           out_dims[axis] += ins[i][j];
         } else {
-          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                            "Input tensors should have the same "
-                            "elements except the specify axis.");
+          if (ctx->IsRuntime()) {
+            // check all shape in run time
+            PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                              "Input tensors should have the same "
+                              "elements except the specify axis.");
+          } else {
+            // not check -1 with other in compile time
+            if (out_dims[j] > 0 && ins[i][j] > 0) {
+              PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                                "Input tensors should have the same "
+                                "elements except the specify axis.");
+            }
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index b614e9b035..7aa1c44eaa 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,4 +1,5 @@
 include(operators)
 register_operators(DEPS naive_executor)
+cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index 1a157688f3..fa77f97419 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -93,11 +93,9 @@ execution.
 
 class GetPlacesInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o_name : op_desc.Output("Out")) {
-      block->FindRecursiveOrCreateVar(o_name).SetType(
-          framework::proto::VarType::PLACE_LIST);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o_name : ctx->Output("Out")) {
+      ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST);
     }
   }
 };
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
index fa18ade323..45f18ac925 100644
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
 
 class WriteToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto x_name = ctx->Input("X")[0];
+    auto out_name = ctx->Output("Out")[0];
     VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
-    auto *x = block->FindVarRecursive(x_name);
-    if (x != nullptr) {
-      out.SetDataType(x->GetDataType());
+    ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY);
+    if (ctx->HasVar(x_name)) {
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
     }
   }
 };
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 0360cf5273..deb8ec3bb2 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
@@ -26,14 +27,6 @@ namespace operators {
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
-static constexpr char kStepBlock[] = "sub_block";
-static constexpr char kCondition[] = "Condition";
-static constexpr char kStepScopes[] = "StepScopes";
-static constexpr char kX[] = "X";
-static constexpr char kXGRAD[] = "X@GRAD";
-static constexpr char kOutputs[] = "Out";
-static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-
 namespace {  // NOLINT
 static std::string GetSkipEagerDeletionVarsDebugString(
     const std::vector<std::string> &vars) {
@@ -372,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
 
 class WhileGradOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto p_names = op_desc.Input(kX);
-    auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto p_names = ctx->Input(kX);
+    auto pg_ig_names = ctx->Output(framework::GradVarName(kX));
 
     for (size_t i = 0; i < p_names.size(); ++i) {
-      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
-      auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
-      if (g_var != nullptr) {  // Gradient could be @EMPTY@
+      if (ctx->HasVar(pg_ig_names[i])) {
         VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-                << " type: " << p_var.GetType();
-        g_var->SetType(p_var.GetType());
-        g_var->SetDataType(p_var.GetDataType());
+                << " type: " << ctx->GetType(p_names[i]);
+        ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i]));
+        ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i]));
       }
     }
   }
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
new file mode 100644
index 0000000000..2cbd94a061
--- /dev/null
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -0,0 +1,291 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace operators {
+
+// OpVariant is a wrapper class of OpDesc and OperatorBase
+// So that API would be the same.
+class OpVariant {
+  struct InputsVisitor
+      : public boost::static_visitor<const framework::VariableNameMap *> {
+    template <typename OpType>
+    const framework::VariableNameMap *operator()(const OpType *op) const {
+      return &(op->Inputs());
+    }
+  };
+
+  struct OutputsVisitor
+      : public boost::static_visitor<const framework::VariableNameMap *> {
+    template <typename OpType>
+    const framework::VariableNameMap *operator()(const OpType *op) const {
+      return &(op->Outputs());
+    }
+  };
+
+  struct AttributeMapVisitor
+      : public boost::static_visitor<const framework::AttributeMap *> {
+    const framework::AttributeMap *operator()(
+        const framework::OpDesc *op) const {
+      return &(op->GetAttrMap());
+    }
+
+    const framework::AttributeMap *operator()(
+        const framework::OperatorBase *op) const {
+      return &(op->Attrs());
+    }
+  };
+
+  struct RawPointerVisitor : public boost::static_visitor<const void *> {
+    template <typename OpType>
+    const void *operator()(const OpType *op) const {
+      return op;
+    }
+  };
+
+ public:
+  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
+
+  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
+
+  const framework::VariableNameMap &Inputs() const {
+    return *boost::apply_visitor(InputsVisitor(), op_);
+  }
+
+  const framework::VariableNameMap &Outputs() const {
+    return *boost::apply_visitor(OutputsVisitor(), op_);
+  }
+
+  const framework::AttributeMap &Attrs() const {
+    return *boost::apply_visitor(AttributeMapVisitor(), op_);
+  }
+
+  template <typename AttrType>
+  const AttrType &Attr(const std::string &name) const {
+    auto &attrs = Attrs();
+    auto it = attrs.find(name);
+    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
+    return boost::get<AttrType>(it->second);
+  }
+
+  bool operator==(const OpVariant &other) const {
+    return RawPointer() == other.RawPointer();
+  }
+
+  const void *RawPointer() const {
+    return boost::apply_visitor(RawPointerVisitor(), op_);
+  }
+
+  int which() const { return static_cast<int>(op_.which()); }
+
+  struct Hasher {
+    size_t operator()(const OpVariant &op) const {
+      return reinterpret_cast<size_t>(op.RawPointer());
+    }
+  };
+
+ private:
+  const boost::variant<const framework::OperatorBase *,
+                       const framework::OpDesc *>
+      op_;
+};
+
+static std::string GetDebugString(const std::vector<std::string> &names) {
+  if (names.empty()) return "";
+  std::string ret = names[0];
+  for (size_t i = 1; i < names.size(); ++i) {
+    ret += (" " + names[i]);
+  }
+  return ret;
+}
+
+// Set skip variables of while_op and while_grad_op
+// These variables should be skipped when eager deletion enables.
+// It is because:
+//  1. while_grad_op needs some variables defined in while_op.
+//  2. while_grad_op needs variables from the previous time step.
+static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  VLOG(2) << "Prepare to skip " << attr.size()
+          << " var(s): " << GetDebugString(attr);
+  attrs[kSkipEagerDeletionVars] = std::move(attr);
+}
+
+// Check whether the forward while_op and while_grad_op match
+// The program may have many while_ops.
+static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
+                                           const OpVariant &grad_op) {
+  return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
+         fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
+}
+
+// Test whether the variable is skippable in forward while_op
+// The variable is skippable in while_op when the variable used in while_grad
+// is not from grad_block.
+static bool IsSkippableVar(const std::string &name,
+                           framework::BlockDesc *grad_block) {
+  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
+}
+
+static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
+                                            const OpVariant &bwd_op) {
+  auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
+
+  // Find all skippable variables in forward while_op
+  std::unordered_set<std::string> forward_skip_vars;
+  for (auto *op_desc : grad_block->AllOps()) {
+    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
+      if (IsSkippableVar(in_arg_name, grad_block)) {
+        forward_skip_vars.insert(in_arg_name);
+      }
+    }
+
+    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
+      if (IsSkippableVar(out_arg_name, grad_block)) {
+        forward_skip_vars.insert(out_arg_name);
+      }
+    }
+  }
+
+  SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
+                                               forward_skip_vars.end()));
+
+  // Find all skippable variables in while_grad_op
+  // The skipped variables are those which would be used across time steps.
+  auto &fwd_input = fwd_op.Inputs().at(kX);
+  auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
+  PADDLE_ENFORCE_EQ(
+      fwd_input.size(), in_grads.size(),
+      "Backward input gradient number does not match forward input number.");
+
+  std::unordered_set<std::string> backward_skip_vars;
+  for (size_t i = 0; i < in_grads.size(); ++i) {
+    if (in_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    backward_skip_vars.insert(in_grads[i]);
+    backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
+  }
+
+  SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
+                                               backward_skip_vars.end()));
+}
+
+// Find all while_ops and while_grad_ops in the graph or program
+// The while_grad_op and while_op may located in different blocks
+// So we should traverse all blocks in the program and find them out.
+static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops,
+                                       std::vector<OpVariant> *while_grad_ops) {
+  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
+
+  if (while_ops->empty()) return;
+
+  const auto *program =
+      while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program();
+  for (size_t i = 1; i < program->Size(); ++i) {
+    auto &block = program->Block(i);
+    for (size_t j = 0; j < block.OpSize(); ++j) {
+      auto *op = block.Op(j);
+      if (op->Type() == "while") {
+        while_ops->emplace_back(op);
+      } else if (op->Type() == "while_grad") {
+        while_grad_ops->emplace_back(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
+                    "There are extra while_grad ops in the graph or program");
+}
+
+static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
+    std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) {
+  FindAllWhileAndWhileGradOp(while_ops, while_grad_ops);
+
+  VLOG(2) << "Found while op num: " << while_ops->size()
+          << ", while grad op num: " << while_grad_ops->size();
+
+  if (while_grad_ops->empty()) {
+    return;
+  }
+
+  std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
+      while_ops->begin(), while_ops->end());
+
+  for (auto &bwd_op : *while_grad_ops) {
+    const OpVariant *matched_fwd_op = nullptr;
+    for (auto &fwd_op : while_op_set) {
+      if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
+        PADDLE_ENFORCE(matched_fwd_op == nullptr,
+                       "Found multiple matched while ops");
+        matched_fwd_op = &fwd_op;
+      }
+    }
+    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
+                            "Cannot find matched forward while op.");
+    ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
+    while_op_set.erase(*matched_fwd_op);
+  }
+}
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
+  // If block_id is not 0, returns
+  // This is because all while_ops and while_grad_ops in the whole program
+  // would be processed when block_id is 0 (i.e. when Executor::Run() or
+  // ParallelExecutor constructs).
+
+  // What's more, all while_ops and while_grad_ops must be processed when
+  // block_id is zero. If not, while_op may run first and erase variables
+  // used in while_grad_op, and in this moment, while_grad_ops may be not
+  // constructed yet.
+  if (block_id != 0) return;
+
+  std::vector<OpVariant> fwd_ops, bwd_ops;
+  for (auto &op : all_ops) {
+    if (op->Type() == "while") {
+      fwd_ops.emplace_back(op.get());
+    } else if (op->Type() == "while_grad") {
+      bwd_ops.emplace_back(op.get());
+    }
+  }
+  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
+}
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    const std::vector<framework::OperatorBase *> &while_ops,
+    const std::vector<framework::OperatorBase *> &while_grad_ops) {
+  std::vector<OpVariant> fwd_ops, bwd_ops;
+  fwd_ops.reserve(while_ops.size());
+  for (auto *op : while_ops) {
+    fwd_ops.emplace_back(op);
+  }
+
+  bwd_ops.reserve(while_grad_ops.size());
+  for (auto *op : while_grad_ops) {
+    bwd_ops.emplace_back(op);
+  }
+
+  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
new file mode 100644
index 0000000000..456ba8642b
--- /dev/null
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace operators {
+
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";
+static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    const std::vector<framework::OperatorBase *> &while_ops,
+    const std::vector<framework::OperatorBase *> &while_grad_ops);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index ca6bc4df0f..c6121d00da 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/conv_op.h"
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index c994c6f642..baa39c0f99 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_transpose_op.h"
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
                                  ctx.GetPlace(), layout_, library_);
 }
 
+class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType(ForwardOp().Type() + "_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+    if (ForwardOp().Inputs().count("Bias") > 0) {
+      op->SetInput("Bias", Input("Bias"));
+      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    }
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -352,7 +375,7 @@ namespace ops = paddle::operators;
 // conv2d_transpose
 REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
                   ops::Conv2DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
@@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL(
 // conv3d_transpose
 REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
                   ops::Conv3DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
@@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL(
 // depthwise conv2d_transpose
 REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
                   ops::Conv2DTransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvTransposeGradOpDescMaker);
 REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 8f3644039f..30ec74d844 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
               "Norm of the second input, reduced along the 1st "
               "dimension.")
         .AsIntermediate();
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
+        .SetDefault(true);
 
     AddComment(R"DOC(
 **Cosine Similarity Operator**
diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h
index 76cfc68051..0b4e3f7746 100644
--- a/paddle/fluid/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
@@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     // get Tensor
-    auto* in_x = context.Input<Tensor>("X");
+    auto* in_x = context.Input<framework::LoDTensor>("X");
     auto* in_y = context.Input<Tensor>("Y");
-    auto* out_z = context.Output<Tensor>("Out");
+    auto* out_z = context.Output<framework::LoDTensor>("Out");
     auto* out_x_norm = context.Output<Tensor>("XNorm");
     auto* out_y_norm = context.Output<Tensor>("YNorm");
-    out_z->mutable_data<T>(context.GetPlace());
-    out_x_norm->mutable_data<T>(context.GetPlace());
-    out_y_norm->mutable_data<T>(context.GetPlace());
 
     int rows_x = in_x->dims()[0];
     int rows_y = in_y->dims()[0];
+    out_z->Resize({rows_x, 1});
+    out_x_norm->Resize({rows_x, 1});
+    out_y_norm->Resize({rows_y, 1});
+    out_z->mutable_data<T>(context.GetPlace());
+    out_x_norm->mutable_data<T>(context.GetPlace());
+    out_y_norm->mutable_data<T>(context.GetPlace());
+    out_z->set_lod(in_x->lod());
 
     int cols = framework::product(in_x->dims()) / rows_x;
 
@@ -81,6 +85,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
 
     if (rows_x == rows_y) {
       if (out_grad_x) {
+        out_grad_x->Resize(in_x->dims());
         math::CosSimGradFunctor<T> functor(
             in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
             in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
         for_range(functor);
       }
       if (out_grad_y) {
+        out_grad_y->Resize(in_y->dims());
         math::CosSimGradFunctor<T> functor(
             in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
             in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
       }
     } else {
       if (out_grad_x) {
+        out_grad_x->Resize(in_x->dims());
         math::CosSimDxFunctor<T> functor(
             in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
             in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
@@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
         for_range(functor);
       }
       if (out_grad_y) {
+        out_grad_y->Resize(in_y->dims());
         out_grad_y->mutable_data<T>(context.GetPlace());
         math::SetConstant<DeviceContext, T> set_zero;
         auto& dev_ctx = context.template device_context<DeviceContext>();
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 72774a878d..d6b54038ec 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     Tensor track;
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
-    auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
-                        platform::CPUPlace>(tag_num);
+    auto ker =
+        jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
+            .At(tag_num);
     ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 3adc7baebd..a617b9fb1d 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 
 namespace paddle {
 namespace operators {
 
-class CrossEntropyOp : public framework::OperatorWithKernel {
+class CrossEntropyOpBase : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
 
     auto x_dims = ctx->GetInputDim("X");
@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
                         "Input(X) and Input(Label) shall have the same shape "
                         "except the last dimension.");
     }
-    if (ctx->Attrs().Get<bool>("soft_label")) {
+
+    if (IsSoftLabel(ctx)) {
       if (check) {
         PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
                           "If Attr(soft_label) == true, the last dimension of "
@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                    ctx.device_context());
   }
+
+  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
+    return ctx->Attrs().Get<bool>("soft_label");
+  }
 };
 
-class CrossEntropyGradientOp : public framework::OperatorWithKernel {
+class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+  void InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                    "Input(Y@GRAD) shoudl be not null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@GRAD) should be not null.");
 
-    auto x_dims = ctx->GetInputDim("X");
+    auto x_dims = GetXDim(ctx);
     auto label_dims = ctx->GetInputDim("Label");
     auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
     int rank = x_dims.size();
@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                         "The Input(X) and Input(Y@Grad) should have the same "
                         "shape except the last dimension.");
     }
-    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
-                      "The last dimension of Input(Y@Grad) should be 1.");
-    if (ctx->Attrs().Get<bool>("soft_label")) {
+    if (IsSoftLabel(ctx)) {
       if (check) {
         PADDLE_ENFORCE_EQ(
             x_dims[rank - 1], label_dims[rank - 1],
@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                         "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", framework::GradVarName("X"));
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X"));
   }
 
  protected:
@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->type(),
+        ctx.device_context());
+  }
+
+  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
+    return ctx->GetInputDim("X");
+  }
+
+  virtual const char* VarNameWithXLoD() const { return "X"; }
+
+  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
+    return ctx->Attrs().Get<bool>("soft_label");
+  }
+};
+
+class CrossEntropyOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
   }
 };
 
@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X.
   }
 };
 
-class CrossEntropyOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
+class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
+ public:
+  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    CrossEntropyGradientOpBase::InferShape(ctx);
+  }
+};
+
+class CrossEntropyOp2 : public CrossEntropyOpBase {
+ public:
+  using CrossEntropyOpBase::CrossEntropyOpBase;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    CrossEntropyOpBase::InferShape(ctx);
+
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("MatchX"),
+                   "Output(MatchX) should be not null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_dims_vec = framework::vectorize(x_dims);
+    x_dims_vec.push_back(0);
+    ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec));
+    x_dims[x_dims.size() - 1] = 1;
+    ctx->SetOutputDim("MatchX", x_dims);
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+
  protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  bool IsSoftLabel(framework::InferShapeContext* ctx) const override {
+    return false;
+  }
+};
+
+class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
+ public:
+  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist");
+    CrossEntropyGradientOpBase::InferShape(ctx);
+  }
+
+ protected:
+  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
+    auto x_shape = ctx->GetInputDim("XShape");
+    return framework::DDim(x_shape.Get(), x_shape.size() - 1);
+  }
+
+  virtual const char* VarNameWithXLoD() const { return "XShape"; }
+
+  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
+    return false;
   }
 };
+
+class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
+             "size is equal to the number of classes. This input is a "
+             "probability computed by the previous operator, which is almost "
+             "always the result of a softmax operator.");
+    AddInput(
+        "Label",
+        "(Tensor), the tensor which represents the ground truth. It has the "
+        "same shape with 'X' except the last dimension. One hot Tensor.");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
+              "with 'X' except that the last dimension size is 1. It "
+              "represents the cross entropy loss.");
+    AddOutput("XShape", "Temporaily variable to save shape and LoD of X.");
+    AddOutput("MatchX",
+              "X value that matches label, used for gradient computation.");
+    AddAttr<int>("ignore_index",
+                 "(int, default -100), Specifies a target value that is"
+                 "ignored and does not contribute to the input gradient."
+                 "Only valid if soft_label is set to False")
+        .SetDefault(-100);
+    AddComment(R"DOC(
+Hard-label CrossEntropy Operator.
+
+The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
+The matrix's second dimension(row length) is as same as the original last 
+dimension, and the first dimension(column length) is the product of all other 
+original dimensions. Then the softmax computation will take palce on each raw 
+of flattened matrixs.
+
+Only support hard label.
+
+Both the input X and Label can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
+)DOC");
+  }
+};
+
+class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cross_entropy_grad2");
+    op->SetInput("Label", Input("Label"));
+    op->SetInput("MatchX", Output("MatchX"));
+    op->SetInput("XShape", Output("XShape"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 using CPUCtx = paddle::platform::CPUDeviceContext;
 
-REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
-                  ops::CrossEntropyOpInferVarType,
+REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
+                  ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
                        ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
                        ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
+
+REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2,
+                  ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType,
+                  ops::CrossEntropyGradOpDescMaker2);
+REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2);
+REGISTER_OP_CPU_KERNEL(cross_entropy2,
+                       ops::CrossEntropyOpKernel2<CPUCtx, float>,
+                       ops::CrossEntropyOpKernel2<CPUCtx, double>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad2,
+                       ops::CrossEntropyGradientOpKernel2<CPUCtx, float>,
+                       ops::CrossEntropyGradientOpKernel2<CPUCtx, double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
index fcd34383a8..243e7f52c1 100644
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL(
     cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
     ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
     ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(cross_entropy2,
+                        ops::CrossEntropyOpKernel2<CUDACtx, float>,
+                        ops::CrossEntropyOpKernel2<CUDACtx, double>,
+                        ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index f123e11542..7eb663773e 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+struct HardLabelCrossEntropyForwardFunctor {
+  HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x,
+                                      const int64_t* label,
+                                      int64_t ignore_index,
+                                      int64_t feature_size)
+      : x_(x),
+        y_(y),
+        match_x_(match_x),
+        label_(label),
+        ignore_index_(ignore_index),
+        feature_size_(feature_size) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    auto label = label_[idx];
+    if (label != ignore_index_) {
+      auto match_x = x_[idx * feature_size_ + label];
+      y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
+      match_x_[idx] = match_x;
+    } else {
+      y_[idx] = 0;
+      match_x_[idx] = 0;  // any value is ok
+    }
+  }
+
+  const T* x_;
+  T* y_;
+  T* match_x_;
+  const int64_t* label_;
+  int64_t ignore_index_;
+  int64_t feature_size_;
+};
+
+template <typename T>
+struct HardLabelCrossEntropyBackwardFunctor {
+  HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x,
+                                       const int64_t* label,
+                                       int64_t ignore_index,
+                                       int64_t feature_size)
+      : dx_(dx),
+        dy_(dy),
+        match_x_(match_x),
+        label_(label),
+        ignore_index_(ignore_index),
+        feature_size_(feature_size) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    auto label = label_[row_idx];
+    if (label == col_idx && label != ignore_index_) {
+      dx_[idx] = -dy_[row_idx] / match_x_[row_idx];
+    } else {
+      dx_[idx] = 0;
+    }
+  }
+
+  T* dx_;
+  const T* dy_;
+  const T* match_x_;
+  const int64_t* label_;
+  int64_t ignore_index_;
+  int64_t feature_size_;
+};
+
+template <typename DeviceContext, typename T>
+class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* match_x = ctx.Output<Tensor>("MatchX");
+
+    auto& x_dims = x->dims();
+    auto feature_size = x_dims[x_dims.size() - 1];
+    auto batch_size = framework::product(x->dims()) / feature_size;
+
+    auto* p_x = x->data<T>();
+    auto* p_label = label->data<int64_t>();
+    auto* p_y = y->mutable_data<T>(ctx.GetPlace());
+    auto* p_match_x = match_x->mutable_data<T>(ctx.GetPlace());
+
+    auto ignore_index = ctx.Attr<int>("ignore_index");
+
+    platform::ForRange<DeviceContext> for_range(
+        ctx.template device_context<DeviceContext>(), batch_size);
+    for_range(HardLabelCrossEntropyForwardFunctor<T>(
+        p_x, p_y, p_match_x, p_label, ignore_index, feature_size));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* match_x = ctx.Input<Tensor>("MatchX");
+    auto* label = ctx.Input<Tensor>("Label");
+
+    auto* p_dx = dx->mutable_data<T>(ctx.GetPlace());
+    auto* p_dy = dy->data<T>();
+    auto* p_match_x = match_x->data<T>();
+    auto* p_label = label->data<int64_t>();
+
+    int64_t ignore_index = ctx.Attr<int>("ignore_index");
+    int rank = dx->dims().size();
+    int64_t feature_size = dx->dims()[rank - 1];
+    int64_t batch_size = framework::product(dx->dims()) / feature_size;
+
+    platform::ForRange<DeviceContext> for_range(
+        ctx.template device_context<DeviceContext>(),
+        batch_size * feature_size);
+    for_range(HardLabelCrossEntropyBackwardFunctor<T>(
+        p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index c87837e694..94a2016aa5 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 
 if(WITH_GPU)
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 0a51d50e06..de36126774 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel {
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
       PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
                         "The rank of Input TargetBox must be 3");
-      if (axis == 0) {
-        PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-      } else if (axis == 1) {
-        PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
-      } else {
-        PADDLE_THROW("axis must be 0 or 1.");
+      PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1");
+      if (ctx->IsRuntime()) {
+        if (axis == 0) {
+          PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+        } else if (axis == 1) {
+          PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
+        }
+        PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
       }
-      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
       ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
     }
 
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index 6d406f8196..d4c7e8cf77 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -20,7 +20,7 @@ namespace operators {
 
 enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
 
-inline BoxCodeType GetBoxCodeType(const std::string& type) {
+inline BoxCodeType GetBoxCodeType(const std::string &type) {
   if (type == "encode_center_size") {
     return BoxCodeType::kEncodeCenterSize;
   } else if (type == "decode_center_size") {
@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
 template <typename DeviceContext, typename T>
 class BoxCoderKernel : public framework::OpKernel<T> {
  public:
-  void EncodeCenterSize(const framework::Tensor* target_box,
-                        const framework::Tensor* prior_box,
-                        const framework::Tensor* prior_box_var,
+  void EncodeCenterSize(const framework::Tensor *target_box,
+                        const framework::Tensor *prior_box,
+                        const framework::Tensor *prior_box_var,
                         const bool normalized,
-                        const std::vector<float> variance, T* output) const {
+                        const std::vector<float> variance, T *output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = prior_box->dims()[0];
     int64_t len = prior_box->dims()[1];
-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
+        auto *target_box_data = target_box->data<T>();
+        auto *prior_box_data = prior_box->data<T>();
+        size_t offset = i * col * len + j * len;
         T prior_box_width = prior_box_data[j * len + 2] -
                             prior_box_data[j * len] + (normalized == false);
         T prior_box_height = prior_box_data[j * len + 3] -
@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
                               target_box_data[i * len + 1] +
                               (normalized == false);
 
-        size_t offset = i * col * len + j * len;
         output[offset] =
             (target_box_center_x - prior_box_center_x) / prior_box_width;
         output[offset + 1] =
@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
             std::log(std::fabs(target_box_width / prior_box_width));
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
-        if (prior_box_var) {
-          int prior_var_offset = j * len;
-          output[offset] /= prior_box_var_data[prior_var_offset];
-          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
-          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
-          output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
-        } else if (!(variance.empty())) {
+      }
+    }
+
+    if (prior_box_var) {
+      const T *prior_box_var_data = prior_box_var->data<T>();
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+      for (int64_t i = 0; i < row; ++i) {
+        for (int64_t j = 0; j < col; ++j) {
           for (int k = 0; k < 4; ++k) {
+            size_t offset = i * col * len + j * len;
+            int prior_var_offset = j * len;
+            output[offset + k] /= prior_box_var_data[prior_var_offset + k];
+          }
+        }
+      }
+    } else if (!(variance.empty())) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+      for (int64_t i = 0; i < row; ++i) {
+        for (int64_t j = 0; j < col; ++j) {
+          for (int k = 0; k < 4; ++k) {
+            size_t offset = i * col * len + j * len;
             output[offset + k] /= static_cast<T>(variance[k]);
           }
         }
       }
     }
   }
+
   template <int axis, int var_size>
-  void DecodeCenterSize(const framework::Tensor* target_box,
-                        const framework::Tensor* prior_box,
-                        const framework::Tensor* prior_box_var,
+  void DecodeCenterSize(const framework::Tensor *target_box,
+                        const framework::Tensor *prior_box,
+                        const framework::Tensor *prior_box_var,
                         const bool normalized, std::vector<float> variance,
-                        T* output) const {
+                        T *output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = target_box->dims()[1];
     int64_t len = target_box->dims()[2];
 
-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = nullptr;
-    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
-    int prior_box_offset = 0;
-    T var_data[4] = {1., 1., 1., 1.};
-    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
+        auto *target_box_data = target_box->data<T>();
+        auto *prior_box_data = prior_box->data<T>();
+
+        T var_data[4] = {1., 1., 1., 1.};
+        T *var_ptr = var_data;
         size_t offset = i * col * len + j * len;
-        prior_box_offset = axis == 0 ? j * len : i * len;
+        int prior_box_offset = axis == 0 ? j * len : i * len;
+
         T prior_box_width = prior_box_data[prior_box_offset + 2] -
                             prior_box_data[prior_box_offset] +
                             (normalized == false);
@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         T target_box_width = 0, target_box_height = 0;
         int prior_var_offset = axis == 0 ? j * len : i * len;
         if (var_size == 2) {
-          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+          std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
                       4 * sizeof(T));
         } else if (var_size == 1) {
-          var_ptr = reinterpret_cast<T*>(variance.data());
+          var_ptr = reinterpret_cast<T *>(variance.data());
         }
         T box_var_x = *var_ptr;
         T box_var_y = *(var_ptr + 1);
@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     }
   }
 
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto *output_box = context.Output<framework::Tensor>("OutputBox");
     std::vector<float> variance = context.Attr<std::vector<float>>("variance");
     const int axis = context.Attr<int>("axis");
     if (target_box->lod().size()) {
@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
 
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
 
-    T* output = output_box->data<T>();
+    T *output = output_box->data<T>();
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                        variance, output);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
new file mode 100644
index 0000000000..e0d7e25d94
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -0,0 +1,167 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class YoloBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImgSize"),
+                   "Input(ImgSize) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Boxes"),
+                   "Output(Boxes) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Scores"),
+                   "Output(Scores) of YoloBoxOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_imgsize = ctx->GetInputDim("ImgSize");
+    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    int anchor_num = anchors.size() / 2;
+    auto class_num = ctx->Attrs().Get<int>("class_num");
+
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], anchor_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
+    PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
+                      "Input(ImgSize) should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        dim_imgsize[0], dim_x[0],
+        "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.");
+    PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2.");
+    PADDLE_ENFORCE_GT(anchors.size(), 0,
+                      "Attr(anchors) length should be greater than 0.");
+    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
+                      "Attr(anchors) length should be even integer.");
+    PADDLE_ENFORCE_GT(class_num, 0,
+                      "Attr(class_num) should be an integer greater than 0.");
+
+    int box_num = dim_x[2] * dim_x[3] * anchor_num;
+    std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes));
+
+    std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
+    ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of YoloBox operator is a 4-D tensor with "
+             "shape of [N, C, H, W]. The second dimension(C) stores "
+             "box locations, confidence score and classification one-hot "
+             "keys of each anchor box. Generally, X should be the output "
+             "of YOLOv3 network.");
+    AddInput("ImgSize",
+             "The image size tensor of YoloBox operator, "
+             "This is a 2-D tensor with shape of [N, 2]. This tensor holds "
+             "height and width of each input image used for resizing output "
+             "box in input image scale.");
+    AddOutput("Boxes",
+              "The output tensor of detection boxes of YoloBox operator, "
+              "This is a 3-D tensor with shape of [N, M, 4], N is the "
+              "batch num, M is output box number, and the 3rd dimension "
+              "stores [xmin, ymin, xmax, ymax] coordinates of boxes.");
+    AddOutput("Scores",
+              "The output tensor of detection boxes scores of YoloBox "
+              "operator, This is a 3-D tensor with shape of "
+              "[N, M, :attr:`class_num`], N is the batch num, M is "
+              "output box number.");
+
+    AddAttr<int>("class_num", "The number of classes to predict.");
+    AddAttr<std::vector<int>>("anchors",
+                              "The anchor width and height, "
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YoloBox operator "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and thrid YoloBox operators.")
+        .SetDefault(32);
+    AddAttr<float>("conf_thresh",
+                   "The confidence scores threshold of detection boxes. "
+                   "Boxes with confidence scores under threshold should "
+                   "be ignored.")
+        .SetDefault(0.01);
+    AddComment(R"DOC(
+         This operator generates YOLO detection boxes from output of YOLOv3 network.
+         
+         The output of previous network is in shape [N, C, H, W], while H and W
+         should be the same, H and W specify the grid size, each grid point predict 
+         given number boxes, this given number, which following will be represented as S,
+         is specified by the number of anchors. In the second dimension(the channel
+         dimension), C should be equal to S * (5 + class_num), class_num is the object 
+         category number of source dataset(such as 80 in coco dataset), so the 
+         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
+         also includes confidence score of the box and class one-hot key of each anchor 
+         box.
+
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box 
+         predictions should be as follows:
+
+         $$
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
+         b_w = p_w e^{t_w}
+         $$
+         $$
+         b_h = p_h e^{t_h}
+         $$
+
+         in the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.
+
+         The logistic regression value of the 5th channel of each anchor prediction boxes
+         represents the confidence score of each prediction box, and the logistic
+         regression value of the last :attr:`class_num` channels of each anchor prediction 
+         boxes represents the classifcation scores. Boxes with confidence scores less than
+         :attr:`conf_thresh` should be ignored, and box final scores is the product of 
+         confidence scores and classification scores.
+
+         $$
+         score_{pred} = score_{conf} * score_{class}
+         $$
+
+         )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
+                       ops::YoloBoxKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
new file mode 100644
index 0000000000..5a882958e6
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
@@ -0,0 +1,120 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
+                            T* scores, const float conf_thresh,
+                            const int* anchors, const int n, const int h,
+                            const int w, const int an_num, const int class_num,
+                            const int box_num, int input_size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  T box[4];
+  for (; tid < n * box_num; tid += stride) {
+    int grid_num = h * w;
+    int i = tid / box_num;
+    int j = (tid % box_num) / grid_num;
+    int k = (tid % grid_num) / w;
+    int l = tid % w;
+
+    int an_stride = (5 + class_num) * grid_num;
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
+
+    int obj_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
+    T conf = sigmoid<T>(input[obj_idx]);
+    if (conf < conf_thresh) {
+      continue;
+    }
+
+    int box_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0);
+    GetYoloBox<T>(box, input, anchors, l, k, j, h, input_size, box_idx,
+                  grid_num, img_height, img_width);
+    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
+    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width);
+
+    int label_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5);
+    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
+    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
+                      grid_num);
+  }
+}
+
+template <typename T>
+class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* img_size = ctx.Input<Tensor>("ImgSize");
+    auto* boxes = ctx.Output<Tensor>("Boxes");
+    auto* scores = ctx.Output<Tensor>("Scores");
+
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int box_num = boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    int input_size = downsample_ratio * h;
+
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int bytes = sizeof(int) * anchors.size();
+    auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size());
+    int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    const auto cplace = platform::CPUPlace();
+    memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes,
+                 dev_ctx.stream());
+
+    const T* input_data = input->data<T>();
+    const int* imgsize_data = img_size->data<int>();
+    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
+    T* scores_data =
+        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(dev_ctx, boxes, static_cast<T>(0));
+    set_zero(dev_ctx, scores, static_cast<T>(0));
+
+    int grid_dim = (n * box_num + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    KeYoloBoxFw<T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+        input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
+        anchors_data, n, h, w, an_num, class_num, box_num, input_size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
+                        ops::YoloBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h
new file mode 100644
index 0000000000..8b7c7df0f3
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+HOSTDEVICE inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
+                                  int j, int an_idx, int grid_size,
+                                  int input_size, int index, int stride,
+                                  int img_height, int img_width) {
+  box[0] = (i + sigmoid<T>(x[index])) * img_width / grid_size;
+  box[1] = (j + sigmoid<T>(x[index + stride])) * img_height / grid_size;
+  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
+           input_size;
+  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
+           img_height / input_size;
+}
+
+HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
+                                    int an_num, int an_stride, int stride,
+                                    int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
+                                        const int img_height,
+                                        const int img_width) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
+  boxes[box_idx + 1] =
+      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
+  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                           ? boxes[box_idx + 2]
+                           : static_cast<T>(img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                           ? boxes[box_idx + 3]
+                           : static_cast<T>(img_height - 1);
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
+                                      const int label_idx, const int score_idx,
+                                      const int class_num, const T conf,
+                                      const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
+  }
+}
+
+template <typename T>
+class YoloBoxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* imgsize = ctx.Input<Tensor>("ImgSize");
+    auto* boxes = ctx.Output<Tensor>("Boxes");
+    auto* scores = ctx.Output<Tensor>("Scores");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int box_num = boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    Tensor anchors_;
+    auto anchors_data =
+        anchors_.mutable_data<int>({an_num * 2}, ctx.GetPlace());
+    std::copy(anchors.begin(), anchors.end(), anchors_data);
+
+    const T* input_data = input->data<T>();
+    const int* imgsize_data = imgsize->data<int>();
+    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
+    memset(boxes_data, 0, boxes->numel() * sizeof(T));
+    T* scores_data =
+        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
+    memset(scores_data, 0, scores->numel() * sizeof(T));
+
+    T box[4];
+    for (int i = 0; i < n; i++) {
+      int img_height = imgsize_data[2 * i];
+      int img_width = imgsize_data[2 * i + 1];
+
+      for (int j = 0; j < an_num; j++) {
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            int obj_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4);
+            T conf = sigmoid<T>(input_data[obj_idx]);
+            if (conf < conf_thresh) {
+              continue;
+            }
+
+            int box_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0);
+            GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, input_size,
+                          box_idx, stride, img_height, img_width);
+            box_idx = (i * box_num + j * stride + k * w + l) * 4;
+            CalcDetectionBox<T>(boxes_data, box, box_idx, img_height,
+                                img_width);
+
+            int label_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5);
+            int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
+            CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
+                              class_num, conf, stride);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
index ab01bdf7ca..6c37da17f4 100644
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -10,6 +10,7 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/detection/yolov3_loss_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(class_num, 0,
                       "Attr(class_num) should be an integer greater then 0.");
 
+    if (ctx->HasInput("GTScore")) {
+      auto dim_gtscore = ctx->GetInputDim("GTScore");
+      PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
+                        "Input(GTScore) should be a 2-D tensor");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[0], dim_gtbox[0],
+          "Input(GTBox) and Input(GTScore) dim[0] should be same");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[1], dim_gtbox[1],
+          "Input(GTBox) and Input(GTScore) dim[1] should be same");
+    }
+
     std::vector<int64_t> dim_out({dim_x[0]});
     ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
 
@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
              "This is a 2-D tensor with shape of [N, max_box_num], "
              "and each element should be an integer to indicate the "
              "box class id.");
+    AddInput("GTScore",
+             "The score of GTLabel, This is a 2-D tensor in same shape "
+             "GTLabel, and score values should in range (0, 1). This "
+             "input is for GTLabel score can be not 1.0 in image mixup "
+             "augmentation.")
+        .AsDispensable();
     AddOutput("Loss",
               "The output yolov3 loss tensor, "
               "This is a 1-D tensor with shape of [N]");
@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("ignore_thresh",
                    "The ignore threshold to ignore confidence loss.")
         .SetDefault(0.7);
+    AddAttr<bool>("use_label_smooth",
+                  "Whether to use label smooth. Default True.")
+        .SetDefault(true);
     AddComment(R"DOC(
          This operator generates yolov3 loss based on given predict result and ground
          truth boxes.
@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          loss = (loss_{xy} + loss_{wh}) * weight_{box}
               + loss_{conf} + loss_{class}
          $$
+
+         While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
+         target will be smoothed when calculating classification loss, target of 
+         positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
+         negetive samples will be smoothed to :math:`1.0 / class\_num`.
+
+         While :attr:`GTScore` is given, which means the mixup score of ground truth 
+         boxes, all losses incured by a ground truth box will be multiplied by its 
+         mixup score.
          )DOC");
   }
 };
@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("X", Input("X"));
     op->SetInput("GTBox", Input("GTBox"));
     op->SetInput("GTLabel", Input("GTLabel"));
+    op->SetInput("GTScore", Input("GTScore"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
     op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
     op->SetInput("GTMatchMask", Output("GTMatchMask"));
@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     op->SetOutput(framework::GradVarName("GTBox"), {});
     op->SetOutput(framework::GradVarName("GTLabel"), {});
+    op->SetOutput(framework::GradVarName("GTScore"), {});
     return std::unique_ptr<framework::OpDesc>(op);
   }
 };
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h
index 8407d4e6e8..a004b022b7 100644
--- a/paddle/fluid/operators/detection/yolov3_loss_op.h
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.h
@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) {
 }
 
 template <typename T>
-static T L2Loss(T x, T y) {
-  return 0.5 * (y - x) * (y - x);
+static T L1Loss(T x, T y) {
+  return std::abs(y - x);
 }
 
 template <typename T>
@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) {
 }
 
 template <typename T>
-static T L2LossGrad(T x, T y) {
-  return x - y;
+static T L1LossGrad(T x, T y) {
+  return x > y ? 1.0 : -1.0;
 }
 
 static int GetMaskIndex(std::vector<int> mask, int val) {
@@ -121,47 +121,49 @@ template <typename T>
 static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
                                 std::vector<int> anchors, int an_idx,
                                 int box_idx, int gi, int gj, int grid_size,
-                                int input_size, int stride) {
+                                int input_size, int stride, T score) {
   T tx = gt.x * grid_size - gi;
   T ty = gt.y * grid_size - gj;
   T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
   T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
 
-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
   loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
   loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
 }
 
 template <typename T>
 static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
                                     Box<T> gt, std::vector<int> anchors,
                                     int an_idx, int box_idx, int gi, int gj,
-                                    int grid_size, int input_size, int stride) {
+                                    int grid_size, int input_size, int stride,
+                                    T score) {
   T tx = gt.x * grid_size - gi;
   T ty = gt.y * grid_size - gj;
   T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
   T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
 
-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
   input_grad[box_idx] =
       SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
   input_grad[box_idx + stride] =
       SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
   input_grad[box_idx + 2 * stride] =
-      L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
   input_grad[box_idx + 3 * stride] =
-      L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
 }
 
 template <typename T>
 static inline void CalcLabelLoss(T* loss, const T* input, const int index,
                                  const int label, const int class_num,
-                                 const int stride) {
+                                 const int stride, const T pos, const T neg,
+                                 T score) {
   for (int i = 0; i < class_num; i++) {
     T pred = input[index + i * stride];
-    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
+    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
   }
 }
 
@@ -169,11 +171,13 @@ template <typename T>
 static inline void CalcLabelLossGrad(T* input_grad, const T loss,
                                      const T* input, const int index,
                                      const int label, const int class_num,
-                                     const int stride) {
+                                     const int stride, const T pos, const T neg,
+                                     T score) {
   for (int i = 0; i < class_num; i++) {
     T pred = input[index + i * stride];
     input_grad[index + i * stride] =
-        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
+        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
+        loss;
   }
 }
 
@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
         for (int l = 0; l < w; l++) {
           T obj = objness[k * w + l];
           if (obj > 1e-5) {
-            // positive sample: obj = 1
-            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
+            // positive sample: obj = mixup score
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
           } else if (obj > -0.5) {
             // negetive sample: obj = 0
             loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
           T obj = objness[k * w + l];
           if (obj > 1e-5) {
             input_grad[k * w + l] =
-                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj *
+                loss[i];
           } else if (obj > -0.5) {
             input_grad[k * w + l] =
                 SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* gt_box = ctx.Input<Tensor>("GTBox");
     auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
     auto* loss = ctx.Output<Tensor>("Loss");
     auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
     auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     int class_num = ctx.Attr<int>("class_num");
     float ignore_thresh = ctx.Attr<float>("ignore_thresh");
     int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
 
     const int n = input->dims()[0];
     const int h = input->dims()[2];
@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     const int stride = h * w;
     const int an_stride = (class_num + 5) * stride;
 
+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
+
     const T* input_data = input->data<T>();
     const T* gt_box_data = gt_box->data<T>();
     const int* gt_label_data = gt_label->data<int>();
@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     int* gt_match_mask_data =
         gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
 
+    const T* gt_score_data;
+    if (!gt_score) {
+      Tensor gtscore;
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
+
     // calc valid gt box mask, avoid calc duplicately in following code
     Tensor gt_valid_mask;
     bool* gt_valid_mask_data =
@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
         int mask_idx = GetMaskIndex(anchor_mask, best_n);
         gt_match_mask_data[i * b + t] = mask_idx;
         if (mask_idx >= 0) {
+          T score = gt_score_data[i * b + t];
           int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                       an_stride, stride, 0);
           CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
-                                 box_idx, gi, gj, h, input_size, stride);
+                                 box_idx, gi, gj, h, input_size, stride, score);
 
           int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
-          obj_mask_data[obj_idx] = 1.0;
+          obj_mask_data[obj_idx] = score;
 
           int label = gt_label_data[i * b + t];
           int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                         an_stride, stride, 5);
           CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
-                           class_num, stride);
+                           class_num, stride, label_pos, label_neg, score);
         }
       }
     }
@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* gt_box = ctx.Input<Tensor>("GTBox");
     auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
     int class_num = ctx.Attr<int>("class_num");
     int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
 
     const int n = input_grad->dims()[0];
     const int c = input_grad->dims()[1];
@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     const int stride = h * w;
     const int an_stride = (class_num + 5) * stride;
 
+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
+
     const T* input_data = input->data<T>();
     const T* gt_box_data = gt_box->data<T>();
     const int* gt_label_data = gt_label->data<int>();
@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
         input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
     memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
 
+    const T* gt_score_data;
+    if (!gt_score) {
+      Tensor gtscore;
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
+
     for (int i = 0; i < n; i++) {
       for (int t = 0; t < b; t++) {
         int mask_idx = gt_match_mask_data[i * b + t];
         if (mask_idx >= 0) {
+          T score = gt_score_data[i * b + t];
           Box<T> gt = GetGtBox(gt_box_data, i, b, t);
           int gi = static_cast<int>(gt.x * w);
           int gj = static_cast<int>(gt.y * h);
 
           int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                       an_stride, stride, 0);
-          CalcBoxLocationLossGrad<T>(
-              input_grad_data, loss_grad_data[i], input_data, gt, anchors,
-              anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride);
+          CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
+                                     input_data, gt, anchors,
+                                     anchor_mask[mask_idx], box_idx, gi, gj, h,
+                                     input_size, stride, score);
 
           int label = gt_label_data[i * b + t];
           int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                         an_stride, stride, 5);
           CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
-                               label_idx, label, class_num, stride);
+                               label_idx, label, class_num, stride, label_pos,
+                               label_neg, score);
         }
       }
     }
diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cc
new file mode 100644
index 0000000000..0fbc27515c
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+struct MutableDataFunctor {
+  MutableDataFunctor(void** data, framework::LoDTensor* tensor,
+                     const platform::Place& place)
+      : data_(data), tensor_(tensor), place_(place) {}
+
+  template <typename T>
+  void apply() {
+    *data_ = tensor_->mutable_data<T>(place_);
+  }
+
+  void** data_;
+  framework::LoDTensor* tensor_;
+  platform::Place place_;
+};
+
+class AllReduceOp : public framework::OperatorBase {
+  using OperatorBase::OperatorBase;
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE(is_gpu_place(place),
+                   "AllReduce op can run on gpu place only for now.");
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* ctx = pool.Get(place);
+    auto in_names = Inputs("X");
+    auto out_names = Outputs("Out");
+    PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input");
+    PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output");
+
+    auto* in = scope.FindVar(in_names[0]);
+    auto* out = scope.FindVar(out_names[0]);
+
+    PADDLE_ENFORCE(in->IsType<framework::LoDTensor>() ||
+                       out->IsType<framework::LoDTensor>(),
+                   "Only support allreduce LoDTensors");
+
+    int dtype = -1;
+    auto in_tensor = in->Get<framework::LoDTensor>();
+    dtype = platform::ToNCCLDataType(in_tensor.type());
+
+    int64_t numel = in_tensor.numel();
+    auto* sendbuff = in_tensor.data<void>();
+    auto* out_tensor = out->GetMutable<framework::LoDTensor>();
+    out_tensor->Resize(in_tensor.dims());
+    void* recvbuff = nullptr;
+    framework::VisitDataType(in_tensor.type(),
+                             MutableDataFunctor(&recvbuff, out_tensor, place));
+
+    auto cuda_ctx = static_cast<platform::CUDADeviceContext*>(ctx);
+    auto* comm = cuda_ctx->nccl_comm();
+    // FIXME(typhoonzero): should use nccl stream here.
+    auto stream = cuda_ctx->stream();
+
+    int reduce_type = Attr<int>("reduce_type");
+    ncclRedOp_t red_type = ncclSum;
+    switch (reduce_type) {
+      case 0:
+        red_type = ncclSum;
+        break;
+      case 1:
+        red_type = ncclProd;
+        break;
+      case 2:
+        red_type = ncclMax;
+        break;
+      case 3:
+        red_type = ncclMin;
+        break;
+    }
+
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
+        comm, stream));
+#endif
+  }
+};
+
+class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be allreduced.");
+    AddOutput("Out", "(Tensor) the result of allreduced.");
+    AddAttr<int>("reduce_type", "(int) determin the reduce type.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+***AllReduce Operator***
+
+Call NCCL AllReduce internally. Note that this op must be used when one
+thread is managing one GPU device.
+
+For speed reasons, reduce_type should be an integer:
+
+0: sum
+1: prod
+2: max
+3: min
+
+If input and output are the same variable, in-place allreduce will be used.
+)DOC");
+  }
+};
+
+class AllReduceOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(allreduce, ops::AllReduceOp,
+                  paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker,
+                  ops::AllReduceOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
index 28ebdcb03e..5ee35e0458 100644
--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase {
 
 class FakeInitOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
index da0185b8c4..1b0b4dd316 100644
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
@@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel {
 
 class MergeIdsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(input_var->GetType());
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, input_type);
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
index f61d387fbe..191ca1efe8 100644
--- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
@@ -71,11 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel {
 
 class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(input_var->GetType());
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, input_type);
     }
   }
 };
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 2ccc86c1dc..65c2ff6415 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dropout_op.h"
+#include <memory>
 #include <string>
 
 namespace paddle {
@@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
         "1. downgrade_in_infer(default), downgrade the outcome at inference "
         "time"
         "   train: out = input * mask"
-        "   inference: out = input * dropout_prob"
+        "   inference: out = input * (1.0 - dropout_prob)"
         "2. upscale_in_train, upscale the outcome at training time, do nothing "
         "in inference"
         "   train: out = input * mask / ( 1.0 - dropout_prob )"
@@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
                       "GradOp is only callable when is_test is false");
 
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) must not be null.");
 
-    auto x_dims = ctx->GetInputDim("X");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(x_dims, out_dims,
-                      "Dimensions of Input(X) and Out@Grad must be the same.");
-    auto mask_dims = ctx->GetInputDim("Mask");
-    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
-                      "Dimensions of Input(X) and Mask must be the same.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+
+    ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    ctx->ShareLoD(framework::GradVarName("Out"),
+                  /*->*/ framework::GradVarName("X"));
+  }
+};
+
+class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("dropout_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Mask", Output("Mask"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
   }
 };
 
@@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::DropoutGradOpDescMaker);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
new file mode 100644
index 0000000000..66c56da417
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "FloorDiv"; }
+  std::string GetEquation() const override { return "Out = X // Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
+                             ops::ElementwiseFloorDivOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
+                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
new file mode 100644
index 0000000000..60846d1e8f
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
new file mode 100644
index 0000000000..2d24e394d5
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct FloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+};
+
+template <typename DeviceContext, typename T>
+void elementwise_floor_div(const framework::ExecutionContext &ctx,
+                           const framework::Tensor *x,
+                           const framework::Tensor *y, framework::Tensor *z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    auto *y = ctx.Input<framework::LoDTensor>("Y");
+    auto *z = ctx.Output<framework::LoDTensor>("Out");
+
+    z->mutable_data<T>(ctx.GetPlace());
+
+    // dtype of x and y is int64 or int32
+    elementwise_floor_div<DeviceContext, T>(ctx, x, y, z);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
new file mode 100644
index 0000000000..d63a7df03d
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseModOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Mod"; }
+  std::string GetEquation() const override { return "Out = X % Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
+                             ops::ElementwiseModOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mod,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
new file mode 100644
index 0000000000..da3304a839
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
new file mode 100644
index 0000000000..5b139fd4b3
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct ModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
+};
+
+template <typename DeviceContext, typename T>
+void elementwise_mod(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     framework::Tensor *z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        ModFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+class ElementwiseModKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    auto *y = ctx.Input<framework::LoDTensor>("Y");
+    auto *z = ctx.Output<framework::LoDTensor>("Out");
+
+    z->mutable_data<T>(ctx.GetPlace());
+
+    // dtype of x and y is int64 or int32
+    elementwise_mod<DeviceContext, T>(ctx, x, y, z);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index 04e8800bbc..f2f4d3fee0 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
         constexpr int simd_width = 16;
         int C = c / simd_width;
 
-        auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
-                                 platform::CPUPlace>(0);
+        auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
+                                         platform::CPUPlace>::Cache()
+                            .At(0);
 #pragma omp parallel for collapse(2)
         for (int ni = 0; ni < n; ni++) {
           for (int ci = 0; ci < C; ci++) {
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 44a2f37b66..fcb2be9363 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+#include <memory>
 #include <vector>
 
 namespace paddle {
@@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("expand_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ExpandGradOpDescMaker);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 5d6488c67e..4a8937ba1c 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fake_dequantize_op.h"
 #include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -32,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
   }
 };
 
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    if (scale_num == 1) {
+      const int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      for (int i = 0; i < channel; i++) {
+        T s = scale_factor[i];
+        framework::Tensor one_channel_in = in->Slice(i, i + 1);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        auto& dev = *dev_ctx.eigen_device();
+        out_e.device(dev) = (s / max_range) * in_e;
+      }
+    } else if (scale_num == 2) {
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      for (int i = 0; i < batch_size; i++) {
+        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
+            framework::slice_ddim(in->dims(), 1, in->dims().size()));
+        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
+            framework::slice_ddim(out->dims(), 1, out->dims().size()));
+        for (int j = 0; j < channel; j++) {
+          T s = scale_one[j];
+          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
+          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
+          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+          auto& dev = *dev_ctx.eigen_device();
+          out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
+        }
+      }
+    }
+  }
+};
+
 template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
 template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
 
 class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
  public:
@@ -76,6 +120,63 @@ $$Out = \frac{scale*X}{ max_range }$$
   }
 };
 
+class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInputs("Scales"),
+                   "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+
+    ctx->ShareDim("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeChannelWiseDequantizeMaxAbsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddInput("Scales",
+             "(Tensors) The scales in quantization stage. "
+             "Now, `Scales` is a vector with at most two tensors. "
+             "If Scales has two elements, the second tensor should only have "
+             "one value.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<std::vector<int>>(
+        "quant_bits",
+        "Quantization bit numbers in quantization stage. "
+        "The size of `quant_bits` should be equal to the size of `Scales`.")
+        .SetDefault({8});
+
+    AddComment(R"DOC(
+FakeChannelWiseDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
+
+$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
+
+In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$.
+Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$  in the formula.
+
+Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization.
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -88,3 +189,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
 REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
                        ops::FakeDequantizeMaxAbsKernel<CPU, float>,
                        ops::FakeDequantizeMaxAbsKernel<CPU, double>);
+
+REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
+                  ops::FakeChannelWiseDequantizeMaxAbsOp,
+                  ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index 225bcc45bc..02f9dc827d 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
+template <typename T>
+__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
+                                   int num, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / channel;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
+  }
+}
+
+template <typename T>
+__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
+                                   const T* scale_two, T max_range, int num,
+                                   int batch_size, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / (batch_size * channel);
+  int scale_index = blockIdx.x % channel;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
+  }
+}
+
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    const T* in_data = in->data<T>();
+    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+    if (scale_num == 1) {
+      int num = in->numel();
+      int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      int block = 1024;
+      int grid = channel;
+      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_factor, max_range, num, channel, out_data);
+    } else if (scale_num == 2) {
+      int num = in->numel();
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      int block = 1024;
+      int grid = batch_size * channel;
+      DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
+          out_data);
+    }
+  }
+};
+
 template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
 template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace operators
 }  // namespace paddle
@@ -55,3 +113,7 @@ using CUDA = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
                         ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
                         ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_channel_wise_dequantize_max_abs,
+    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
+    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index d9923a10da..ed9a0a4d65 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -27,6 +29,13 @@ struct DequantizeFunctor {
                   framework::Tensor* out);
 };
 
+template <typename DeviceContext, typename T>
+struct ChannelDequantizeFunctor {
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
+                  const framework::Tensor** scales, const int scale_num,
+                  T max_range, framework::Tensor* out);
+};
+
 template <typename DeviceContext, typename T>
 class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
  public:
@@ -45,5 +54,43 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto scales = ctx.MultiInput<framework::Tensor>("Scales");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
+    int max_range = 1;
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    out->mutable_data<T>(dev_ctx.GetPlace());
+    int scale_num = scales.size();
+    if (scale_num == 1) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[0],
+          "The number of first scale values must be the same with "
+          "first dimension value of Input(X) when the `Scales` has only one "
+          "element.");
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
+    } else if (scale_num == 2) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[1],
+          "The number of first scale values must be the same with "
+          "second dimension value of Input(X) when the `Scales` has two "
+          "elements.");
+      PADDLE_ENFORCE_EQ(
+          scales[1]->numel(), 1,
+          "The second scale tensor should only have one value at now.");
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
+                   (std::pow(2, quant_bits[1] - 1) - 1);
+    }
+    ChannelDequantizeFunctor<DeviceContext, T>()(
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 3bb07d3835..054ef4658c 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
 
 template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
 
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int channel_size = num / channel;
+    for (int i = 0; i < channel; i++) {
+      auto* start = in + i * channel_size;
+      auto* end = in + (i + 1) * channel_size;
+      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
+    }
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
 template <typename T>
 struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
 
 template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
 
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    auto* scale_data = scale.data<T>();
+    auto* in_data = in.data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    const int channel_size = in.numel() / channel;
+    platform::Transform<platform::CPUDeviceContext> trans;
+    for (int i = 0; i < channel; i++) {
+      T s = scale_data[i];
+      auto* start = in_data + i * channel_size;
+      auto* end = in_data + (i + 1) * channel_size;
+      trans(ctx, start, end, out_data + i * channel_size,
+            ClipFunctor<T>(-s, s));
+    }
+    for (int i = 0; i < channel; i++) {
+      T s = scale_data[i];
+      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+      out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
+    }
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
+                                               float>;
+
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -81,6 +126,30 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
 
 template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
 
+template <typename T>
+struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state, const T* cur_scale,
+                  const float rate, framework::Tensor* out_state,
+                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
+    T accum = in_accum.data<T>()[0];
+    T state = in_state.data<T>()[0];
+    T scale = cur_scale[0];
+
+    state = rate * state + 1;
+    accum = rate * accum + scale;
+    scale = accum / state;
+
+    out_state->mutable_data<T>(ctx.GetPlace())[0] = state;
+    out_accum->mutable_data<T>(ctx.GetPlace())[0] = accum;
+    out_scale->mutable_data<T>(ctx.GetPlace())[0] = scale;
+  }
+};
+
+template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
+                                               float>;
+
 class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
  public:
   FakeQuantizeAbsMaxOp(const std::string& type,
@@ -134,6 +203,60 @@ $$Out = round(X/scale * range)$$
   }
 };
 
+class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeChannelWiseQuantizeOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("OutScale"),
+        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class FakeChannelWiseQuantizeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized low level tensor, "
+              "but also saved as float data type.");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddComment(R"DOC(
+The scale of FakeChannelWiseQuantize operator is a vector.
+In detail, each channel of the input X has a scale value.
+
+$$scale_c = max(abs(X_c))$$
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out_c = round(\frac{X_c * range} {scale_c})$$
+In above three formulas, the range value of c is as follow:
+$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
+)DOC");
+  }
+};
+
 class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  public:
   FakeQuantizeRangeAbsMaxOp(const std::string& type,
@@ -201,6 +324,78 @@ $$Out = round(X/scale * range)$$
   }
 };
 
+class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeMovingAverageAbsMaxOp(const std::string& type,
+                                    const framework::VariableNameMap& inputs,
+                                    const framework::VariableNameMap& outputs,
+                                    const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
+                   "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp "
+                   "should not be null");
+    if (ctx->HasOutput("OutState")) {
+      ctx->SetOutputDim("OutState", {1});
+    }
+    if (ctx->HasOutput("OutAccum")) {
+      ctx->SetOutputDim("OutAccum", {1});
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
+  }
+};
+
+class FakeQuantizeMovingAverageAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddInput("InScale", "Last scale.");
+    AddInput("InAccum", "Last accum.").AsDispensable();
+    AddInput("InState", "Last state.").AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScale", " Current scale");
+    AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
+    AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
+    AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
+        .SetDefault(0.9);
+    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator is used in static quantization.
+
+$$scale = (0.9*max(abs(x))+accum)/(0.9*state+1)$$
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$
+
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -218,3 +413,16 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
                        ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
+
+REGISTER_OPERATOR(fake_quantize_moving_average_abs_max,
+                  ops::FakeQuantizeMovingAverageAbsMaxOp,
+                  ops::FakeQuantizeMovingAverageAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
+                       ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
+REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
+                  ops::FakeChannelWiseQuantizeAbsMaxOp,
+                  ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
+                       ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index a0ff639621..33bd275e5c 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -74,6 +74,45 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 
 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
+template <typename T>
+__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
+                                        T* out) {
+  int tid = threadIdx.x;
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  extern __shared__ T shared_max_data[];
+  shared_max_data[tid] = T(0);
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T tmp = fabs(in_c[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  __syncthreads();
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    int block = 1024;
+    int grid = channel;
+    FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
+        in, num, channel, out);
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
+
 template <typename T>
 __global__ void ClipAndQuantKernel(const T* in, const T* scale,
                                    const int bin_cnt, const int n, T* out) {
@@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,
 
   T s = scale[0];
   for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[bid];
+    T x = in[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt / s * v;
+    out[i] = round(v);
+  }
+}
+
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, out_data);
+  }
+};
+
+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+
+template <typename T>
+__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
+                                          const int bin_cnt, const int n,
+                                          const int c, T* out) {
+  int tid = threadIdx.x;
+
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+
+  T s = scale[blockIdx.x];
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T x = in_c[i];
     T v = x > s ? s : x;
     v = v < -s ? -s : v;
     v = bin_cnt / s * v;
-    out[bid] = round(v);
+    out_c[i] = round(v);
   }
 }
 
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    int grid = channel;
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, channel, out_data);
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
+                                               float>;
+
 template <typename T>
 __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
                                             const T* last_scale,
@@ -148,24 +249,39 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
 template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
+                  const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state, const T* cur_scale,
+                  const float rate, framework::Tensor* out_state,
+                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+    T accum;
+    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
+                 sizeof(T), 0);
+    T state;
+    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
+                 sizeof(T), 0);
+    T scale;
+    memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
+                 0);
 
-    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
+    state = rate * state + 1;
+    accum = rate * accum + scale;
+    scale = accum / state;
+
+    memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &accum, sizeof(T), 0);
+    memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &state, sizeof(T), 0);
+    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &scale, sizeof(T), 0);
   }
 };
 
-template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
+                                               float>;
 
 }  // namespace operators
 }  // namespace paddle
@@ -174,5 +290,10 @@ namespace ops = paddle::operators;
 using CUDA = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
                         ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
+                        ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
                         ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_quantize_moving_average_abs_max,
+    ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 7ace7573ec..5ab38b086d 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -42,12 +42,33 @@ struct FindRangeAbsMaxFunctor {
                   framework::Tensor* scales_arr, framework::Tensor* out_scale);
 };
 
+template <typename DeviceContext, typename T>
+struct FindChannelAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const T* in, const int num,
+                  const int channel, T* out);
+};
+
+template <typename DeviceContext, typename T>
+struct ChannelClipAndFakeQuantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  const int channel, framework::Tensor* out);
+};
+
+template <typename DeviceContext, typename T>
+struct FindMovingAverageAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state,
+                  const framework::Tensor& cur_scale,
+                  framework::Tensor* out_state, framework::Tensor* out_accum,
+                  framework::Tensor* out_scale);
+};
+
 template <typename DeviceContext, typename T>
 class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
-
     auto* out = context.Output<framework::Tensor>("Out");
     auto* out_scale = context.Output<framework::Tensor>("OutScale");
     T* out_s = out_scale->mutable_data<T>(context.GetPlace());
@@ -63,6 +84,28 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
+    out->mutable_data<T>(context.GetPlace());
+
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(
+        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
+    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
+        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
  public:
@@ -105,5 +148,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* in_scale = context.Input<framework::Tensor>("InScale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    bool is_test = context.Attr<bool>("is_test");
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    // testing
+    if (is_test) {
+      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
+                                                  bin_cnt, out);
+      return;
+    }
+
+    // training
+    auto* in_accum = context.Input<framework::Tensor>("InAccum");
+    auto* in_state = context.Input<framework::Tensor>("InState");
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cur_scale = allocator.Allocate(1 * sizeof(T));
+    T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
+                                          cur_scale_data);
+
+    auto* out_state = context.Output<framework::Tensor>("OutState");
+    auto* out_accum = context.Output<framework::Tensor>("OutAccum");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    out_state->mutable_data<T>(context.GetPlace());
+    out_accum->mutable_data<T>(context.GetPlace());
+    out_scale->mutable_data<T>(context.GetPlace());
+    float moving_rate = context.Attr<float>("moving_rate");
+
+    FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
+        dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
+        out_accum, out_scale);
+
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index eb4617a935..242f5390b8 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
       "The input tensor Input's rank of FCOp should be larger than "
       "in_num_col_dims.");
 
-  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
-  PADDLE_ENFORCE_EQ(
-      in_mat_dims[1], w_dims[0],
-      "Fully Connected input and weigth size do not match. %s, %s");
-
   std::vector<int64_t> output_dims;
-  output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
-  for (int i = 0; i < in_num_col_dims; ++i) {
-    output_dims.push_back(in_dims[i]);
-  }
-  output_dims.push_back(w_dims[1]);
+  FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
 
   ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
   ctx->ShareLoD("Input", "Out");
@@ -128,6 +119,9 @@ void FCOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                "Skip calling InferShape() function in the runtime.")
+      .SetDefault(true);
   AddComment(R"DOC(
   Fully Connected Operator.
 
@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel<T> {
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
     auto w_dims = w->dims();
+
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
+
     auto out_dims = output->dims();
-    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
+    int M = framework::product(out_dims) / w_dims[1];
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index e1b780fc0c..b82a63cd83 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override;
 };
 
+inline void FCOutputSize(const framework::DDim& in_dims,
+                         const framework::DDim& w_dims,
+                         std::vector<int64_t>& out_dims,  // NOLINT
+                         int in_num_col_dims) {
+  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
+  PADDLE_ENFORCE_EQ(
+      in_mat_dims[1], w_dims[0],
+      "Fully Connected input and weigth size do not match. %s, %s");
+
+  out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
+  for (int i = 0; i < in_num_col_dims; ++i) {
+    out_dims.push_back(in_dims[i]);
+  }
+  out_dims.push_back(w_dims[1]);
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index c86430524e..cf2f4776cf 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel {
 
 class FillConstantOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
+  void operator()(framework::InferVarTypeContext* ctx) const override {
     auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
-    auto& out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetDataType(data_type);
+        boost::get<int>(ctx->GetAttr("dtype")));
+    auto& out_var_name = ctx->Output("Out").front();
+    ctx->SetDataType(out_var_name, data_type);
   }
 };
 
diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc
new file mode 100644
index 0000000000..fbe8e56a61
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fsp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FSPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FSPOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE(
+        x_dims.size() == 4,
+        "The Input(X) must have shape [batch_size, channel, height, width].");
+    PADDLE_ENFORCE(
+        y_dims.size() == 4,
+        "The Input(Y) must have shape [batch_size, channel, height, width].");
+    PADDLE_ENFORCE(
+        (x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]),
+        "The Input(X) and Input(Y) should have the same height and width.");
+
+    ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]});
+    ctx->ShareLoD("X", "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context(), layout_, library_);
+  }
+};
+
+class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input of FSP op with shape [batch_size, x_channel, "
+             "height, width]");
+    AddInput("Y",
+             "(Tensor) The input of FSP op with shape"
+             "[batch_size, y_channel, height, width]."
+             "The y_channel can be different with the x_channel of Input(X)"
+             " while the other dimensions must be the same with Input(X)'s.");
+    AddOutput(
+        "Out",
+        "(Tensor) The output of FSP op with shape "
+        "[batch_size, x_channel, y_channel]. The x_channel is the channel "
+        "of Input(X) and the y_channel is the channel of Input(Y).");
+    AddComment(R"DOC(
+    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
+    Given feature map x with shape [x_channel, h, w] and feature map y with shape
+    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
+
+        step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
+                transpose y into matrix with shape [h * w, y_channel]
+        step 2: multiply x and y to get fsp matrix with shape [x_channel, y_channel]
+
+    The output is a batch of fsp matrices.
+    )DOC");
+  }
+};
+
+class FSPOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    fsp, ops::FSPOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FSPOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    fsp_grad, ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu
new file mode 100644
index 0000000000..4fd7ba04ff
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/fsp_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<plat::CUDADeviceContext, float>,
+                        ops::FSPOpKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(fsp_grad,
+                        ops::FSPGradOpKernel<plat::CUDADeviceContext, float>,
+                        ops::FSPGradOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h
new file mode 100644
index 0000000000..544af2b7d9
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class FSPOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+
+    auto batch_size = x_dims[0];
+    auto x_channel = x_dims[1];
+    auto y_channel = y_dims[1];
+    auto height = x_dims[2];
+    auto width = x_dims[3];
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+
+    math::MatDescriptor x_mat_desc;
+    x_mat_desc.height_ = x_channel;
+    x_mat_desc.width_ = height * width;
+    x_mat_desc.batch_size_ = batch_size;
+    x_mat_desc.stride_ = x_channel * height * width;
+
+    math::MatDescriptor y_mat_desc;
+    y_mat_desc.height_ = height * width;
+    y_mat_desc.width_ = y_channel;
+    y_mat_desc.batch_size_ = batch_size;
+    y_mat_desc.stride_ = y_channel * height * width;
+    y_mat_desc.trans_ = true;
+
+    blas.MatMul(*x, x_mat_desc, *y, y_mat_desc,
+                static_cast<T>(1.0 / (height * width)), output,
+                static_cast<T>(0.0));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FSPGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    if (d_x == nullptr && d_y == nullptr) {
+      return;
+    }
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_out_dims = d_out->dims();
+    auto batch_size = d_out_dims[0];
+    auto x_channel = d_out_dims[1];
+    auto y_channel = d_out_dims[2];
+    int64_t h = 0;
+    int64_t w = 0;
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    math::SetConstant<DeviceContext, T> set_zero;
+    if (d_x != nullptr) {
+      d_x->mutable_data<T>(context.GetPlace());
+      set_zero(context.template device_context<DeviceContext>(), d_x,
+               static_cast<T>(0));
+      auto* y = context.Input<Tensor>("Y");
+      auto y_dims = y->dims();
+      h = y_dims[2];
+      w = y_dims[3];
+
+      math::MatDescriptor d_out_mat_desc;
+      d_out_mat_desc.height_ = x_channel;
+      d_out_mat_desc.width_ = y_channel;
+      d_out_mat_desc.batch_size_ = batch_size;
+      d_out_mat_desc.stride_ = x_channel * y_channel;
+
+      math::MatDescriptor y_mat_desc;
+      y_mat_desc.height_ = y_channel;
+      y_mat_desc.width_ = h * w;
+      y_mat_desc.batch_size_ = batch_size;
+      y_mat_desc.stride_ = y_channel * h * w;
+
+      blas.MatMul(*d_out, d_out_mat_desc, *y, y_mat_desc,
+                  static_cast<T>(1.0 / (h * w)), d_x, static_cast<T>(0.0));
+    }
+
+    if (d_y != nullptr) {
+      d_y->mutable_data<T>(context.GetPlace());
+      set_zero(context.template device_context<DeviceContext>(), d_y,
+               static_cast<T>(0));
+      auto* x = context.Input<Tensor>("X");
+      auto x_dims = x->dims();
+      h = x_dims[2];
+      w = x_dims[3];
+
+      math::MatDescriptor d_out_mat_desc;
+      d_out_mat_desc.height_ = y_channel;
+      d_out_mat_desc.width_ = x_channel;
+      d_out_mat_desc.batch_size_ = batch_size;
+      d_out_mat_desc.stride_ = x_channel * y_channel;
+      d_out_mat_desc.trans_ = true;
+
+      math::MatDescriptor x_mat_desc;
+      x_mat_desc.height_ = x_channel;
+      x_mat_desc.width_ = h * w;
+      x_mat_desc.batch_size_ = batch_size;
+      x_mat_desc.stride_ = x_channel * h * w;
+
+      blas.MatMul(*d_out, d_out_mat_desc, *x, x_mat_desc,
+                  static_cast<T>(1.0 / (h * w)), d_y, static_cast<T>(0.0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
index 80caf70b08..9cc94ab88d 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
     PADDLE_ENFORCE(ctx->HasInput("W"),
                    "Input W of FusedEmbeddingSeqPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Ids"),
@@ -91,6 +88,9 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
+        .SetDefault(true);
     AddComment(R"DOC(
 FusedEmbeddingSeqPool Operator.
 
@@ -138,22 +138,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 class FusedEmbeddingSeqPoolOpGradVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "fused_embedding_seq_pool_grad op "
               << framework::GradVarName("W") << " is set to SelectedRows";
-      block->Var(out_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "fused_embedding_seq_pool_grad op "
               << framework::GradVarName("W") << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index f13c020386..4651c2b2ba 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
                                   out_width, jit::SeqPoolType::kSum);
     for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
       attr.index_height = ids_lod[i + 1] - ids_lod[i];
-      auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
-                                  platform::CPUPlace>(attr);
+      auto emb_seqpool =
+          jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
+              .At(attr);
       emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
                   &attr);
     }
@@ -120,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
       auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      // runtime shape
+      d_table->set_height(table_dim[0]);
 
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
@@ -135,8 +138,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
       const T *d_output_data = d_output->data<T>();
 
-      auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
-                                 platform::CPUPlace>(out_width);
+      auto vbroadcast =
+          jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
+              .At(out_width);
       for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
         int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
         const T *src = d_output_data + i * out_width;
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 66acba49e5..ba5f0747c4 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel<T> {
   const int total_T = x_dims[0];           \
   const int D3 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                     \
-  auto* h0 = ctx.Input<Tensor>("H0");                                          \
-  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
-  auto* bias = ctx.Input<Tensor>("Bias");                                      \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
-  const int M = x_dims[1];                                                     \
-  const int D = wh_dims[0];                                                    \
-  const int D2 = D * 2;                                                        \
-  const jit::gru_attr_t attr(                                                  \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
-      jit::to_kerneltype(ctx.Attr<std::string>("activation")));                \
-  jit::gru_t one_step;                                                         \
-  auto ComputeH1 =                                                             \
-      jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr);      \
-  auto ComputeHtPart1 =                                                        \
-      jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
-  auto ComputeHtPart2 =                                                        \
-      jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
-  const T* x_data = x->data<T>();                                              \
-  const T* wx_data = wx->data<T>();                                            \
-  const T* wh_data = wh->data<T>();                                            \
-  auto place = ctx.GetPlace();                                                 \
+#define INIT_OTHER_DEFINES                                                   \
+  auto* h0 = ctx.Input<Tensor>("H0");                                        \
+  auto* wx = ctx.Input<Tensor>("WeightX");                                   \
+  auto* bias = ctx.Input<Tensor>("Bias");                                    \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                        \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                            \
+  const int M = x_dims[1];                                                   \
+  const int D = wh_dims[0];                                                  \
+  const int D2 = D * 2;                                                      \
+  const jit::gru_attr_t attr(                                                \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),       \
+      jit::to_kerneltype(ctx.Attr<std::string>("activation")));              \
+  jit::gru_t one_step;                                                       \
+  auto ComputeH1 =                                                           \
+      jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At(  \
+          attr);                                                             \
+  auto ComputeHtPart1 =                                                      \
+      jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
+  auto ComputeHtPart2 =                                                      \
+      jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
+  const T* x_data = x->data<T>();                                            \
+  const T* wx_data = wx->data<T>();                                          \
+  const T* wh_data = wh->data<T>();                                          \
+  auto place = ctx.GetPlace();                                               \
   T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index b11b7c11bf..c8c07bd126 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   const int D = wh_dims[0];                                 \
   const int D4 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                    \
-  const T* x_data = x->data<T>();                                             \
-  const T* wx_data = wx->data<T>();                                           \
-  const T* wh_data = wh->data<T>();                                           \
-  /* diagonal weight*/                                                        \
-  const T* wp_data = bias->data<T>() + D4;                                    \
-  /* for peephole only*/                                                      \
-  T* checked_cell_data = nullptr;                                             \
-  auto place = ctx.GetPlace();                                                \
-  if (use_peepholes) {                                                        \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                          \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                   \
-    checked_cell_data = checked_cell->mutable_data<T>(place);                 \
-  }                                                                           \
-  const jit::lstm_attr_t attr(                                                \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),        \
-      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),      \
-      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),           \
-      use_peepholes);                                                         \
-  jit::lstm_t one_step;                                                       \
-  one_step.wp = wp_data;                                                      \
-  one_step.checked = checked_cell_data;                                       \
-  auto ComputeC1H1 =                                                          \
-      jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
-  auto ComputeCtHt =                                                          \
-      jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
+#define INIT_OTHER_DEFINES                                                     \
+  const T* x_data = x->data<T>();                                              \
+  const T* wx_data = wx->data<T>();                                            \
+  const T* wh_data = wh->data<T>();                                            \
+  /* diagonal weight*/                                                         \
+  const T* wp_data = bias->data<T>() + D4;                                     \
+  /* for peephole only*/                                                       \
+  T* checked_cell_data = nullptr;                                              \
+  auto place = ctx.GetPlace();                                                 \
+  if (use_peepholes) {                                                         \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                           \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                    \
+    checked_cell_data = checked_cell->mutable_data<T>(place);                  \
+  }                                                                            \
+  const jit::lstm_attr_t attr(                                                 \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
+      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),       \
+      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),            \
+      use_peepholes);                                                          \
+  jit::lstm_t one_step;                                                        \
+  one_step.wp = wp_data;                                                       \
+  one_step.checked = checked_cell_data;                                        \
+  auto ComputeC1H1 =                                                           \
+      jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
+          attr);                                                               \
+  auto ComputeCtHt =                                                           \
+      jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
+          attr)
 
 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index 8ecdf2ed9d..6be35de65f 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -82,9 +82,11 @@ template <typename T>
 static void fc_relu(const T* x, const T* w, const T* b, T* y,
                     const jit::matmul_attr_t& attr) {
   auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+      jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+          attr);
   auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+      jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
+          attr.n);
   matmul(x, w, y, &attr);
   T* dst = y;
   for (int i = 0; i < attr.m; ++i) {
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
index d48bdafe0a..25916768c0 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
       attr.type = jit::SeqPoolType::kSqrt;
     }
     auto seqpool =
-        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
+        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
             attr);
     size_t n = ins.size();
     size_t dst_step_size = n * w;
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 8493f4468f..53679ebdde 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     int o_numel = attr.m * attr.n;
 
     auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
-                                                                       attr.k);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            attr.m * attr.k);
     auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
-                                                                       attr.n);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            attr.k * attr.n);
     auto vsquare_xy =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
     auto vsub =
-        jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
     auto vscal =
-        jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
     auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+        jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+            attr);
 
     const T* x_data = x->data<T>();
     const T* y_data = y->data<T>();
diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
index a4ae19d9c1..c0893359af 100644
--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
@@ -81,15 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
 class GetTensorFromSelectedRowsOpVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const final {
-    auto out_var_name = op_desc.Output("Out").front();
-    auto in_var_name = op_desc.Input("X").front();
-
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
-    out_var.SetType(framework::proto::VarType::LOD_TENSOR);
-    out_var.SetDataType(in_var.GetDataType());
+  void operator()(framework::InferVarTypeContext *ctx) const {  // NOLINT
+    auto out_var_name = ctx->Output("Out").front();
+    auto in_var_name = ctx->Input("X").front();
+
+    ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
   }
 };
 
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index 7a29f80ff1..82222d0a7e 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of HashOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -57,6 +54,9 @@ $$Out = scale * X$$
 )DOC");
     AddAttr<int>("num_hash", "").SetDefault(1);
     AddAttr<int>("mod_by", "").SetDefault(100000);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
+        .SetDefault(true);
   }
 };
 
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 6ca6f0bc04..d0e1057c43 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 class HierarchicalSigmoidGradOpGradVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto bias_grad_var_name_vec =
-        op_desc.Output(framework::GradVarName("Bias"));
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias"));
     std::string bias_grad_var_name;
     bool hasBias = false;
     if (bias_grad_var_name_vec.size()) {
       hasBias = true;
-      bias_grad_var_name =
-          op_desc.Output(framework::GradVarName("Bias")).front();
+      bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front();
     }
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
                << " is set to SelectedRows";
-      block->Var(w_grad_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
                << " is set to LoDTensor";
-      block->Var(w_grad_var_name)
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
     if (hasBias) {
       VLOG(30) << "hierarchical_sigmoid_grad op "
                << framework::GradVarName("Bias") << " is set to LoDTensor";
-      block->Var(bias_grad_var_name)
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt
index 35775d7ec9..47d6c83f2a 100644
--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
 
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
 
 file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
 list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 3088280bb9..fbb04a166e 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
       InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
   void BenchJITKernel_##name##_##dtype##_##place##_::Run()
 
-#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
-
 void RUN_ALL_BENCHMARK() {
   for (auto p : g_all_benchmarks) {
     if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
   return s;
 }
 
-template <typename KernelTuples, typename... Args>
+template <typename KernelTuple, typename... Args>
 struct BenchFunc {
   // return this function avg time
   // TODO(TJ): clear cache every time
-  double operator()(const typename KernelTuples::func_type tgt, Args... args) {
+  double operator()(const typename KernelTuple::func_type tgt, Args... args) {
     for (int i = 0; i < FLAGS_burning; ++i) {
       tgt(args...);
     }
@@ -109,40 +107,17 @@ struct BenchFunc {
 
 namespace jit = paddle::operators::jit;
 
-template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
-          typename... Args>
-void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
-  BenchFunc<KernelTuples, Args...> benchmark;
+template <typename KernelTuple, typename PlaceType, typename... Args>
+void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
+  BenchFunc<KernelTuple, Args...> benchmark;
   std::vector<std::pair<std::string, double>> infos;
-  // test refer
-  auto refer = jit::GetRefer<KT, KernelTuples>();
-  if (!refer) {
-    LOG(FATAL) << "Refer can not be empty!";
+  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  for (auto f : funcs) {
+    infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
   }
-  infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
 
-  // test jitcode
-  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitcode) {
-    infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
-  }
-  // test all impls in more
-  jit::KernelKey kkey(KT, PlaceType());
-  auto& pool = jit::KernelPool().Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        auto more = i->GetFunc();
-        infos.push_back(
-            std::make_pair(i->ImplType(), benchmark(more, args...)));
-      }
-    }
-  }
   // Test result from Get function
-  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
+  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
   if (!tgt) {
     LOG(FATAL) << "Target can not be empty!";
   }
@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 
   // print
   std::ostringstream loginfos;
-  loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": ";
+  loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
+           << attr << ": ";
   for (auto pair : infos) {
     loginfos << pair.first << " takes " << pair.second << " us; ";
   }
@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 
 using Tensor = paddle::framework::Tensor;
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchXYZNKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelXYZN() {
+  using T = typename KernelTuple::data_type;
   for (int d : TestSizes()) {
     Tensor x, y, z;
     x.Resize({d});
@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
     T* z_data = z.mutable_data<T>(PlaceType());
     RandomVec<T>(d, x_data);
     RandomVec<T>(d, y_data);
-    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
-                                                     y.data<T>(), z_data, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y.data<T>(), z_data,
+                                          d);
     // test inplace
-    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
-                                                     z_data, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchAXYNKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelAXYN() {
+  using T = typename KernelTuple::data_type;
   for (int d : TestSizes()) {
     const T a = static_cast<T>(3);
     Tensor x, y;
@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
     T* x_data = x.mutable_data<T>(PlaceType());
     T* y_data = y.mutable_data<T>(PlaceType());
     RandomVec<T>(d, x_data);
-    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
-                                                     d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
     // test inplace
-    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
-                                                     d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchXRNKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelXRN() {
+  using T = typename KernelTuple::data_type;
   for (int d : TestSizes()) {
     Tensor x;
     RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
     T res;
-    BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), &res, d);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchXYNKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelXYN() {
+  using T = typename KernelTuple::data_type;
   for (int d : TestSizes()) {
     Tensor x, y;
     x.Resize({d});
@@ -216,12 +193,13 @@ void BenchXYNKernel() {
     T* x_data = x.mutable_data<T>(PlaceType());
     T* y_data = y.mutable_data<T>(PlaceType());
     RandomVec<T>(d, x_data);
-    BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchLSTMKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelLSTM() {
+  using T = typename KernelTuple::data_type;
   for (bool use_peephole : {true, false}) {
     for (int d : TestSizes()) {
       const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
         step.wp = wp_data;
         step.checked = checked_data;
       }
-      BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr);
+      BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchGRUKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelGRU() {
+  using T = typename KernelTuple::data_type;
   for (int d : TestSizes()) {
     const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
     auto place = PlaceType();
@@ -275,12 +254,13 @@ void BenchGRUKernel() {
     step.gates = x_data;
     step.ht_1 = ht_1_data;
     step.ht = ht_data;
-    BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr);
+    BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchSeqPoolKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelSeqPool() {
+  using T = typename KernelTuple::data_type;
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
   for (auto type : pool_types) {
@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
         RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
         const T* x_data = x.data<T>();
         T* y_data = y.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
-                                                            y_data, &attr);
+        BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchEmbSeqPoolKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelEmbSeqPool() {
+  using T = typename KernelTuple::data_type;
   std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
   int64_t tbl_h = 1e4;
   for (int tbl_w : {10, 16, 256}) {
@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
                              tbl_h - 1);
           const int64_t* idx_data = idx.data<int64_t>();
           T* o_data = out.mutable_data<T>(PlaceType());
-          BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
-              attr, table_data, idx_data, o_data, &attr);
+          BenchAllImpls<KernelTuple, PlaceType>(attr, table_data, idx_data,
+                                                o_data, &attr);
         }
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchSgdKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelSgd() {
+  using T = typename KernelTuple::data_type;
   const T lr = 0.1;
   auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                   const int64_t upper) -> std::vector<int64_t> {
@@ -364,15 +345,16 @@ void BenchSgdKernel() {
         const T* grad_data = grad.data<T>();
         const int64_t* rows_data = rows.data();
         jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
-        BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>(
-            attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
+        BenchAllImpls<KernelTuple, PlaceType>(attr, &lr, param_data, grad_data,
+                                              rows_data, param_data, &attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchMatMulKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelMatMul() {
+  using T = typename KernelTuple::data_type;
   for (int m : {1, 2, 3, 4}) {
     for (int n : TestSizes()) {
       for (int k : TestSizes()) {
@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
         const T* b_data = b.data<T>();
         T* c_data = c.mutable_data<T>(PlaceType());
         const jit::matmul_attr_t attr{m, n, k};
-        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
-                                                           c_data, &attr);
+        BenchAllImpls<KernelTuple, PlaceType>(attr, a_data, b_data, c_data,
+                                              &attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchSoftmaxKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelSoftmax() {
+  using T = typename KernelTuple::data_type;
   for (int bs : {1, 2, 10}) {
     for (int n : TestSizes()) {
       Tensor x, y;
@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
       RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
       const T* x_data = x.data<T>();
       T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
-                                                          bs);
+      BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs);
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchLayerNormKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelLayerNorm() {
+  using T = typename KernelTuple::data_type;
   const T epsilon = 9.99999975e-06;
   for (int n : {1, 2, 10}) {
     for (int x_dim_0 : {1, 9, 17, 50}) {
@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
         T* var_data = var.data<T>();
         T* out_data = out.mutable_data<T>(PlaceType());
 
-        BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
-            right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
-            left, epsilon, right);
+        BenchAllImpls<KernelTuple, PlaceType>(right, x_data, out_data,
+                                              mean_data, var_data, scale_data,
+                                              bias_data, left, epsilon, right);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchCRFDecodingKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelCRFDecoding() {
+  using T = typename KernelTuple::data_type;
   constexpr int state_trans_base_idx = 2;
   for (int seq_len : {1, 11, 17, 50}) {
     for (int tag_num : TestSizes()) {
@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
       T* alpha_data = alpha.mutable_data<T>(PlaceType());
       int* track_data = track.mutable_data<int>(PlaceType());
 
-      BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
-          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
+      BenchAllImpls<KernelTuple, PlaceType>(tag_num, seq_len, x_data, w_data,
+                                            alpha_data, track_data, tag_num);
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void BenchVBroadcastKernel() {
+template <typename KernelTuple, typename PlaceType>
+void BenchKernelVBroadcast() {
+  using T = typename KernelTuple::data_type;
   for (int64_t w : {1, 16, 64, 100, 256}) {
     Tensor x;
     x.Resize({w});
@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
       Tensor y;
       y.Resize({h * w});
       T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
-          w, x_data, y_data, static_cast<int64_t>(h), w);
+      BenchAllImpls<KernelTuple, PlaceType>(w, x_data, y_data,
+                                            static_cast<int64_t>(h), w);
     }
   }
 }
 
-using T = float;
-using CPUPlace = paddle::platform::CPUPlace;
+#define BenchKernelVMul BenchKernelXYZN
+#define BenchKernelVAdd BenchKernelXYZN
+#define BenchKernelVAddRelu BenchKernelXYZN
+#define BenchKernelVSub BenchKernelXYZN
 
-// xyzn
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
+#define BenchKernelVScal BenchKernelAXYN
+#define BenchKernelVAddBias BenchKernelAXYN
 
-// axyn
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
+#define BenchKernelVRelu BenchKernelXYN
+#define BenchKernelVIdentity BenchKernelXYN
+#define BenchKernelVSquare BenchKernelXYN
+#define BenchKernelVExp BenchKernelXYN
+#define BenchKernelVSigmoid BenchKernelXYN
+#define BenchKernelVTanh BenchKernelXYN
+#define BenchKernelVCopy BenchKernelXYN
 
-// xrn
-BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
-BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
+#define BenchKernelHMax BenchKernelXRN
+#define BenchKernelHSum BenchKernelXRN
 
-// xyn
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
-
-// lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
-
-// gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
-
-// seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
-
-// embedding seq pool function
-BENCH_FP32_CPU(kEmbSeqPool) {
-  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
-}
+#define BenchKernelLSTMCtHt BenchKernelLSTM
+#define BenchKernelLSTMC1H1 BenchKernelLSTM
 
-// sgd function
-BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
+#define BenchKernelGRUH1 BenchKernelGRU
+#define BenchKernelGRUHtPart1 BenchKernelGRU
+#define BenchKernelGRUHtPart2 BenchKernelGRU
 
-// matmul
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
+using CPUPlace = paddle::platform::CPUPlace;
 
-// softmax
-BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
+#define BENCH_FP32_CPU(name)                                \
+  BENCH_JITKERNEL(name, FP32, CPU) {                        \
+    BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
+  }
 
-// layernorm
-BENCH_FP32_CPU(kLayerNorm) {
-  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
-}
+// xyzn
+BENCH_FP32_CPU(VMul);
+BENCH_FP32_CPU(VAdd);
+BENCH_FP32_CPU(VAddRelu);
+BENCH_FP32_CPU(VSub);
 
-// crfdecoding
-BENCH_FP32_CPU(kCRFDecoding) {
-  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
-}
+// axyn
+BENCH_FP32_CPU(VScal);
+BENCH_FP32_CPU(VAddBias);
 
-// vbroadcast function
-BENCH_FP32_CPU(kVBroadcast) {
-  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
-}
+// xyn
+BENCH_FP32_CPU(VRelu);
+BENCH_FP32_CPU(VIdentity);
+BENCH_FP32_CPU(VSquare);
+BENCH_FP32_CPU(VExp);
+BENCH_FP32_CPU(VSigmoid);
+BENCH_FP32_CPU(VTanh);
+BENCH_FP32_CPU(VCopy);
+
+// xrn
+BENCH_FP32_CPU(HMax);
+BENCH_FP32_CPU(HSum);
+
+// LSTM
+BENCH_FP32_CPU(LSTMCtHt);
+BENCH_FP32_CPU(LSTMC1H1);
+
+// GRU
+BENCH_FP32_CPU(GRUH1);
+BENCH_FP32_CPU(GRUHtPart1);
+BENCH_FP32_CPU(GRUHtPart2);
+
+BENCH_FP32_CPU(LayerNorm);
+BENCH_FP32_CPU(CRFDecoding);
+
+BENCH_FP32_CPU(SeqPool);
+BENCH_FP32_CPU(EmbSeqPool);
+BENCH_FP32_CPU(MatMul);
+BENCH_FP32_CPU(Softmax);
+BENCH_FP32_CPU(Sgd);
+BENCH_FP32_CPU(VBroadcast);
 
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
index e7a7375879..ad68e792c7 100644
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/act.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
 #define DECLARE_ACT_CREATOR(name)                                            \
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
-    bool UseMe(const int& attr) const override;                              \
+    bool CanBeUsed(const int& attr) const override;                          \
     size_t CodeSize(const int& d) const override;                            \
     std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
       return make_unique<name##JitCode>(attr, CodeSize(attr));               \
@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
 DECLARE_ACT_CREATOR(VTanh);
 
 // TODO(TJ): tuning use me
-bool VReluCreator::UseMe(const int& d) const {
+bool VReluCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx);
 }
 
-bool VSquareCreator::UseMe(const int& d) const {
+bool VSquareCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx);
 }
 
-bool VIdentityCreator::UseMe(const int& d) const {
+bool VIdentityCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx);
 }
 
-bool VExpCreator::UseMe(const int& d) const {
+bool VExpCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx) && d < 32;
 }
 
-bool VSigmoidCreator::UseMe(const int& d) const {
+bool VSigmoidCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx);
 }
 
-bool VTanhCreator::UseMe(const int& d) const {
+bool VTanhCreator::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx);
 }
 
diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc
index 5da24c359e..c126b9077a 100644
--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/blas.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
 
 class NCHW16CMulNCCreator : public JitCodeCreator<int> {
  public:
-  bool UseMe(const int& attr) const override {
+  bool CanBeUsed(const int& attr) const override {
     return platform::MayIUse(platform::avx512f);
   }
   size_t CodeSize(const int& d) const override { return 256 * 1024; }
@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
 #define DECLARE_BLAS_CREATOR(name)                                           \
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
       return platform::MayIUse(platform::avx) && attr <= 1024;               \
     }                                                                        \
     size_t CodeSize(const int& d) const override {                           \
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
index 23837a3fb9..331a4b0d07 100644
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/jit/gen/embseqpool.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
 
 class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
  public:
-  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
     return platform::MayIUse(platform::avx) &&
            attr.table_width % YMM_FLOAT_BLOCK == 0;
   }
diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc
index 13f7a14111..b5b0cffa80 100644
--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ b/paddle/fluid/operators/jit/gen/gru.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/jit/gen/gru.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
   class name##Creator : public JitCodeCreator<gru_attr_t> {       \
    public:                                                        \
     /* TODO(TJ): enable more */                                   \
-    bool UseMe(const gru_attr_t& attr) const override {           \
+    bool CanBeUsed(const gru_attr_t& attr) const override {       \
       return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
     }                                                             \
     size_t CodeSize(const gru_attr_t& attr) const override {      \
diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc
index e788401719..462ac68a93 100644
--- a/paddle/fluid/operators/jit/gen/hopv.cc
+++ b/paddle/fluid/operators/jit/gen/hopv.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/hopv.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
 #define DECLARE_HOP_CREATOR(name)                                            \
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
       return platform::MayIUse(platform::avx);                               \
     }                                                                        \
     size_t CodeSize(const int& d) const override {                           \
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 39847d1b65..228db7cc72 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
   virtual void genCode() = 0;
 
   size_t getSize() const override { return CodeGenerator::getSize(); }
-  const unsigned char* getCodeInternal() override {
+  const unsigned char* getCodeInternal() const override {
     const Xbyak::uint8* code = CodeGenerator::getCode();
     return code;
   }
diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc
index 08bafb5a81..2c3bc985e9 100644
--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/jit/gen/lstm.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
   class name##Creator : public JitCodeCreator<lstm_attr_t> {      \
    public:                                                        \
     /* TODO(TJ): enable more */                                   \
-    bool UseMe(const lstm_attr_t& attr) const override {          \
+    bool CanBeUsed(const lstm_attr_t& attr) const override {      \
       return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
     }                                                             \
     size_t CodeSize(const lstm_attr_t& attr) const override {     \
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
index ae3858eab2..d9955c8cc6 100644
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -14,8 +14,8 @@
 
 #include "paddle/fluid/operators/jit/gen/matmul.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
-
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
 
 class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
  public:
-  bool UseMe(const matmul_attr_t& attr) const override {
+  bool CanBeUsed(const matmul_attr_t& attr) const override {
     return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
            attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
   }
diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc
index 530d24ee1f..d9e5904add 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/seqpool.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
 
 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
  public:
-  bool UseMe(const seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const seq_pool_attr_t& attr) const override {
     return platform::MayIUse(platform::avx);
   }
   size_t CodeSize(const seq_pool_attr_t& attr) const override {
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
index a745a27f95..e65d3500b4 100644
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/jit/gen/sgd.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
 
 class SgdCreator : public JitCodeCreator<sgd_attr_t> {
  public:
-  bool UseMe(const sgd_attr_t& attr) const override {
+  bool CanBeUsed(const sgd_attr_t& attr) const override {
     return platform::MayIUse(platform::avx) &&
            attr.grad_width % YMM_FLOAT_BLOCK == 0;
   }
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc
index 3f9fbdbd82..66a8d75fd4 100644
--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
 
 class VBroadcastCreator : public JitCodeCreator<int64_t> {
  public:
-  bool UseMe(const int64_t& w) const override {
+  bool CanBeUsed(const int64_t& w) const override {
     return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const int64_t& w) const override {
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index f3603875ad..4c49eff49e 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -31,7 +31,7 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
-// refer do not need useme, it would be the last one.
+// refer do not need CanBeUsed, it would be the last one.
 void GenBase::dumpCode(const unsigned char* code) const {
   if (code) {
     static int counter = 0;
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index a7c7a35a7e..033c603c07 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -31,9 +31,10 @@ class GenBase : public Kernel {
   virtual ~GenBase() = default;
   virtual std::string name() const = 0;
   virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() = 0;
+  virtual const unsigned char* getCodeInternal() const = 0;
+  const char* ImplType() const override { return "JitCode"; }
   template <typename Func>
-  Func getCode() {
+  Func getCode() const {
     const unsigned char* code = this->getCodeInternal();
     if (FLAGS_dump_jitcode) {
       this->dumpCode(code);
@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
   virtual ~JitCodeCreator() = default;
 
   // condition when this jit code can be used.
-  virtual bool UseMe(const Attr& attr) const = 0;
+  virtual bool CanBeUsed(const Attr& attr) const = 0;
 
   // estimate this code size
   virtual size_t CodeSize(const Attr& attr) const = 0;
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index d85c719c1c..1ac5318d46 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -16,6 +16,8 @@
 
 #include <iostream>
 #include <string>
+#include <unordered_map>
+#include <utility>  // for std::move
 #include <vector>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
@@ -27,35 +29,34 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    std::is_same<typename KernelTuples::data_type, float>::value &&
+    std::is_same<typename KernelTuple::data_type, float>::value &&
         std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
-  using Func = typename KernelTuples::func_type;
-  using Attr = typename KernelTuples::attr_type;
-  size_t key = JitCodeKey<Attr>(attr);
-  auto& codes = JitCodePool<KT>().Instance();
+    const Kernel*>::type
+GetJitCode(const typename KernelTuple::attr_type& attr) {
+  using Attr = typename KernelTuple::attr_type;
+  int64_t key = JitCodeKey<Attr>(attr);
+  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
   if (codes.Has(key)) {
-    return codes.AllKernels().at(key)->template getCode<Func>();
+    return codes.AllKernels().at(key).get();
   }
 
   // creator is not related with attr, so can use KernelKey as key
-  KernelKey kkey(KT, PlaceType());
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
   // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
+  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
   auto iter = creator_map.find(kkey);
   if (iter != creator_map.end()) {
     auto& creators = iter->second;
     for (auto& cur : creators) {
       auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->UseMe(attr)) {
+      if (i && i->CanBeUsed(attr)) {
         auto p = i->CreateJitCode(attr);
         if (p) {
-          auto f = p->template getCode<Func>();
+          auto res = p.get();
           codes.Insert(key, std::move(p));
-          return f;
+          return res;
         }
       }
     }
@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
   return nullptr;
 }
 
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    !std::is_same<typename KernelTuples::data_type, float>::value ||
+    !std::is_same<typename KernelTuple::data_type, float>::value ||
         !std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
+    const Kernel*>::type
+GetJitCode(const typename KernelTuple::attr_type& attr) {
   return nullptr;
 }
 
 // Refer code do not related with attr, which is just for cast
 // Refer is always on CPUPlace
-template <KernelType KT, typename KernelTuples>
-inline typename KernelTuples::func_type GetRefer() {
-  auto& ref_pool = ReferKernelPool().Instance().AllKernels();
-  KernelKey kkey(KT, platform::CPUPlace());
+template <typename KernelTuple>
+inline const Kernel* GetReferKernel() {
+  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
+  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
   auto ref_iter = ref_pool.find(kkey);
   PADDLE_ENFORCE(ref_iter != ref_pool.end(),
                  "Every Kernel should have reference function.");
   auto& ref_impls = ref_iter->second;
   for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
+    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
     if (i) {
-      return i->GetFunc();
+      return i;
     }
   }
   return nullptr;
 }
 
-template <KernelType KT, typename KernelTuples,
-          typename PlaceType = platform::CPUPlace>
-typename KernelTuples::func_type Get(
-    const typename KernelTuples::attr_type& attr) {
-  auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitfunc) {
-    return jitfunc;
+template <typename KernelTuple>
+inline typename KernelTuple::func_type GetReferFunc() {
+  auto ker = GetReferKernel<KernelTuple>();
+  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
+  PADDLE_ENFORCE(p, "The Refer kernel should exsit");
+  return p->GetFunc();
+}
+
+// Return all Kernels that can be used
+template <typename KernelTuple, typename PlaceType>
+std::vector<const Kernel*> GetAllCandidateKernels(
+    const typename KernelTuple::attr_type& attr) {
+  // the search order shoudl be jitcode > more > refer
+  std::vector<const Kernel*> res;
+  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
+  if (jitker) {
+    res.emplace_back(jitker);
   }
 
-  // pool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KT, PlaceType());
-  auto& pool = KernelPool().Instance().AllKernels();
+  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
+  auto& pool = KernelPool::Instance().AllKernels();
   auto iter = pool.find(kkey);
   if (iter != pool.end()) {
     auto& impls = iter->second;
     for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        return i->GetFunc();
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
+      if (i && i->CanBeUsed(attr)) {
+        res.emplace_back(i);
       }
     }
   }
 
   // The last implementation should be reference function on CPUPlace.
-  return GetRefer<KT, KernelTuples>();
+  auto ref = GetReferKernel<KernelTuple>();
+  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
+  res.emplace_back(ref);
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<std::pair<std::string, typename KernelTuple::func_type>>
+GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
+  using Func = typename KernelTuple::func_type;
+  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
+  std::vector<std::pair<std::string, Func>> res;
+  for (auto k : kers) {
+    std::string name = k->ImplType();
+    if (name == "JitCode") {
+      auto i = dynamic_cast<const GenBase*>(k);
+      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
+    } else {
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
+      PADDLE_ENFORCE(i, "kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->GetFunc()));
+    }
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  std::vector<typename KernelTuple::func_type> res;
+  for (auto& i : funcs) {
+    res.emplace_back(i.second);
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+typename KernelTuple::func_type GetDefaultBestFunc(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
+  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
+  // Here could do some runtime benchmark of this attr and return the best one.
+  // But yet just get the first one as the default best one,
+  // which is searched in order and tuned by offline.
+  return funcs[0];
 }
 
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 class KernelFuncs {
  public:
   KernelFuncs() = default;
   static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
+    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
     return g_func_cache;
   }
 
-  bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
-
-  void Insert(int key, typename KernelTuples::func_type func) {
-    funcs_.emplace(key, func);
-  }
-
-  typename KernelTuples::func_type At(int key) {
+  // the exposed interface to use
+  typename KernelTuple::func_type At(
+      const typename KernelTuple::attr_type& attr) {
+    // Maybe here is not good enough, not all kernels should have jitcode
+    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
     if (Has(key)) {
       return funcs_.at(key);
     }
-    auto func = Get<KT, KernelTuples, PlaceType>(key);
+    // If do not have this attr in cache then get the default best
+    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
     Insert(key, func);
     return func;
   }
 
+  typename KernelTuple::func_type operator[](
+      const typename KernelTuple::attr_type& attr) {
+    return At(attr);
+  }
+
+ protected:
+  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
+  void Insert(int64_t key, typename KernelTuple::func_type func) {
+    funcs_.emplace(key, func);
+  }
+
  private:
-  std::unordered_map<int, typename KernelTuples::func_type> funcs_;
+  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
   DISABLE_COPY_AND_ASSIGN(KernelFuncs);
 };
 
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 96e162a21b..bd34d7dfc7 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -62,26 +62,55 @@ typedef enum {
   kSqrt,
 } SeqPoolType;
 
+// x, y, z, n
 template <typename T>
-struct XYZNTuples {
+struct XYZNTuple {
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(const T*, const T*, T*, int);
 };
 
+// a, x, y, n
 template <typename T>
-struct AXYNTuples : public XYZNTuples<T> {};
+struct AXYNTuple : public XYZNTuple<T> {};
 
+// x, y, n
 template <typename T>
-struct XYNTuples {
+struct XYNTuple {
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(const T*, T*, int);
 };
 
-// x, return and int
+// x, returned value, n
 template <typename T>
-struct XRNTuples : public XYNTuples<T> {};
+struct XRNTuple : public XYNTuple<T> {};
+
+#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
+  template <typename T>                                \
+  struct type##Tuple : public kernel_tuple<T> {        \
+    static constexpr KernelType kernel_type = k##type; \
+  }
+
+// Tuple should be corresponding to the KernelType
+DECLARE_KERNELTUPLE(XYZNTuple, VMul);
+DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
+DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
+DECLARE_KERNELTUPLE(XYZNTuple, VSub);
+
+DECLARE_KERNELTUPLE(AXYNTuple, VScal);
+DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
+
+DECLARE_KERNELTUPLE(XYNTuple, VRelu);
+DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
+DECLARE_KERNELTUPLE(XYNTuple, VSquare);
+DECLARE_KERNELTUPLE(XYNTuple, VExp);
+DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
+DECLARE_KERNELTUPLE(XYNTuple, VTanh);
+DECLARE_KERNELTUPLE(XYNTuple, VCopy);
+
+DECLARE_KERNELTUPLE(XRNTuple, HMax);
+DECLARE_KERNELTUPLE(XRNTuple, HSum);
 
 typedef struct {
   void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
 typedef struct lstm_attr_s lstm_attr_t;
 
 template <typename T>
-struct LSTMTuples {
+struct LSTMTuple {
   typedef T data_type;
   typedef lstm_attr_t attr_type;
   typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
 };
 
 template <typename T>
-struct GRUTuples {
+struct GRUTuple {
   typedef T data_type;
   typedef gru_attr_t attr_type;
   typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };
 
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);
+
+DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);
+
+#undef DECLARE_KERNELTUPLE
+
 template <typename T>
-struct VBroadcastTuples {
+struct VBroadcastTuple {
+  static constexpr KernelType kernel_type = kVBroadcast;
   typedef T data_type;
   typedef int64_t attr_type;
   typedef void (*func_type)(const T*, T*, int64_t, int64_t);
@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
 } seq_pool_attr_t;
 
 template <typename T>
-struct SeqPoolTuples {
+struct SeqPoolTuple {
+  static constexpr KernelType kernel_type = kSeqPool;
   typedef T data_type;
   typedef seq_pool_attr_t attr_type;
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
 } emb_seq_pool_attr_t;
 
 template <typename T>
-struct EmbSeqPoolTuples {
+struct EmbSeqPoolTuple {
+  static constexpr KernelType kernel_type = kEmbSeqPool;
   typedef T data_type;
   typedef emb_seq_pool_attr_t attr_type;
   typedef void (*func_type)(const T*, const int64_t*, T*,
@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
 } sgd_attr_t;
 
 template <typename T>
-struct SgdTuples {
+struct SgdTuple {
+  static constexpr KernelType kernel_type = kSgd;
   typedef T data_type;
   typedef sgd_attr_t attr_type;
   typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
 } matmul_attr_t;
 
 template <typename T>
-struct MatMulTuples {
+struct MatMulTuple {
+  static constexpr KernelType kernel_type = kMatMul;
   typedef T data_type;
   typedef matmul_attr_t attr_type;
   typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };
 
 template <typename T>
-struct CRFDecodingTuples {
+struct CRFDecodingTuple {
+  static constexpr KernelType kernel_type = kCRFDecoding;
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
 };
 
 template <typename T>
-struct LayerNormTuples {
+struct LayerNormTuple {
+  static constexpr KernelType kernel_type = kLayerNorm;
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
@@ -236,7 +281,8 @@ struct LayerNormTuples {
 };
 
 template <typename T>
-struct SoftmaxTuples {
+struct SoftmaxTuple {
+  static constexpr KernelType kernel_type = kSoftmax;
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(const T*, T*, int, int);
@@ -244,7 +290,8 @@ struct SoftmaxTuples {
 
 // nChw16c = nChw16c .* NC
 template <typename T>
-struct NCHW16CMulNCTuples {
+struct NCHW16CMulNCTuple {
+  static constexpr KernelType kernel_type = kNCHW16CMulNC;
   typedef T data_type;
   typedef int attr_type;
   typedef void (*func_type)(const T*, const T*, T*, int, int);
@@ -255,28 +302,29 @@ class Kernel {
  public:
   Kernel() = default;
   virtual ~Kernel() = default;
+  virtual const char* ImplType() const = 0;
   DISABLE_COPY_AND_ASSIGN(Kernel);
 };
 
-template <typename KernelTuples>
+template <typename KernelTuple>
 class KernelMore : public Kernel {
  public:
-  using T = typename KernelTuples::data_type;
-  using Func = typename KernelTuples::func_type;
-  using Attr = typename KernelTuples::attr_type;
+  using T = typename KernelTuple::data_type;
+  using Func = typename KernelTuple::func_type;
+  using Attr = typename KernelTuple::attr_type;
   virtual Func GetFunc() const { return func; }
-  virtual bool UseMe(const Attr& attr) const = 0;
-  virtual const char* ImplType() const = 0;
+  // specify this kernel can be used, means it should not fail if use it.
+  virtual bool CanBeUsed(const Attr& attr) const = 0;
 
  protected:
   Func func{nullptr};
 };
 
-template <typename KernelTuples>
-class ReferKernel : public KernelMore<KernelTuples> {
+template <typename KernelTuple>
+class ReferKernel : public KernelMore<KernelTuple> {
  public:
   // Refer code can always be used
-  bool UseMe(const typename KernelTuples::attr_type& attr) const override {
+  bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
     return true;
   }
   const char* ImplType() const override { return "Refer"; }
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 1c2fddcae7..1ad220b397 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/kernel_key.h"
+#include <xxhash.h>  // XXH64: 13.8 GB/s
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -20,71 +21,46 @@ namespace operators {
 namespace jit {
 
 template <>
-size_t JitCodeKey<int>(const int& d) {
+int64_t JitCodeKey<int>(const int& d) {
   return d;
 }
 
 template <>
-size_t JitCodeKey<int64_t>(const int64_t& d) {
+int64_t JitCodeKey<int64_t>(const int64_t& d) {
   return d;
 }
 
-// TODO(TJ): refine and benchmark JitCodeKey generatation
-constexpr int act_type_shift = 3;  // suppot 2^3 act types
-static inline int act_type_convert(KernelType type) {
-  if (type == kVIdentity) {
-    return 0;
-  } else if (type == kVExp) {
-    return 1;
-  } else if (type == kVRelu) {
-    return 2;
-  } else if (type == kVSigmoid) {
-    return 3;
-  } else if (type == kVTanh) {
-    return 4;
-  }
-  PADDLE_THROW("Unsupported act type %d", type);
-  return 0;
-}
-
 template <>
-size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
-  size_t key = attr.d;
-  int gate_key = act_type_convert(attr.act_gate) << 1;
-  int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift);
-  int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2);
-  return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
-         attr.use_peephole;
+int64_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
+  return XXH64(&attr, sizeof(gru_attr_t), 0);
 }
 
 template <>
-size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
-  size_t key = attr.d;
-  return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) +
-         (act_type_convert(attr.act_cand) << act_type_shift);
+int64_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
+  int keys[5] = {
+      attr.d, static_cast<int>(attr.act_gate), static_cast<int>(attr.act_cand),
+      static_cast<int>(attr.act_cell), static_cast<int>(attr.use_peephole)};
+  return XXH64(keys, sizeof(int) * 5, 0);
 }
 
 template <>
-size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
-  size_t key = attr.w;
-  constexpr int pool_type_shift = 3;
-  return (key << pool_type_shift) + static_cast<int>(attr.type);
+int64_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
+  int keys[2] = {attr.w, static_cast<int>(attr.type)};
+  return XXH64(keys, sizeof(int) * 2, 0);
 }
 
 template <>
-size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
-  size_t key = attr.m;
-  constexpr int shift = 21;
-  return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
+int64_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
+  return XXH64(&attr, sizeof(int) * 3, 0);  // m, n, k
 }
 
 template <>
-size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
+int64_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
   return attr.table_width;
 }
 
 template <>
-size_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
+int64_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
   return attr.grad_width;
 }
 
diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h
index 611a0210d6..b2cf92f23e 100644
--- a/paddle/fluid/operators/jit/kernel_key.h
+++ b/paddle/fluid/operators/jit/kernel_key.h
@@ -46,7 +46,7 @@ struct KernelKey {
 
 // Every JitCode should have a method to get the key from attribution
 template <typename Attr>
-size_t JitCodeKey(const Attr& attr);
+int64_t JitCodeKey(const Attr& attr);
 
 }  // namespace jit
 }  // namespace operators
diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h
index 3e15242af2..04710a54ac 100644
--- a/paddle/fluid/operators/jit/kernel_pool.h
+++ b/paddle/fluid/operators/jit/kernel_pool.h
@@ -17,6 +17,7 @@
 #include <memory>  // for unique_ptr
 #include <string>
 #include <unordered_map>
+#include <utility>  // for move
 #include <vector>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
@@ -30,7 +31,7 @@ namespace jit {
 template <KernelType KT>
 class JitCodePool {
   typedef std::unique_ptr<GenBase> GenBasePtr;
-  typedef std::unordered_map<size_t, GenBasePtr> JitCodeMap;
+  typedef std::unordered_map<int64_t, GenBasePtr> JitCodeMap;
 
  public:
   JitCodePool() = default;
@@ -41,9 +42,9 @@ class JitCodePool {
 
   const JitCodeMap& AllKernels() { return codes_; }
 
-  bool Has(size_t key) const { return codes_.find(key) != codes_.end(); }
+  bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); }
 
-  void Insert(size_t key, GenBasePtr value) {
+  void Insert(int64_t key, GenBasePtr value) {
     codes_.emplace(key, std::move(value));
   }
 
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
index 16c91f8246..1254d00189 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
   }
 }
 
-bool CRFDecodingKernel::UseMe(const int& d) const {
+bool CRFDecodingKernel::CanBeUsed(const int& d) const {
 #ifdef __AVX512F__
   constexpr int block = ZMM_FLOAT_BLOCK;
 #else
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
index 24179d90dd..49b1a1fea4 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -26,11 +26,11 @@ namespace intrinsic {
 void CRFDecoding(const int seq_len, const float* x, const float* w,
                  float* alpha, int* track, int tag_num);
 
-class CRFDecodingKernel : public KernelMore<CRFDecodingTuples<float>> {
+class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
  public:
   CRFDecodingKernel() { this->func = CRFDecoding; }
-  bool UseMe(
-      const typename CRFDecodingTuples<float>::attr_type&) const override;
+  bool CanBeUsed(
+      const typename CRFDecodingTuple<float>::attr_type&) const override;
   const char* ImplType() const override { return "Intrinsic"; }
 };
 
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
index e9b6e401c6..a4e3246f10 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
   }
 }
 
-bool LayerNormKernel::UseMe(const int& d) const {
+bool LayerNormKernel::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
 }
 
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
index 89da2940f4..7b9f676050 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
@@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
                const float* scale, const float* bias, int height,
                const float epsilon, int right);
 
-class LayerNormKernel : public KernelMore<LayerNormTuples<float>> {
+class LayerNormKernel : public KernelMore<LayerNormTuple<float>> {
  public:
   LayerNormKernel() { this->func = LayerNorm; }
-  bool UseMe(const typename LayerNormTuples<float>::attr_type&) const override;
+  bool CanBeUsed(
+      const typename LayerNormTuple<float>::attr_type&) const override;
   const char* ImplType() const override { return "Intrinsic"; }
 };
 
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index 0036d1c238..6e709a16d2 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -23,6 +23,8 @@ namespace jit {
 namespace more {
 namespace mix {
 
+using CPUPlace = platform::CPUPlace;
+
 void VSigmoid(const T* x, T* y, int n) {
   const float min = SIGMOID_THRESHOLD_MIN;
   const float max = SIGMOID_THRESHOLD_MAX;
@@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) {
     y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
     y[i] = static_cast<T>(0) - y[i];
   }
-  auto compute = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  auto compute = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
   compute(y, y, n);
   for (int i = 0; i < n; ++i) {
     y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
@@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) {
 
 void VTanh(const T* x, T* y, int n) {
   const T a = 2, b = -1;
-  auto compute_scal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
-  auto compute_addbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
-  auto compute_sigmoid = Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_scal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
+  auto compute_addbias = KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
+  auto compute_sigmoid = KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(n);
   compute_scal(&a, x, y, n);
   compute_sigmoid(y, y, n);
   compute_scal(&a, y, y, n);
@@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) {
 }
 
 void Softmax(const T* x, T* y, int n, int bs) {
-  auto compute_hmax =
-      KernelFuncs<kHMax, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
-  auto compute_hsum =
-      KernelFuncs<kHSum, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
-  auto compute_vscal =
-      KernelFuncs<kVScal, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
+  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
+  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
   auto compute_vaddbias =
-      KernelFuncs<kVAddBias, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
-  auto compute_vexp =
-      KernelFuncs<kVExp, XYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+      KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
+  auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
 
   for (int i = 0; i < bs; ++i) {
     T scalar;
@@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) {
 
 void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
   if (type == kVSigmoid) {
-    return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(d);
   } else if (type == kVRelu) {
-    return Get<kVRelu, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VReluTuple<T>, CPUPlace>::Cache().At(d);
   } else if (type == kVTanh) {
-    return Get<kVTanh, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VTanhTuple<T>, CPUPlace>::Cache().At(d);
   } else if (type == kVIdentity) {
-    return Get<kVIdentity, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
   }
   PADDLE_THROW("Not support type: %s", type);
   return nullptr;
@@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
   const int d = attr->d;
   const int d2 = d * 2;
   const int d3 = d * 3;
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
-  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
-  auto vadd_d2 = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d2);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
+  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
+  auto vadd_d2 = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d2);
   auto act_gate_d = getActFunc(attr->act_gate, d);
   auto act_gate_d2 = getActFunc(attr->act_gate, d2);
   auto act_gate_d3 = getActFunc(attr->act_gate, d3);
@@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
   int d = attr->d;
   int d2 = d * 2;
   int d3 = d * 3;
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
-  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
+  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
   auto act_gate_d = getActFunc(attr->act_gate, d);
   auto act_cand_d = getActFunc(attr->act_cand, d);
   auto act_cell_d = getActFunc(attr->act_cell, d);
@@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
   int d2 = d * 2;
   auto act_gate = getActFunc(attr->act_gate, d);
   auto act_cand = getActFunc(attr->act_cand, d);
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
   act_gate(gates, gates, d);
   act_cand(gates + d2, gates + d2, d);
   vmul_d(gates, gates + d2, ht, d);
@@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
   T* ht = reinterpret_cast<T*>(step->ht);
   const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
   auto act_gate = getActFunc(attr->act_gate, attr->d);
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(attr->d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(attr->d);
   act_gate(gates + attr->d, gates + attr->d, attr->d);
   vmul_d(ht_1, gates + attr->d, ht, attr->d);
 }
@@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
 }
 
 // TODO(TJ): tuning me
-bool VSigmoidKernel::UseMe(const int& d) const { return true; }
+bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; }
 
-bool VTanhKernel::UseMe(const int& d) const { return true; }
+bool VTanhKernel::CanBeUsed(const int& d) const { return true; }
 
-bool SoftmaxKernel::UseMe(const int& d) const { return true; }
+bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; }
 
-bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
+bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
 
-bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
+bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
 
-bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 
-bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 
-bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 
 }  // namespace mix
 }  // namespace more
@@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
 
 namespace mix = paddle::operators::jit::more::mix;
 
-#define REGISTER_MORE_KERNEL(key, func) \
-  REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel)
-
-REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
-REGISTER_MORE_KERNEL(kVTanh, VTanh);
-REGISTER_MORE_KERNEL(kSoftmax, Softmax);
-REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
-REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
-REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
-REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1);
-REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2);
+#define REGISTER_MORE_KERNEL(func) \
+  REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel)
+
+REGISTER_MORE_KERNEL(VSigmoid);
+REGISTER_MORE_KERNEL(VTanh);
+REGISTER_MORE_KERNEL(Softmax);
+REGISTER_MORE_KERNEL(LSTMCtHt);
+REGISTER_MORE_KERNEL(LSTMC1H1);
+REGISTER_MORE_KERNEL(GRUH1);
+REGISTER_MORE_KERNEL(GRUHtPart1);
+REGISTER_MORE_KERNEL(GRUHtPart2);
 
 #undef REGISTER_MORE_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
index d64af19219..994d485909 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr);
 void GRUHtPart1(gru_t* step, const gru_attr_t* attr);
 void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
 
-#define DECLARE_MORE_KERNEL(name, tuples)                            \
-  class name##Kernel : public KernelMore<tuples<T>> {                \
-   public:                                                           \
-    name##Kernel() { this->func = name; }                            \
-    bool UseMe(const typename tuples<T>::attr_type&) const override; \
-    const char* ImplType() const override { return "Mixed"; }        \
+#define DECLARE_MORE_KERNEL(name)                                             \
+  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
+   public:                                                                    \
+    name##Kernel() { this->func = name; }                                     \
+    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
+    const char* ImplType() const override { return "Mixed"; }                 \
   }
 
 // XYN
-DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
-DECLARE_MORE_KERNEL(VTanh, XYNTuples);
+DECLARE_MORE_KERNEL(VSigmoid);
+DECLARE_MORE_KERNEL(VTanh);
 
 // XRN
-DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
+DECLARE_MORE_KERNEL(Softmax);
 
-DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
-DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
+DECLARE_MORE_KERNEL(LSTMCtHt);
+DECLARE_MORE_KERNEL(LSTMC1H1);
 
-DECLARE_MORE_KERNEL(GRUH1, GRUTuples);
-DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples);
-DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples);
+DECLARE_MORE_KERNEL(GRUH1);
+DECLARE_MORE_KERNEL(GRUHtPart1);
+DECLARE_MORE_KERNEL(GRUHtPart2);
 
 #undef DECLARE_MORE_KERNEL
 
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 4f51353bce..4f600b3814 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -130,105 +130,106 @@ void ASum<double>(const double* x, double* res, int n) {
 
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
-bool VMulKernel<float>::UseMe(const int& d) const {
+bool VMulKernel<float>::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx512f) && d > 512;
 }
 
 template <>
-bool VAddKernel<float>::UseMe(const int& d) const {
+bool VAddKernel<float>::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx) && d > 512;
 }
 
 template <>
-bool VScalKernel<float>::UseMe(const int& d) const {
+bool VScalKernel<float>::CanBeUsed(const int& d) const {
   return platform::MayIUse(platform::avx512f) && d > 512;
 }
 
 template <>
-bool VExpKernel<float>::UseMe(const int& d) const {
+bool VExpKernel<float>::CanBeUsed(const int& d) const {
   return d > 7;
 }
 
 template <>
-bool VSquareKernel<float>::UseMe(const int& d) const {
+bool VSquareKernel<float>::CanBeUsed(const int& d) const {
   return d > 7;
 }
 
 template <>
-bool VCopyKernel<float>::UseMe(const int& d) const {
+bool VCopyKernel<float>::CanBeUsed(const int& d) const {
   return d > 15;
 }
 
 template <>
-bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
+bool VBroadcastKernel<float>::CanBeUsed(const int64_t& d) const {
   return d > 127;
 }
 
 template <>
-bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
+bool VBroadcastKernel<double>::CanBeUsed(const int64_t& attr) const {
   return true;
 }
 
 template <>
-bool VSigmoidKernel<float>::UseMe(const int& d) const {
+bool VSigmoidKernel<float>::CanBeUsed(const int& d) const {
   return d > 7;
 }
 
 template <>
-bool VTanhKernel<float>::UseMe(const int& d) const {
+bool VTanhKernel<float>::CanBeUsed(const int& d) const {
   return d > 7;
 }
 
 template <>
-bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
+bool SeqPoolKernel<float>::CanBeUsed(const seq_pool_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
+bool SeqPoolKernel<double>::CanBeUsed(const seq_pool_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool EmbSeqPoolKernel<float>::UseMe(const emb_seq_pool_attr_t& attr) const {
+bool EmbSeqPoolKernel<float>::CanBeUsed(const emb_seq_pool_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
+bool EmbSeqPoolKernel<double>::CanBeUsed(
+    const emb_seq_pool_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool SgdKernel<float>::UseMe(const sgd_attr_t& attr) const {
+bool SgdKernel<float>::CanBeUsed(const sgd_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool SgdKernel<double>::UseMe(const sgd_attr_t& attr) const {
+bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
+bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
   return platform::MayIUse(platform::avx);
 }
 
 template <>
-bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
+bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
   return true;
 }
 
 template <>
-bool SoftmaxKernel<float>::UseMe(const int& d) const {
+bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
   // tuned on avx2
   return platform::MayIUse(platform::avx) && d < 60;
 }
 
-#define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
-  template <>                                            \
-  bool func##Kernel<double>::UseMe(const int& d) const { \
-    return true;                                         \
+#define AWALYS_USE_ME_WITH_DOUBLE(func)                      \
+  template <>                                                \
+  bool func##Kernel<double>::CanBeUsed(const int& d) const { \
+    return true;                                             \
   }
 
 AWALYS_USE_ME_WITH_DOUBLE(VMul);
@@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax);
 
 namespace mkl = paddle::operators::jit::more::mkl;
 
-#define REGISTER_MKL_KERNEL(key, func)                        \
-  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
+#define REGISTER_MKL_KERNEL(func)                                 \
+  REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel<float>, \
                           mkl::func##Kernel<double>)
 
-REGISTER_MKL_KERNEL(kMatMul, MatMul);
-REGISTER_MKL_KERNEL(kVMul, VMul);
-REGISTER_MKL_KERNEL(kVAdd, VAdd);
-REGISTER_MKL_KERNEL(kVScal, VScal);
-REGISTER_MKL_KERNEL(kVExp, VExp);
-REGISTER_MKL_KERNEL(kVSquare, VSquare);
-REGISTER_MKL_KERNEL(kVCopy, VCopy);
-REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
-REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
-REGISTER_MKL_KERNEL(kVTanh, VTanh);
-REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
-REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
-REGISTER_MKL_KERNEL(kSoftmax, Softmax);
-REGISTER_MKL_KERNEL(kSgd, Sgd);
+REGISTER_MKL_KERNEL(MatMul);
+REGISTER_MKL_KERNEL(VMul);
+REGISTER_MKL_KERNEL(VAdd);
+REGISTER_MKL_KERNEL(VScal);
+REGISTER_MKL_KERNEL(VExp);
+REGISTER_MKL_KERNEL(VSquare);
+REGISTER_MKL_KERNEL(VCopy);
+REGISTER_MKL_KERNEL(VBroadcast);
+REGISTER_MKL_KERNEL(VSigmoid);
+REGISTER_MKL_KERNEL(VTanh);
+REGISTER_MKL_KERNEL(SeqPool);
+REGISTER_MKL_KERNEL(EmbSeqPool);
+REGISTER_MKL_KERNEL(Softmax);
+REGISTER_MKL_KERNEL(Sgd);
 
 #undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index db2d6faed4..f51dca654c 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
   }
 }
 
-#define DECLARE_MKL_KERNEL(name, tuples)                             \
-  template <typename T>                                              \
-  class name##Kernel : public KernelMore<tuples<T>> {                \
-   public:                                                           \
-    name##Kernel() { this->func = name<T>; }                         \
-    bool UseMe(const typename tuples<T>::attr_type&) const override; \
-    const char* ImplType() const override { return "MKL"; }          \
+#define DECLARE_MKL_KERNEL(name)                                              \
+  template <typename T>                                                       \
+  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
+   public:                                                                    \
+    name##Kernel() { this->func = name<T>; }                                  \
+    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
+    const char* ImplType() const override { return "MKL"; }                   \
   }
 
 // ABCMNK
-DECLARE_MKL_KERNEL(MatMul, MatMulTuples);
+DECLARE_MKL_KERNEL(MatMul);
 
 // XYZN
-DECLARE_MKL_KERNEL(VMul, XYZNTuples);
-DECLARE_MKL_KERNEL(VAdd, XYZNTuples);
+DECLARE_MKL_KERNEL(VMul);
+DECLARE_MKL_KERNEL(VAdd);
 
 // AXYN
-DECLARE_MKL_KERNEL(VScal, AXYNTuples);
+DECLARE_MKL_KERNEL(VScal);
 
 // XYN
-DECLARE_MKL_KERNEL(VExp, XYNTuples);
-DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
-DECLARE_MKL_KERNEL(VTanh, XYNTuples);
-DECLARE_MKL_KERNEL(VSquare, XYNTuples);
-DECLARE_MKL_KERNEL(VCopy, XYNTuples);
-
-DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
-
-DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
-
-DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
-
-DECLARE_MKL_KERNEL(Sgd, SgdTuples);
-
-DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);
+DECLARE_MKL_KERNEL(VExp);
+DECLARE_MKL_KERNEL(VSigmoid);
+DECLARE_MKL_KERNEL(VTanh);
+DECLARE_MKL_KERNEL(VSquare);
+DECLARE_MKL_KERNEL(VCopy);
+
+// others
+DECLARE_MKL_KERNEL(SeqPool);
+DECLARE_MKL_KERNEL(EmbSeqPool);
+DECLARE_MKL_KERNEL(Softmax);
+DECLARE_MKL_KERNEL(Sgd);
+DECLARE_MKL_KERNEL(VBroadcast);
 
 #undef DECLARE_MKL_KERNEL
 
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
index c279d1b2ca..0d1c477090 100644
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -17,51 +17,43 @@
 
 namespace refer = paddle::operators::jit::refer;
 
-#define REGISTER_REFER_KERNEL(key, func)                    \
-  REGISTER_JITKERNEL_REFER(key, refer::func##Kernel<float>, \
+#define REGISTER_REFER_KERNEL(func)                             \
+  REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel<float>, \
                            refer::func##Kernel<double>)
 
-REGISTER_REFER_KERNEL(kVMul, VMul);
-REGISTER_REFER_KERNEL(kVAdd, VAdd);
-REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu);
-REGISTER_REFER_KERNEL(kVSub, VSub);
-
-REGISTER_REFER_KERNEL(kVScal, VScal);
-REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
-
-REGISTER_REFER_KERNEL(kVRelu, VRelu);
-REGISTER_REFER_KERNEL(kVCopy, VCopy);
-REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
-REGISTER_REFER_KERNEL(kVSquare, VSquare);
-REGISTER_REFER_KERNEL(kVExp, VExp);
-REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid);
-REGISTER_REFER_KERNEL(kVTanh, VTanh);
-
-REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt);
-REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1);
-
-REGISTER_REFER_KERNEL(kGRUH1, GRUH1);
-REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1);
-REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2);
-
-REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding);
-REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
-
-REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);
-
-REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
-
-REGISTER_REFER_KERNEL(kMatMul, MatMul);
-
-REGISTER_REFER_KERNEL(kHMax, HMax);
-REGISTER_REFER_KERNEL(kHSum, HSum);
-
-REGISTER_REFER_KERNEL(kSoftmax, Softmax);
-
-REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
-
-REGISTER_REFER_KERNEL(kSgd, Sgd);
-
-REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
+REGISTER_REFER_KERNEL(VMul);
+REGISTER_REFER_KERNEL(VAdd);
+REGISTER_REFER_KERNEL(VAddRelu);
+REGISTER_REFER_KERNEL(VSub);
+
+REGISTER_REFER_KERNEL(VScal);
+REGISTER_REFER_KERNEL(VAddBias);
+
+REGISTER_REFER_KERNEL(VRelu);
+REGISTER_REFER_KERNEL(VCopy);
+REGISTER_REFER_KERNEL(VIdentity);
+REGISTER_REFER_KERNEL(VSquare);
+REGISTER_REFER_KERNEL(VExp);
+REGISTER_REFER_KERNEL(VSigmoid);
+REGISTER_REFER_KERNEL(VTanh);
+
+REGISTER_REFER_KERNEL(LSTMCtHt);
+REGISTER_REFER_KERNEL(LSTMC1H1);
+
+REGISTER_REFER_KERNEL(GRUH1);
+REGISTER_REFER_KERNEL(GRUHtPart1);
+REGISTER_REFER_KERNEL(GRUHtPart2);
+
+REGISTER_REFER_KERNEL(CRFDecoding);
+REGISTER_REFER_KERNEL(LayerNorm);
+REGISTER_REFER_KERNEL(NCHW16CMulNC);
+REGISTER_REFER_KERNEL(SeqPool);
+REGISTER_REFER_KERNEL(MatMul);
+REGISTER_REFER_KERNEL(HMax);
+REGISTER_REFER_KERNEL(HSum);
+REGISTER_REFER_KERNEL(Softmax);
+REGISTER_REFER_KERNEL(EmbSeqPool);
+REGISTER_REFER_KERNEL(Sgd);
+REGISTER_REFER_KERNEL(VBroadcast);
 
 #undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index b3b2097828..cac705a484 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
   }
 }
 
-#define DECLARE_REFER_KERNEL(name, tuples)             \
-  template <typename T>                                \
-  class name##Kernel : public ReferKernel<tuples<T>> { \
-   public:                                             \
-    name##Kernel() { this->func = name<T>; }           \
+#define DECLARE_REFER_KERNEL(name)                          \
+  template <typename T>                                     \
+  class name##Kernel : public ReferKernel<name##Tuple<T>> { \
+   public:                                                  \
+    name##Kernel() { this->func = name<T>; }                \
   }
 
 // const T* x, const T* y, T* z, int n
-DECLARE_REFER_KERNEL(VMul, XYZNTuples);
-DECLARE_REFER_KERNEL(VAdd, XYZNTuples);
-DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples);
-DECLARE_REFER_KERNEL(VSub, XYZNTuples);
+DECLARE_REFER_KERNEL(VMul);
+DECLARE_REFER_KERNEL(VAdd);
+DECLARE_REFER_KERNEL(VAddRelu);
+DECLARE_REFER_KERNEL(VSub);
 
 // const T* a, const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VScal, AXYNTuples);
-DECLARE_REFER_KERNEL(VAddBias, AXYNTuples);
+DECLARE_REFER_KERNEL(VScal);
+DECLARE_REFER_KERNEL(VAddBias);
 
 // const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VRelu, XYNTuples);
-DECLARE_REFER_KERNEL(VIdentity, XYNTuples);
-DECLARE_REFER_KERNEL(VExp, XYNTuples);
-DECLARE_REFER_KERNEL(VSigmoid, XYNTuples);
-DECLARE_REFER_KERNEL(VTanh, XYNTuples);
-DECLARE_REFER_KERNEL(VSquare, XYNTuples);
-DECLARE_REFER_KERNEL(VCopy, XYNTuples);
+DECLARE_REFER_KERNEL(VRelu);
+DECLARE_REFER_KERNEL(VIdentity);
+DECLARE_REFER_KERNEL(VExp);
+DECLARE_REFER_KERNEL(VSigmoid);
+DECLARE_REFER_KERNEL(VTanh);
+DECLARE_REFER_KERNEL(VSquare);
+DECLARE_REFER_KERNEL(VCopy);
 
 // lstm_t*, const lstm_attr_t*
-DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples);
-DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples);
+DECLARE_REFER_KERNEL(LSTMCtHt);
+DECLARE_REFER_KERNEL(LSTMC1H1);
 
 // gru_t*, const gru_attr_t*
-DECLARE_REFER_KERNEL(GRUH1, GRUTuples);
-DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples);
-DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples);
-
-DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples);
-DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
-
-DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);
-
-DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
-
-DECLARE_REFER_KERNEL(MatMul, MatMulTuples);
-
-DECLARE_REFER_KERNEL(HMax, XRNTuples);
-DECLARE_REFER_KERNEL(HSum, XRNTuples);
-
-DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
-
-DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
-
-DECLARE_REFER_KERNEL(Sgd, SgdTuples);
-
-DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);
+DECLARE_REFER_KERNEL(GRUH1);
+DECLARE_REFER_KERNEL(GRUHtPart1);
+DECLARE_REFER_KERNEL(GRUHtPart2);
+
+DECLARE_REFER_KERNEL(HMax);
+DECLARE_REFER_KERNEL(HSum);
+
+// others
+DECLARE_REFER_KERNEL(CRFDecoding);
+DECLARE_REFER_KERNEL(LayerNorm);
+DECLARE_REFER_KERNEL(NCHW16CMulNC);
+DECLARE_REFER_KERNEL(SeqPool);
+DECLARE_REFER_KERNEL(MatMul);
+DECLARE_REFER_KERNEL(Softmax);
+DECLARE_REFER_KERNEL(EmbSeqPool);
+DECLARE_REFER_KERNEL(Sgd);
+DECLARE_REFER_KERNEL(VBroadcast);
 
 #undef DECLARE_REFER_KERNEL
 
diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h
index cb32c48720..567a903236 100644
--- a/paddle/fluid/operators/jit/registry.h
+++ b/paddle/fluid/operators/jit/registry.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <tuple>
 #include <type_traits>
+#include <utility>  // for std::move
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/operators/jit/kernel_pool.h"
 #include "paddle/fluid/platform/place.h"
@@ -49,8 +50,8 @@ struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
 
   void operator()(KernelType kt) const {
     KernelKey kkey(kt, PlaceType());
-    Pool().Instance().Insert(kkey,
-                             std::move(make_unique<const KERNEL_IMPL_TYPE>()));
+    Pool::Instance().Insert(kkey,
+                            std::move(make_unique<const KERNEL_IMPL_TYPE>()));
     constexpr auto size = std::tuple_size<std::tuple<KernelImpls...>>::value;
     JitKernelRegistrarFunctor<Pool, PlaceType, I + 1 == size, I + 1,
                               KernelImpls...>
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index cdec14dc43..6c099a7a06 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include <iostream>
 #include <random>
 #include <string>
 #include <vector>
@@ -64,413 +65,23 @@ std::vector<int> TestSizes() {
 namespace jit = paddle::operators::jit;
 using CPUPlace = paddle::platform::CPUPlace;
 
-template <typename KernelTuples, typename... Args>
-struct TestFuncWithRefer {
-  void operator()(const typename KernelTuples::func_type tgt, Args... args) {
-    LOG(FATAL) << "Should specify this function.";
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>> {
-  void operator()(const typename jit::XYZNTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& y,
-                  const std::vector<T>& zref) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(zref.size(), x.size());
-    EXPECT_EQ(zref.size(), y.size());
-    const T* x_data = x.data();
-    const T* y_data = y.data();
-    const T* zref_data = zref.data();
-    const int d = zref.size();
-
-    std::vector<T> ztgt(d);
-    T* ztgt_data = ztgt.data();
-    // test normal
-    tgt(x_data, y_data, ztgt_data, d);
-    ExpectEQ<T>(ztgt_data, zref_data, d);
-    // test inplace x
-    std::copy(x.begin(), x.end(), ztgt.begin());
-    tgt(ztgt_data, y_data, ztgt_data, d);
-    ExpectEQ<T>(ztgt_data, zref_data, d);
-    // test inplace y
-    std::copy(y.begin(), y.end(), ztgt.begin());
-    tgt(x_data, ztgt_data, ztgt_data, d);
-    ExpectEQ<T>(ztgt_data, zref_data, d);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
-                         std::vector<T>> {
-  void operator()(const typename jit::AXYNTuples<T>::func_type tgt, const T a,
-                  const std::vector<T>& x, const std::vector<T>& yref) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(yref.size(), x.size());
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    const int d = yref.size();
-    std::vector<T> ytgt(d);
-    T* ytgt_data = ytgt.data();
-    // test normal
-    tgt(&a, x_data, ytgt_data, d);
-    ExpectEQ<T>(ytgt_data, yref_data, d);
-    // test inplace x
-    std::copy(x.begin(), x.end(), ytgt.begin());
-    tgt(&a, ytgt_data, ytgt_data, d);
-    ExpectEQ<T>(ytgt_data, yref_data, d);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::SoftmaxTuples<T>, std::vector<T>, std::vector<T>,
-                         int, int> {
-  void operator()(const typename jit::SoftmaxTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref, int n,
-                  int bs) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(yref.size(), x.size());
-    EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    std::vector<T> ytgt(n * bs);
-    T* ytgt_data = ytgt.data();
-    // test normal
-    tgt(x_data, ytgt_data, n, bs);
-    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-    // test inplace x
-    std::copy(x.begin(), x.end(), ytgt.begin());
-    tgt(ytgt_data, ytgt_data, n, bs);
-    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
-  void operator()(const typename jit::XRNTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const T ref_res) {
-    EXPECT_TRUE(tgt != nullptr);
-    T tgt_res;
-    tgt(x.data(), &tgt_res, x.size());
-    ExpectEQ<T>(&tgt_res, &ref_res, 1);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
-                         std::vector<T>, int64_t,
-                         typename jit::VBroadcastTuples<T>::attr_type> {
-  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref,
-                  int64_t h,
-                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
-    EXPECT_EQ(yref.size(), x.size() * h);
-    std::vector<T> y(yref.size());
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    T* y_data = y.data();
-    tgt(x_data, y_data, h, attr);
-    ExpectEQ<T>(y_data, yref_data, yref.size());
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
-  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(yref.size(), x.size());
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    const int d = yref.size();
-    std::vector<T> ytgt(d);
-    T* ytgt_data = ytgt.data();
-    // test normal
-    tgt(x_data, ytgt_data, d);
-    ExpectEQ<T>(ytgt_data, yref_data, d);
-    // test inplace x
-    std::copy(x.begin(), x.end(), ytgt.begin());
-    tgt(ytgt_data, ytgt_data, d);
-    ExpectEQ<T>(ytgt_data, yref_data, d);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>,
-                         typename jit::LSTMTuples<T>::attr_type> {
-  void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
-                  const std::vector<T>& xsrc, const std::vector<T>& wp,
-                  const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
-                  const std::vector<T>& ht_ref,
-                  const typename jit::LSTMTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(ct_ref.size(), ht_ref.size());
-    EXPECT_EQ(ct_1.size(), ht_ref.size());
-    EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
-    EXPECT_EQ(wp.size(), 3 * ht_ref.size());
-
-    // x could be changed after compute, so copy to save src
-    int d = ht_ref.size();
-    std::vector<T> x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size());
-    std::vector<T> checked(2 * d);
-    std::copy(xsrc.begin(), xsrc.end(), x.begin());
-
-    const T* ct_1_data = ct_1.data();
-    const T* wp_data = wp.data();
-    const T* ct_ref_data = ct_ref.data();
-    const T* ht_ref_data = ht_ref.data();
-    T* x_data = x.data();
-    T* ct_data = ct.data();
-    T* ht_data = ht.data();
-    T* checked_data = checked.data();
-
-    jit::lstm_t step;
-    step.gates = x_data;
-    step.ct_1 = ct_1_data;
-    step.ct = ct_data;
-    step.ht = ht_data;
-    if (attr.use_peephole) {
-      step.wp = wp_data;
-      step.checked = checked_data;
-    }
-
-    tgt(&step, &attr);
-    ExpectEQ<T>(ct_data, ct_ref_data, d);
-    ExpectEQ<T>(ht_data, ht_ref_data, d);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>,
-                         typename jit::GRUTuples<T>::attr_type> {
-  void operator()(const typename jit::GRUTuples<T>::func_type tgt,
-                  const std::vector<T>& xsrc, const std::vector<T>& ht_1,
-                  const std::vector<T>& ht_ref,
-                  const typename jit::GRUTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(ht_1.size(), ht_ref.size());
-    EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
-
-    // x could be changed after compute, so copy to save src
-    int d = ht_ref.size();
-    std::vector<T> x(xsrc.size()), ht(ht_ref.size());
-    std::copy(xsrc.begin(), xsrc.end(), x.begin());
-    const T* ht_1_data = ht_1.data();
-    const T* ht_ref_data = ht_ref.data();
-    T* x_data = x.data();
-    T* ht_data = ht.data();
-    jit::gru_t step;
-    step.gates = x_data;
-    step.ht_1 = ht_1_data;
-    step.ht = ht_data;
-    tgt(&step, &attr);
-    ExpectEQ<T>(ht_data, ht_ref_data, d);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
-                         typename jit::SeqPoolTuples<T>::attr_type> {
-  void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref,
-                  const typename jit::SeqPoolTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
-    int w = yref.size();
-    std::vector<T> y(w);
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    T* y_data = y.data();
-    tgt(x_data, y_data, &attr);
-    ExpectEQ<T>(y_data, yref_data, w);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
-                         std::vector<int64_t>, std::vector<T>,
-                         typename jit::EmbSeqPoolTuples<T>::attr_type> {
-  void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
-                  const std::vector<T>& table, const std::vector<int64_t>& idx,
-                  const std::vector<T>& oref,
-                  const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(table.size(),
-              static_cast<size_t>(attr.table_height * attr.table_width));
-    EXPECT_EQ(idx.size(),
-              static_cast<size_t>(attr.index_height * attr.index_width));
-    EXPECT_EQ(oref.size(),
-              static_cast<size_t>(attr.table_width * attr.index_width));
-    const T* table_data = table.data();
-    const int64_t* idx_data = idx.data();
-    const T* oref_data = oref.data();
-    int o_w = oref.size();
-    std::vector<T> out(o_w);
-    T* o_data = out.data();
-    tgt(table_data, idx_data, o_data, &attr);
-    ExpectEQ<T>(o_data, oref_data, o_w);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
-                         std::vector<int64_t>, std::vector<T>,
-                         typename jit::SgdTuples<T>::attr_type> {
-  void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
-                  const std::vector<T>& param, const std::vector<T>& grad,
-                  const std::vector<int64_t>& rows, const std::vector<T>& oref,
-                  const typename jit::SgdTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(param.size(),
-              static_cast<size_t>(attr.param_height * attr.param_width));
-    EXPECT_EQ(grad.size(),
-              static_cast<size_t>(attr.grad_height * attr.grad_width));
-    EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
-    EXPECT_EQ(param.size(), oref.size());
-    const T* param_data = param.data();
-    const T* grad_data = grad.data();
-    const int64_t* rows_data = rows.data();
-    const T* oref_data = oref.data();
-
-    std::vector<T> out(oref.size());
-    T* o_data = out.data();
-    tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
-    // only the selected rows should be equal
-    for (size_t i = 0; i < rows.size(); ++i) {
-      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
-    }
-
-    // inplace
-    std::copy(param.begin(), param.end(), out.begin());
-    tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
-    for (size_t i = 0; i < rows.size(); ++i) {
-      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
-    }
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>,
-                         typename jit::MatMulTuples<T>::attr_type> {
-  void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
-                  const std::vector<T>& a, const std::vector<T>& b,
-                  const std::vector<T>& cref,
-                  const typename jit::MatMulTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
-    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
-    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
-    std::vector<T> c(cref.size());
-    const T* a_data = a.data();
-    const T* b_data = b.data();
-    const T* cref_data = cref.data();
-    T* c_data = c.data();
-    tgt(a_data, b_data, c_data, &attr);
-    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, std::vector<T>, int, float, int> {
-  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
-                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
-                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
-                  const std::vector<T>& scale, const std::vector<T>& bias,
-                  int left, const float epsilon, int right) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
-    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
-    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
-    EXPECT_EQ(var.size(), static_cast<size_t>(left));
-    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
-    EXPECT_EQ(bias.size(), static_cast<size_t>(right));
-    std::vector<T> outtgt(outref.size());
-    const T* scale_data = scale.data();
-    const T* bias_data = bias.data();
-    T* x_data = x.data();
-    T* mean_data = mean.data();
-    T* var_data = var.data();
-    T* outref_data = outref.data();
-    T* outtgt_data = outtgt.data();
-
-    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
-        epsilon, right);
-    ExpectEQ<T>(outtgt_data, outref_data, left * right);
-  }
-};
-
-template <typename T>
-struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<int>,
-                         int> {
-  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
-                  const int seq_len, const std::vector<T>& x,
-                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
-                  std::vector<int>& trackref, int tag_num) {          // NOLINT
-    constexpr int state_trans_base_idx = 2;
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
-    EXPECT_EQ(w.size(),
-              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
-    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
-    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
-    std::vector<T> alphatgt(alpharef.size());
-    std::vector<int> tracktgt(trackref.size());
-
-    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
-    tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
-        tracktgt.data(), tag_num);
-    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
-    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
-  }
-};
-
-template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
+template <typename KernelTuple, typename PlaceType, typename Tester,
           typename... Args>
-void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
-  TestFuncWithRefer<KernelTuples, Args...> test;
-  // test jitcode
-  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitcode) {
-    VLOG(10) << "Test Jitcode Kernel ";
-    test(jitcode, args...);
+void TestAllImpls(const typename KernelTuple::attr_type& attr,
+                  const Tester& verifier, const Args&... args) {
+  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  for (auto f : funcs) {
+    VLOG(10) << "Test Kernel " << f.first;
+    verifier(f.second, args...);
   }
-  // test all impls in more
-  jit::KernelKey kkey(KT, PlaceType());
-  auto& pool = jit::KernelPool().Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        auto more = i->GetFunc();
-        VLOG(10) << "Test More Kernel : " << i->ImplType();
-        test(more, args...);
-      }
-    }
-  }
-  // test result from Get function
-  // VLOG(10) << "Test Get function ";
-  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
-  test(tgt, args...);
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXYZNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelXYZN() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
+    auto ref = jit::GetReferFunc<KernelTuple>();
     EXPECT_TRUE(ref != nullptr);
 
     std::vector<T> x(d), y(d), zref(d);
@@ -494,16 +105,42 @@ void TestKernelXYZNTuples() {
     ExpectEQ<T>(xinp_data, zref_data, d);
     ExpectEQ<T>(yinp_data, zref_data, d);
 
-    TestAllImpls<KT, jit::XYZNTuples<T>, PlaceType, std::vector<T>,
-                 std::vector<T>, std::vector<T>>(d, x, y, zref);
+    auto verifier = [](const typename KernelTuple::func_type tgt,
+                       const std::vector<T>& x, const std::vector<T>& y,
+                       const std::vector<T>& zref) {
+      EXPECT_TRUE(tgt != nullptr);
+      EXPECT_EQ(zref.size(), x.size());
+      EXPECT_EQ(zref.size(), y.size());
+      const T* x_data = x.data();
+      const T* y_data = y.data();
+      const T* zref_data = zref.data();
+      const int d = zref.size();
+
+      std::vector<T> ztgt(d);
+      T* ztgt_data = ztgt.data();
+      // test normal
+      tgt(x_data, y_data, ztgt_data, d);
+      ExpectEQ<T>(ztgt_data, zref_data, d);
+      // test inplace x
+      std::copy(x.begin(), x.end(), ztgt.begin());
+      tgt(ztgt_data, y_data, ztgt_data, d);
+      ExpectEQ<T>(ztgt_data, zref_data, d);
+      // test inplace y
+      std::copy(y.begin(), y.end(), ztgt.begin());
+      tgt(x_data, ztgt_data, ztgt_data, d);
+      ExpectEQ<T>(ztgt_data, zref_data, d);
+    };
+
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, y, zref);
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelAXYNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelAXYN() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
+    auto ref = jit::GetReferFunc<KernelTuple>();
     EXPECT_TRUE(ref != nullptr);
 
     const T a = static_cast<T>(3);
@@ -520,34 +157,33 @@ void TestKernelAXYNTuples() {
     ref(&a, xinp_data, xinp_data, d);
     ExpectEQ<T>(xinp_data, yref_data, d);
 
-    TestAllImpls<KT, jit::AXYNTuples<T>, PlaceType, T, std::vector<T>,
-                 std::vector<T>>(d, a, x, yref);
-  }
-}
-
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXRNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = FLAGS_acc;
-  FLAGS_acc = 1e-4;
-  for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
-    EXPECT_TRUE(ref != nullptr);
-    std::vector<T> x(d);
-    RandomVec<T>(d, x.data());
-    T ref_res;
-    ref(x.data(), &ref_res, d);
-    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
-                                                                      ref_res);
+    auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
+                       const std::vector<T>& x, const std::vector<T>& yref) {
+      EXPECT_TRUE(tgt != nullptr);
+      EXPECT_EQ(yref.size(), x.size());
+      const T* x_data = x.data();
+      const T* yref_data = yref.data();
+      const int d = yref.size();
+      std::vector<T> ytgt(d);
+      T* ytgt_data = ytgt.data();
+      // test normal
+      tgt(&a, x_data, ytgt_data, d);
+      ExpectEQ<T>(ytgt_data, yref_data, d);
+      // test inplace x
+      std::copy(x.begin(), x.end(), ytgt.begin());
+      tgt(&a, ytgt_data, ytgt_data, d);
+      ExpectEQ<T>(ytgt_data, yref_data, d);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref);
   }
-  FLAGS_acc = last_acc;
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXYNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelXYN() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
+    auto ref = jit::GetReferFunc<KernelTuple>();
     EXPECT_TRUE(ref != nullptr);
 
     std::vector<T> x(d), yref(d);
@@ -562,15 +198,57 @@ void TestKernelXYNTuples() {
     ref(x_data, yref_data, d);
     ref(xinp_data, xinp_data, d);
     ExpectEQ<T>(xinp_data, yref_data, d);
+    auto verifier = [](const typename KernelTuple::func_type tgt,
+                       const std::vector<T>& x, const std::vector<T>& yref) {
+      EXPECT_TRUE(tgt != nullptr);
+      EXPECT_EQ(yref.size(), x.size());
+      const T* x_data = x.data();
+      const T* yref_data = yref.data();
+      const int d = yref.size();
+      std::vector<T> ytgt(d);
+      T* ytgt_data = ytgt.data();
+      // test normal
+      tgt(x_data, ytgt_data, d);
+      ExpectEQ<T>(ytgt_data, yref_data, d);
+      // test inplace x
+      std::copy(x.begin(), x.end(), ytgt.begin());
+      tgt(ytgt_data, ytgt_data, d);
+      ExpectEQ<T>(ytgt_data, yref_data, d);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, yref);
+  }
+}
 
-    TestAllImpls<KT, jit::XYNTuples<T>, PlaceType, std::vector<T>,
-                 std::vector<T>>(d, x, yref);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelXRN() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  auto last_acc = FLAGS_acc;
+  FLAGS_acc = 1e-4;
+  for (int d : TestSizes()) {
+    auto ref = jit::GetReferFunc<KernelTuple>();
+    EXPECT_TRUE(ref != nullptr);
+    std::vector<T> x(d);
+    RandomVec<T>(d, x.data());
+    T ref_res;
+    ref(x.data(), &ref_res, d);
+
+    auto verifier = [](const typename KernelTuple::func_type tgt,
+                       const std::vector<T>& x, const T ref_res) {
+      EXPECT_TRUE(tgt != nullptr);
+      T tgt_res;
+      tgt(x.data(), &tgt_res, x.size());
+      ExpectEQ<T>(&tgt_res, &ref_res, 1);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res);
   }
+  FLAGS_acc = last_acc;
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelLSTMTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelLSTM() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
   auto test_sizes = TestSizes();
   test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
@@ -582,7 +260,7 @@ void TestKernelLSTMTuples() {
             const jit::lstm_attr_t attr(
                 d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
                 jit::to_kerneltype(act_cell), use_peephole);
-            auto ref = jit::GetRefer<KT, jit::LSTMTuples<T>>();
+            auto ref = jit::GetReferFunc<KernelTuple>();
             EXPECT_TRUE(ref != nullptr);
             std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
             std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
@@ -609,10 +287,51 @@ void TestKernelLSTMTuples() {
             }
             ref(&step, &attr);
             VLOG(10) << attr;
-            TestAllImpls<KT, jit::LSTMTuples<T>, PlaceType, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>>(attr, xsrc, wp, ct_1, ct_ref, ht_ref,
-                                         attr);
+
+            auto verifier = [](
+                const typename KernelTuple::func_type tgt,
+                const std::vector<T>& xsrc, const std::vector<T>& wp,
+                const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
+                const std::vector<T>& ht_ref,
+                const typename KernelTuple::attr_type& attr) {
+              EXPECT_TRUE(tgt != nullptr);
+              EXPECT_EQ(ct_ref.size(), ht_ref.size());
+              EXPECT_EQ(ct_1.size(), ht_ref.size());
+              EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
+              EXPECT_EQ(wp.size(), 3 * ht_ref.size());
+
+              // x could be changed after compute, so copy to save src
+              int d = ht_ref.size();
+              std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
+                  ht(ht_ref.size());
+              std::vector<T> checked(2 * d);
+              std::copy(xsrc.begin(), xsrc.end(), x.begin());
+
+              const T* ct_1_data = ct_1.data();
+              const T* wp_data = wp.data();
+              const T* ct_ref_data = ct_ref.data();
+              const T* ht_ref_data = ht_ref.data();
+              T* x_data = x.data();
+              T* ct_data = ct.data();
+              T* ht_data = ht.data();
+              T* checked_data = checked.data();
+
+              jit::lstm_t step;
+              step.gates = x_data;
+              step.ct_1 = ct_1_data;
+              step.ct = ct_data;
+              step.ht = ht_data;
+              if (attr.use_peephole) {
+                step.wp = wp_data;
+                step.checked = checked_data;
+              }
+
+              tgt(&step, &attr);
+              ExpectEQ<T>(ct_data, ct_ref_data, d);
+              ExpectEQ<T>(ht_data, ht_ref_data, d);
+            };
+            TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, wp, ct_1,
+                                                 ct_ref, ht_ref, attr);
           }
         }
       }
@@ -620,9 +339,10 @@ void TestKernelLSTMTuples() {
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelGRUTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelGRU() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
   auto test_sizes = TestSizes();
   test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
@@ -631,7 +351,7 @@ void TestKernelGRUTuples() {
       for (auto& act_cand : all_acts) {
         const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
                                    jit::to_kerneltype(act_cand));
-        auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
         EXPECT_TRUE(ref != nullptr);
         std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
         RandomVec<T>(3 * d, xsrc.data());
@@ -648,17 +368,218 @@ void TestKernelGRUTuples() {
         step.ht = ht_ref_data;
         ref(&step, &attr);
         VLOG(10) << attr;
-        TestAllImpls<KT, jit::GRUTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(attr, xsrc, ht_1, ht_ref,
-                                                     attr);
+        auto verifier = [](const typename KernelTuple::func_type tgt,
+                           const std::vector<T>& xsrc,
+                           const std::vector<T>& ht_1,
+                           const std::vector<T>& ht_ref,
+                           const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(ht_1.size(), ht_ref.size());
+          EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
+
+          // x could be changed after compute, so copy to save src
+          int d = ht_ref.size();
+          std::vector<T> x(xsrc.size()), ht(ht_ref.size());
+          std::copy(xsrc.begin(), xsrc.end(), x.begin());
+          const T* ht_1_data = ht_1.data();
+          const T* ht_ref_data = ht_ref.data();
+          T* x_data = x.data();
+          T* ht_data = ht.data();
+          jit::gru_t step;
+          step.gates = x_data;
+          step.ht_1 = ht_1_data;
+          step.ht = ht_data;
+          tgt(&step, &attr);
+          ExpectEQ<T>(ht_data, ht_ref_data, d);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, ht_1, ht_ref,
+                                             attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelSeqPoolTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelNCHW16CMulNC() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  const int n = 3, c = 16 * 4, h = 10, w = 10;
+  auto ref = jit::GetReferFunc<KernelTuple>();
+  EXPECT_TRUE(ref != nullptr);
+  int sz = n * c * h * w;
+  std::vector<T> x(sz), y(n * c), zref(sz);
+  std::vector<T> ztgt(sz), zjit(sz);
+  RandomVec<T>(sz, x.data());
+  RandomVec<T>(n * c, y.data());
+
+  const T* x_data = x.data();
+  const T* y_data = y.data();
+  T* zref_data = zref.data();
+  T* ztgt_data = ztgt.data();
+  T* zjit_data = zjit.data();
+  constexpr int simd_width = ZMM_FLOAT_BLOCK;
+  int C = c / simd_width;
+  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(0);
+  auto funcs = jit::GetAllCandidateFuncs<KernelTuple, PlaceType>(0);
+  EXPECT_GT(funcs.size(), 0UL);
+  auto jitcode = funcs[0];
+  EXPECT_TRUE(tgt != nullptr);
+
+  if (std::is_same<T, float>::value &&
+      paddle::platform::MayIUse(paddle::platform::avx512f)) {
+    EXPECT_TRUE(jitcode != nullptr);
+  }
+  for (int ni = 0; ni < n; ni++) {
+    for (int ci = 0; ci < C; ci++) {
+      auto ptr_x =
+          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
+      auto ptr_zref =
+          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_ztgt =
+          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+
+      ref(ptr_x, ptr_y, ptr_zref, h, w);
+      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
+
+      if (jitcode) {
+        auto ptr_zjit =
+            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
+      }
+    }
+  }
+  ExpectEQ<T>(ztgt_data, zref_data, sz);
+  if (jitcode) {
+    ExpectEQ<T>(zjit_data, zref_data, sz);
+  }
+}
+
+template <typename KernelTuple, typename PlaceType>
+void TestKernelLayerNorm() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        auto ref = jit::GetReferFunc<KernelTuple>();
+        EXPECT_TRUE(ref != nullptr);
+        int sz = left * right;
+        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
+            outref(sz);
+        RandomVec<T>(sz, x.data());
+        RandomVec<T>(left, mean.data());
+        RandomVec<T>(left, var.data());
+        RandomVec<T>(right, scale.data());
+        RandomVec<T>(right, bias.data());
+
+        const T* scale_data = scale.data();
+        const T* bias_data = bias.data();
+        T* x_data = x.data();
+        T* mean_data = mean.data();
+        T* var_data = var.data();
+        T* outref_data = outref.data();
+
+        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+
+        auto verifier = [](
+            const typename KernelTuple::func_type tgt, const std::vector<T>& x_,
+            const std::vector<T>& outref_, const std::vector<T>& mean_,
+            const std::vector<T>& var_, const std::vector<T>& scale,
+            const std::vector<T>& bias, const int& left, const float& epsilon,
+            const typename KernelTuple::attr_type& right) {
+          EXPECT_TRUE(tgt != nullptr);
+          std::vector<T> outtgt(outref_.size());
+          std::vector<T> x(x_.size());
+          std::vector<T> mean(mean_.size());
+          std::vector<T> var(var_.size());
+          std::vector<T> outref(outref_.size());
+          std::copy(x_.begin(), x_.end(), x.begin());
+          std::copy(mean_.begin(), mean_.end(), mean.begin());
+          std::copy(var_.begin(), var_.end(), var.begin());
+          std::copy(outref_.begin(), outref_.end(), outref.begin());
+
+          EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
+          EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
+          EXPECT_EQ(mean.size(), static_cast<size_t>(left));
+          EXPECT_EQ(var.size(), static_cast<size_t>(left));
+          EXPECT_EQ(scale.size(), static_cast<size_t>(right));
+          EXPECT_EQ(bias.size(), static_cast<size_t>(right));
+
+          const T* scale_data = scale.data();
+          const T* bias_data = bias.data();
+          T* x_data = x.data();
+          T* mean_data = mean.data();
+          T* var_data = var.data();
+          T* outref_data = outref.data();
+          T* outtgt_data = outtgt.data();
+          tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data,
+              left, epsilon, right);
+          ExpectEQ<T>(outtgt_data, outref_data, left * right);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(right, verifier, x, outref, mean,
+                                             var, scale, bias, left, epsilon,
+                                             right);
+      }
+    }
+  }
+}
+
+template <typename KernelTuple, typename PlaceType>
+void TestKernelCRFDecoding() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  constexpr int state_trans_base_idx = 2;
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : test_sizes) {
+      auto ref = jit::GetReferFunc<KernelTuple>();
+      EXPECT_TRUE(ref != nullptr);
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
+      std::vector<int> trackref(x_sz);
+      RandomVec<T>(x_sz, x.data());
+      RandomVec<T>(w_sz, w.data());
+
+      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
+          trackref.data(), tag_num);
+
+      auto verifier = [](
+          const typename KernelTuple::func_type tgt, const int& seq_len,
+          const std::vector<T>& x, const std::vector<T>& w,
+          const std::vector<T>& alpharef, const std::vector<int>& trackref,
+          const typename KernelTuple::attr_type& tag_num) {
+        constexpr int state_trans_base_idx = 2;
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
+        EXPECT_EQ(w.size(), static_cast<size_t>(
+                                (tag_num + state_trans_base_idx) * tag_num));
+        EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
+        EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
+        std::vector<T> alphatgt(alpharef.size());
+        std::vector<int> tracktgt(trackref.size());
+        memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int));
+        tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
+            tracktgt.data(), tag_num);
+        ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
+        ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
+      };
+      TestAllImpls<KernelTuple, PlaceType>(tag_num, verifier, seq_len, x, w,
+                                           alpharef, trackref, tag_num);
+    }
+  }
+}
+
+template <typename KernelTuple, typename PlaceType>
+void TestKernelSeqPool() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
   auto test_sizes = TestSizes();
@@ -668,7 +589,7 @@ void TestKernelSeqPoolTuples() {
       jit::seq_pool_attr_t attr(w, type);
       for (int h : test_sizes) {
         attr.h = h;
-        auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
         EXPECT_TRUE(ref != nullptr);
         std::vector<T> x(h * w), yref(w);
         RandomVec<T>(h * w, x.data());
@@ -676,16 +597,86 @@ void TestKernelSeqPoolTuples() {
         T* yref_data = yref.data();
         ref(x_data, yref_data, &attr);
         VLOG(10) << attr;
-        TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>>(attr, x, yref, attr);
+        auto verifier = [](const typename KernelTuple::func_type tgt,
+                           const std::vector<T>& x, const std::vector<T>& yref,
+                           const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
+          int w = yref.size();
+          std::vector<T> y(w);
+          const T* x_data = x.data();
+          const T* yref_data = yref.data();
+          T* y_data = y.data();
+          tgt(x_data, y_data, &attr);
+          ExpectEQ<T>(y_data, yref_data, w);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, x, yref, attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelMatMulTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelEmbSeqPool() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  int64_t tbl_h = 1e4;
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum};  // only support sum yet
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int tbl_w : test_sizes) {
+    std::vector<T> table(tbl_h * tbl_w);
+    RandomVec<T>(tbl_h * tbl_w, table.data());
+    const T* table_data = table.data();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          auto ref = jit::GetReferFunc<KernelTuple>();
+          EXPECT_TRUE(ref != nullptr);
+          std::vector<int64_t> idx(idx_h * idx_w);
+          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
+          int64_t out_w = tbl_w * idx_w;
+          std::vector<T> oref(out_w);
+          const int64_t* idx_data = idx.data();
+          T* o_data = oref.data();
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          ref(table_data, idx_data, o_data, &attr);
+
+          auto verifier = [](const typename KernelTuple::func_type tgt,
+                             const std::vector<T>& table,
+                             const std::vector<int64_t>& idx,
+                             const std::vector<T>& oref,
+                             const typename KernelTuple::attr_type& attr) {
+            EXPECT_TRUE(tgt != nullptr);
+            EXPECT_EQ(table.size(), static_cast<size_t>(attr.table_height *
+                                                        attr.table_width));
+            EXPECT_EQ(idx.size(), static_cast<size_t>(attr.index_height *
+                                                      attr.index_width));
+            EXPECT_EQ(oref.size(),
+                      static_cast<size_t>(attr.table_width * attr.index_width));
+            const T* table_data = table.data();
+            const int64_t* idx_data = idx.data();
+            const T* oref_data = oref.data();
+            int o_w = oref.size();
+            std::vector<T> out(o_w);
+            T* o_data = out.data();
+            tgt(table_data, idx_data, o_data, &attr);
+            ExpectEQ<T>(o_data, oref_data, o_w);
+          };
+          TestAllImpls<KernelTuple, PlaceType>(attr, verifier, table, idx, oref,
+                                               attr);
+        }
+      }
+    }
+  }
+}
+
+template <typename KernelTuple, typename PlaceType>
+void TestKernelMatMul() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   auto last_acc = FLAGS_acc;
   // export MKL_CBWR=AVX would make MKL force to use AVX
   // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
@@ -693,7 +684,7 @@ void TestKernelMatMulTuples() {
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
-        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
         EXPECT_TRUE(ref != nullptr);
         std::vector<T> a(m * k), b(k * n), c(m * n);
         RandomVec<T>(m * k, a.data());
@@ -703,20 +694,36 @@ void TestKernelMatMulTuples() {
         T* c_data = c.data();
         const jit::matmul_attr_t attr{m, n, k};
         ref(a_data, b_data, c_data, &attr);
-        TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
+        auto verifier = [](const typename KernelTuple::func_type tgt,
+                           const std::vector<T>& a, const std::vector<T>& b,
+                           const std::vector<T>& cref,
+                           const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
+          EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
+          EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
+          std::vector<T> c(cref.size());
+          const T* a_data = a.data();
+          const T* b_data = b.data();
+          const T* cref_data = cref.data();
+          T* c_data = c.data();
+          tgt(a_data, b_data, c_data, &attr);
+          ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, a, b, c, attr);
       }
     }
   }
   FLAGS_acc = last_acc;
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelSoftmaxTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelSoftmax() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   for (int bs : {1, 2, 10}) {
     for (int n : TestSizes()) {
-      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
+      auto ref = jit::GetReferFunc<KernelTuple>();
       EXPECT_TRUE(ref != nullptr);
       std::vector<T> x(bs * n), y(bs * n);
       RandomVec<T>(bs * n, x.data());
@@ -730,51 +737,33 @@ void TestKernelSoftmaxTuples() {
       ref(xinp_data, xinp_data, n, bs);
       ExpectEQ<T>(xinp_data, y_data, n * bs);
 
-      TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
-                   std::vector<T>>(n, x, y, n, bs);
-    }
-  }
-}
-
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelEmbSeqPoolTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  int64_t tbl_h = 1e4;
-  std::vector<jit::SeqPoolType> pool_types = {
-      jit::SeqPoolType::kSum};  // only support sum yet
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int tbl_w : test_sizes) {
-    std::vector<T> table(tbl_h * tbl_w);
-    RandomVec<T>(tbl_h * tbl_w, table.data());
-    const T* table_data = table.data();
-    for (auto type : pool_types) {
-      for (int idx_w : {1, 2, 10, 16}) {
-        for (int idx_h : {1, 2, 9, 13, 16}) {
-          auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
-          EXPECT_TRUE(ref != nullptr);
-          std::vector<int64_t> idx(idx_h * idx_w);
-          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
-          int64_t out_w = tbl_w * idx_w;
-          std::vector<T> oref(out_w);
-          const int64_t* idx_data = idx.data();
-          T* o_data = oref.data();
-          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
-                                        type);
-          ref(table_data, idx_data, o_data, &attr);
-
-          TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
-                       std::vector<int64_t>, std::vector<T>>(attr, table, idx,
-                                                             oref, attr);
-        }
-      }
+      auto verifier = [](const typename KernelTuple::func_type tgt,
+                         const std::vector<T>& x, const std::vector<T>& yref,
+                         int n, int bs) {
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(yref.size(), x.size());
+        EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
+        const T* x_data = x.data();
+        const T* yref_data = yref.data();
+        std::vector<T> ytgt(n * bs);
+        T* ytgt_data = ytgt.data();
+        // test normal
+        tgt(x_data, ytgt_data, n, bs);
+        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
+        // test inplace x
+        std::copy(x.begin(), x.end(), ytgt.begin());
+        tgt(ytgt_data, ytgt_data, n, bs);
+        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
+      };
+      TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs);
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelSgdTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelSgd() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
   const T lr = 0.1;
   auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                   const int64_t upper) -> std::vector<int64_t> {
@@ -802,7 +791,7 @@ void TestKernelSgdTuples() {
         RandomVec<T>(rows_size * grad_w, grad.data());
         const int64_t* rows_data = rows.data();
         const T* grad_data = grad.data();
-        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
         EXPECT_TRUE(ref != nullptr);
         jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
         ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
@@ -818,227 +807,488 @@ void TestKernelSgdTuples() {
                       grad_w);
         }
 
-        TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
-                     std::vector<T>, std::vector<int64_t>, std::vector<T>>(
-            attr, lr, param, grad, rows, param_out, attr);
+        auto verifier = [](
+            const typename KernelTuple::func_type tgt, const T lr,
+            const std::vector<T>& param, const std::vector<T>& grad,
+            const std::vector<int64_t>& rows, const std::vector<T>& oref,
+            const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(param.size(),
+                    static_cast<size_t>(attr.param_height * attr.param_width));
+          EXPECT_EQ(grad.size(),
+                    static_cast<size_t>(attr.grad_height * attr.grad_width));
+          EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
+          EXPECT_EQ(param.size(), oref.size());
+          const T* param_data = param.data();
+          const T* grad_data = grad.data();
+          const int64_t* rows_data = rows.data();
+          const T* oref_data = oref.data();
+
+          std::vector<T> out(oref.size());
+          T* o_data = out.data();
+          tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
+          // only the selected rows should be equal
+          for (size_t i = 0; i < rows.size(); ++i) {
+            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
+          }
+
+          // inplace
+          std::copy(param.begin(), param.end(), out.begin());
+          tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
+          for (size_t i = 0; i < rows.size(); ++i) {
+            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
+          }
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, lr, param, grad,
+                                             rows, param_out, attr);
       }
     }
   }
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelNCHW16CMulNCTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  const int n = 3, c = 16 * 4, h = 10, w = 10;
-  auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
-  EXPECT_TRUE(ref != nullptr);
-  int sz = n * c * h * w;
-  std::vector<T> x(sz), y(n * c), zref(sz);
-  std::vector<T> ztgt(sz), zjit(sz);
-  RandomVec<T>(sz, x.data());
-  RandomVec<T>(n * c, y.data());
-
-  const T* x_data = x.data();
-  const T* y_data = y.data();
-  T* zref_data = zref.data();
-  T* ztgt_data = ztgt.data();
-  T* zjit_data = zjit.data();
-  constexpr int simd_width = ZMM_FLOAT_BLOCK;
-  int C = c / simd_width;
-  auto tgt = jit::Get<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
-  auto jitcode = jit::GetJitCode<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
-  EXPECT_TRUE(tgt != nullptr);
-
-  if (std::is_same<T, float>::value &&
-      paddle::platform::MayIUse(paddle::platform::avx512f)) {
-    EXPECT_TRUE(jitcode != nullptr);
-  }
-  for (int ni = 0; ni < n; ni++) {
-    for (int ci = 0; ci < C; ci++) {
-      auto ptr_x =
-          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
-      auto ptr_zref =
-          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_ztgt =
-          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-
-      ref(ptr_x, ptr_y, ptr_zref, h, w);
-      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
+template <typename KernelTuple, typename PlaceType>
+void TestKernelVBroadcast() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  for (int w : TestSizes()) {
+    std::vector<T> x(w);
+    RandomVec<T>(w, x.data());
+    const T* x_data = x.data();
+    for (int64_t h : {1, 2, 6}) {
+      auto ref = jit::GetReferFunc<KernelTuple>();
+      EXPECT_TRUE(ref != nullptr);
+      std::vector<T> y(w * h);
+      T* y_data = y.data();
+      ref(x_data, y_data, h, w);
 
-      if (jitcode) {
-        auto ptr_zjit =
-            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
-      }
+      auto verifier = [](const typename KernelTuple::func_type tgt,
+                         const std::vector<T>& x, const std::vector<T>& yref,
+                         const int64_t& h,
+                         const typename KernelTuple::attr_type& attr) {
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(x.size(), static_cast<size_t>(attr));
+        EXPECT_EQ(yref.size(), x.size() * h);
+        std::vector<T> y(yref.size());
+        const T* x_data = x.data();
+        const T* yref_data = yref.data();
+        T* y_data = y.data();
+        tgt(x_data, y_data, h, attr);
+        ExpectEQ<T>(y_data, yref_data, yref.size());
+      };
+      TestAllImpls<KernelTuple, PlaceType>(static_cast<int64_t>(w), verifier, x,
+                                           y, h, static_cast<int64_t>(w));
     }
   }
-  ExpectEQ<T>(ztgt_data, zref_data, sz);
-  if (jitcode) {
-    ExpectEQ<T>(zjit_data, zref_data, sz);
-  }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelLayerNormTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  const T epsilon = 9.99999975e-06;
-  for (int n : {1, 2, 10}) {
-    for (int x_dim_0 : {1, 9, 17, 50}) {
-      int left = n * x_dim_0;
-      for (int x_dim_1 : TestSizes()) {
-        int right = x_dim_1;
-        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
-        EXPECT_TRUE(ref != nullptr);
-        int sz = left * right;
-        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
-            outref(sz);
-        RandomVec<T>(sz, x.data());
-        RandomVec<T>(left, mean.data());
-        RandomVec<T>(left, var.data());
-        RandomVec<T>(right, scale.data());
-        RandomVec<T>(right, bias.data());
-
-        const T* scale_data = scale.data();
-        const T* bias_data = bias.data();
-        T* x_data = x.data();
-        T* mean_data = mean.data();
-        T* var_data = var.data();
-        T* outref_data = outref.data();
+// test pool
+TEST(JITKernel_pool, jitcreator) {
+  const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators();
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(jitcreators.size(), 0UL);
+#else
+  EXPECT_EQ(jitcreators.size(), 25UL);
+#endif
+}
 
-        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
-            left, epsilon, right);
+TEST(JITKernel_pool, jitpool) {
+  // jitpool is related with attr
+  const auto& kers = jit::JitCodePool<jit::kVAdd>().Instance().AllKernels();
+  EXPECT_EQ(kers.size(), 0UL);
+  jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);
+// after call GetAllCandidateKernels, it will create jitcode Automatically
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(kers.size(), 0UL);
+#else
+  EXPECT_EQ(kers.size(), 1UL);
+#endif
+}
 
-        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>, std::vector<T>,
-                     std::vector<T>, std::vector<T>, int, float>(
-            right, x, outref, mean, var, scale, bias, left, epsilon, right);
-      }
-    }
-  }
+TEST(JITKernel_pool, more) {
+  const auto& kers = jit::KernelPool::Instance().AllKernels();
+#if defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(kers.size(), 10UL);
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_EQ(kers.size(), 21UL);
+#else
+  EXPECT_EQ(kers.size(), 8UL);
+#endif
+#endif
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelCRFDecodingTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  constexpr int state_trans_base_idx = 2;
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
-  for (int seq_len : {1, 11, 17, 50}) {
-    for (int tag_num : test_sizes) {
-      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
-      EXPECT_TRUE(ref != nullptr);
-      int x_sz = seq_len * tag_num;
-      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
-      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
-      std::vector<int> trackref(x_sz);
-      RandomVec<T>(x_sz, x.data());
-      RandomVec<T>(w_sz, w.data());
+TEST(JITKernel_pool, refer) {
+  const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
+  EXPECT_EQ(kers.size(), 29UL);
+}
 
-      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
-          trackref.data(), tag_num);
+// test helper
+TEST(JITKernel_helper, GetAllCandidateKernels) {
+  auto fp_kers =
+      jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(fp_kers.size(), 1UL);  // refer
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
+#else
+  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode, refer
+#endif
+#endif
+
+  auto db_kers =
+      jit::GetAllCandidateKernels<jit::VExpTuple<double>, CPUPlace>(10);
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
+#else
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#endif
+#endif
+}
 
-      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
-                   std::vector<T>, std::vector<T>, std::vector<T>,
-                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
-                                          trackref, tag_num);
-    }
-  }
+TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) {
+  auto fp_kers =
+      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<float>, CPUPlace>(10);
+#if defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(fp_kers.size(), 1UL);  // refer
+#else
+#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32)
+  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode/mkl, refer
+#else
+  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
+#endif
+#endif
+
+  auto db_kers =
+      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<double>, CPUPlace>(10);
+#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML)
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#else
+  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
+#endif
 }
 
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelVBroadcastTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  for (int w : TestSizes()) {
-    std::vector<T> x(w);
-    RandomVec<T>(w, x.data());
-    const T* x_data = x.data();
-    for (int64_t h : {1, 2, 6}) {
-      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
-      EXPECT_TRUE(ref != nullptr);
-      std::vector<T> y(w * h);
-      T* y_data = y.data();
-      ref(x_data, y_data, h, w);
+TEST(JITKernel_helper, KernelFuncs) {
+  auto f1 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache().At(3);
+  auto f2 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[3];
+  EXPECT_TRUE(f1 != nullptr);
+  EXPECT_TRUE(f1 == f2);
+
+  auto f3 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[5];
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_TRUE(f2 == f3);
+#else
+  EXPECT_TRUE(f2 != f3);
+#endif
+}
 
-      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
-                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
-                                            static_cast<int64_t>(w));
-    }
+TEST(JITKernel_helper, GetAllCandidateFuncs) {
+  auto funcs = jit::GetAllCandidateFuncs<jit::VExpTuple<float>, CPUPlace>(10);
+  auto kers = jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
+  EXPECT_EQ(funcs.size(), kers.size());
+
+  std::vector<float> x(10), tgt(10);
+  RandomVec<float>(10, x.data());
+  auto best = jit::GetDefaultBestFunc<jit::VExpTuple<float>, CPUPlace>(10);
+  best(x.data(), tgt.data(), 10);
+  for (auto f : funcs) {
+    std::vector<float> y(10);
+    f(x.data(), y.data(), 10);
+    ExpectEQ<float>(y.data(), tgt.data(), 10);
   }
 }
 
-#define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
-  TEST(JITKernel, kernel_type) {                                 \
-    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
-    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
+TEST(JITKernel_helper, pack_weights) {
+  const int N = 8 * 60, K = 2;
+  float src[K][N], yref[K][N], y[K * N];
+  float* x = &(src[0][0]);
+  float* ref = &(yref[0][0]);
+  for (int i = 0; i < N * K; ++i) {
+    *(x + i) = static_cast<float>(i);
+  }
+  int block = 0;
+  std::vector<int> groups;
+  if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
+    block = ZMM_FLOAT_BLOCK;
+    groups.push_back(30);
+  } else {
+    block = YMM_FLOAT_BLOCK;
+    groups.insert(groups.end(), {14, 14, 14, 14, 4});
   }
 
-TEST_CPU_KERNEL(XYZNTuples, kVMul);
-TEST_CPU_KERNEL(XYZNTuples, kVAdd);
-TEST_CPU_KERNEL(XYZNTuples, kVAddRelu);
-TEST_CPU_KERNEL(XYZNTuples, kVSub);
-
-TEST_CPU_KERNEL(AXYNTuples, kVScal);
-TEST_CPU_KERNEL(AXYNTuples, kVAddBias);
+  int offset = 0;
+  int acc = 0;
+  for (int g : groups) {
+    g = g * block;
+    for (int k = 0; k < K; ++k) {
+      for (int i = 0; i < g; ++i) {
+        *(ref + offset) = src[k][i + acc];
+        offset++;
+      }
+    }
+    acc += g;
+  }
 
-TEST_CPU_KERNEL(XRNTuples, kHMax);
-TEST_CPU_KERNEL(XRNTuples, kHSum);
+  jit::pack_weights<float>(x, y, N, K);
+  ExpectEQ<float>(y, ref, N * K);
+}
 
-TEST_CPU_KERNEL(XYNTuples, kVRelu);
-TEST_CPU_KERNEL(XYNTuples, kVIdentity);
-TEST_CPU_KERNEL(XYNTuples, kVSquare);
-TEST_CPU_KERNEL(XYNTuples, kVExp);
-TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
-TEST_CPU_KERNEL(XYNTuples, kVTanh);
-TEST_CPU_KERNEL(XYNTuples, kVCopy);
+TEST(JITKernel_helper, attr) {
+  std::ostringstream out;
+  // KernelTypes
+  out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding)
+      << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1)
+      << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2)
+      << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax)
+      << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1)
+      << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul)
+      << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool)
+      << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd)
+      << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu)
+      << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy)
+      << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity)
+      << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu)
+      << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd)
+      << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare)
+      << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh);
+  EXPECT_EQ(out.str().size(), 234);
+
+  // SeqPoolTypes
+  out.str("");
+  out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg)
+      << jit::to_string(jit::kSqrt);
+  EXPECT_EQ(out.str().size(), 13);
+
+  EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu);
+  EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity);
+  EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp);
+  EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid);
+  EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh);
+
+  out.str("");
+  out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  EXPECT_EQ(out.str().size(), 89);
+
+  out.str("");
+  out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid);
+  EXPECT_EQ(out.str().size(), 52);
+
+  out.str("");
+  out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum);
+  EXPECT_EQ(out.str().size(), 44);
+
+  out.str("");
+  out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg);
+  EXPECT_EQ(out.str().size(), 93);
+
+  out.str("");
+  out << jit::sgd_attr_t(1, 2, 3, 4, 5);
+  EXPECT_EQ(out.str().size(), 81);
+
+  out.str("");
+  out << jit::matmul_attr_t(1, 2, 3);
+  EXPECT_EQ(out.str().size(), 14);
+}
 
-TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
-TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
+// test keys
+TEST(JITKernel_key, int) {
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int>(2));
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int64_t>(2));
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) != jit::JitCodeKey<int>(3));
+}
 
-TEST_CPU_KERNEL(GRUTuples, kGRUH1);
-TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1);
-TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2);
+TEST(JITKernel_key, gru) {
+  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+  jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity);
 
-TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC);
+  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::gru_attr_t>(attr5);
 
-TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool);
-TEST_CPU_KERNEL(MatMulTuples, kMatMul);
-TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax);
-TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
-TEST_CPU_KERNEL(SgdTuples, kSgd);
-TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
-TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
-TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
+  EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
+}
 
 TEST(JITKernel_key, lstm) {
   jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
   jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
   jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
+  jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
 
   auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
   auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
   auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
   auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::lstm_attr_t>(attr5);
+  auto key6 = jit::JitCodeKey<jit::lstm_attr_t>(attr6);
 
-  EXPECT_TRUE(key1 != key2);
-  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
   EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
+  EXPECT_TRUE(key5 == key6);
 }
 
-TEST(JITKernel_key, gru) {
-  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+TEST(JITKernel_key, seq_pool) {
+  jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1);
+  jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3);
+  jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3);
+  jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3);
 
-  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
+  auto key1 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr4);
 
-  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+
+TEST(JITKernel_key, matmul) {
+  jit::matmul_attr_t attr1(1, 2, 3);
+  jit::matmul_attr_t attr2(1, 2, 3);
+  jit::matmul_attr_t attr3(1, 3, 3);
+  jit::matmul_attr_t attr4(2, 3, 4);
+
+  auto key1 = jit::JitCodeKey<jit::matmul_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::matmul_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::matmul_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::matmul_attr_t>(attr4);
+
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+
+TEST(JITKernel_key, emb_seq_pool) {
+  jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg);
+  jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum);
+
+  auto key1 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr5);
+
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
+  EXPECT_TRUE(key4 != key5);
+}
+
+TEST(JITKernel_key, sgd) {
+  jit::sgd_attr_t attr1(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr2(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr3(9, 8, 7, 4, 6);
+  jit::sgd_attr_t attr4(1, 2, 3, 6, 5);
+  jit::sgd_attr_t attr5(10, 9, 8, 7, 6);
+
+  auto key1 = jit::JitCodeKey<jit::sgd_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::sgd_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::sgd_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::sgd_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::sgd_attr_t>(attr5);
+
+  EXPECT_TRUE(key1 == key2);
   EXPECT_TRUE(key2 == key3);
   EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
 }
-// TODO(TJ): add more test about key and pool
+
+// test kernerls
+#define TestKernelVMul TestKernelXYZN
+#define TestKernelVAdd TestKernelXYZN
+#define TestKernelVAddRelu TestKernelXYZN
+#define TestKernelVSub TestKernelXYZN
+
+#define TestKernelVScal TestKernelAXYN
+#define TestKernelVAddBias TestKernelAXYN
+
+#define TestKernelVRelu TestKernelXYN
+#define TestKernelVIdentity TestKernelXYN
+#define TestKernelVSquare TestKernelXYN
+#define TestKernelVExp TestKernelXYN
+#define TestKernelVSigmoid TestKernelXYN
+#define TestKernelVTanh TestKernelXYN
+#define TestKernelVCopy TestKernelXYN
+
+#define TestKernelHMax TestKernelXRN
+#define TestKernelHSum TestKernelXRN
+
+#define TestKernelLSTMCtHt TestKernelLSTM
+#define TestKernelLSTMC1H1 TestKernelLSTM
+
+#define TestKernelGRUH1 TestKernelGRU
+#define TestKernelGRUHtPart1 TestKernelGRU
+#define TestKernelGRUHtPart2 TestKernelGRU
+
+#define TEST_CPU_KERNEL(kernel_type)                                      \
+  TEST(JITKernel, kernel_type) {                                          \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<float>, CPUPlace>();  \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<double>, CPUPlace>(); \
+  }
+
+TEST_CPU_KERNEL(VMul);
+TEST_CPU_KERNEL(VAdd);
+TEST_CPU_KERNEL(VAddRelu);
+TEST_CPU_KERNEL(VSub);
+
+TEST_CPU_KERNEL(VScal);
+TEST_CPU_KERNEL(VAddBias);
+
+TEST_CPU_KERNEL(VRelu);
+TEST_CPU_KERNEL(VIdentity);
+TEST_CPU_KERNEL(VSquare);
+TEST_CPU_KERNEL(VExp);
+TEST_CPU_KERNEL(VSigmoid);
+TEST_CPU_KERNEL(VTanh);
+TEST_CPU_KERNEL(VCopy);
+
+TEST_CPU_KERNEL(HMax);
+TEST_CPU_KERNEL(HSum);
+
+TEST_CPU_KERNEL(LSTMCtHt);
+TEST_CPU_KERNEL(LSTMC1H1);
+
+TEST_CPU_KERNEL(GRUH1);
+TEST_CPU_KERNEL(GRUHtPart1);
+TEST_CPU_KERNEL(GRUHtPart2);
+
+TEST_CPU_KERNEL(NCHW16CMulNC);
+TEST_CPU_KERNEL(LayerNorm);
+TEST_CPU_KERNEL(CRFDecoding);
+
+TEST_CPU_KERNEL(SeqPool);
+TEST_CPU_KERNEL(EmbSeqPool);
+TEST_CPU_KERNEL(MatMul);
+TEST_CPU_KERNEL(Softmax);
+TEST_CPU_KERNEL(Sgd);
+TEST_CPU_KERNEL(VBroadcast);
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index b9db6daf08..9b1a854a31 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/layer_norm_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -133,7 +134,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
     }
     if (ctx->HasOutput(framework::GradVarName("Bias"))) {
       ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
+                        ctx->GetInputDim("Scale"));
     }
   }
 
@@ -157,12 +158,39 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("layer_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Mean", Output("Mean"));
+    op->SetInput("Variance", Output("Variance"));
+    if (ForwardOp().Inputs().count("Scale") > 0) {
+      op->SetInput("Scale", Input("Scale"));
+      op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    }
+
+    if (ForwardOp().Inputs().count("Bias") > 0) {
+      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    }
+
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LayerNormGradOpDescMaker);
 REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index f564a10396..db794ed421 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(bias->numel(), right);
 
     auto ker =
-        jit::Get<jit::kLayerNorm, jit::LayerNormTuples<T>, platform::CPUPlace>(
-            right);
+        jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
+            .At(right);
     ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
         scale->data<T>(), bias->data<T>(), static_cast<int>(left),
         static_cast<const float>(epsilon), right);
@@ -245,11 +245,9 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
     auto x = *ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
     auto* mean = ctx.Input<Tensor>("Mean");
     auto* var = ctx.Input<Tensor>("Variance");
     auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
     auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
 
@@ -275,18 +273,13 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
       x.Resize(matrix_shape);
       temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
 
-      if (!(bias && scale)) {
-        temp_norm.ShareDataWith(*y);
-        temp_norm.Resize(matrix_shape);
-      } else {
-        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
-        // get x_norm
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
-        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-            ctx, &temp_norm, var, /*axis*/ 0,
-            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
-      }
+      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
+      // get x_norm
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
+      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, var, /*axis*/ 0,
+          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
     }
 
     if (d_bias) {
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index f5c802986e..2948cf71a9 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/load_combine_op.h"
 
 namespace paddle {
 namespace operators {
 
-class LoadCombineOp : public framework::OperatorBase {
+class LoadCombineOp : public framework::OperatorWithKernel {
  public:
-  LoadCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto model_from_memory = Attr<bool>("model_from_memory");
-    auto out_var_names = Outputs("Out");
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "Cannot open file %s for load_combine op", filename);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-  void LoadParamsFromBuffer(
-      const framework::Scope &scope, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      auto *out_var = scope.FindVar(out_var_names[i]);
-
-      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                     out_var_names[i]);
-
-      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_var->Clear();
-        tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, ctx.GetPlace());
+    return kt;
   }
 };
 
@@ -124,21 +62,30 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 LoadCombine Operator.
 
-LoadCombine operator loads LoDTensor variables from a file, which could be 
-loaded in memory already. The file should contain one or more LoDTensors 
+LoadCombine operator loads LoDTensor variables from a file, which could be
+loaded in memory already. The file should contain one or more LoDTensors
 serialized using the SaveCombine operator. The
-LoadCombine operator applies a deserialization strategy to appropriately load 
-the LodTensors, and this strategy complements the serialization strategy used 
+LoadCombine operator applies a deserialization strategy to appropriately load
+the LodTensors, and this strategy complements the serialization strategy used
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
-with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+with the SaveCombine operator, and can only deserialize one or more LoDTensors
 that were saved using the SaveCombine operator.
 
 )DOC");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
+
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                   ops::LoadCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu
new file mode 100644
index 0000000000..2a42c0daa7
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h
new file mode 100644
index 0000000000..8f620ba7d2
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto model_from_memory = ctx.Attr<bool>("model_from_memory");
+    auto &out_var_names = ctx.Outputs("Out");
+
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename, std::ios::binary);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename, std::ios::in | std::ios::binary);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+
+  void LoadParamsFromBuffer(
+      const framework::ExecutionContext &context, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto out_vars = context.MultiOutputVar("Out");
+
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      PADDLE_ENFORCE(out_vars[i] != nullptr,
+                     "Output variable %s cannot be found", out_var_names[i]);
+
+      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
+
+      // Get data from fin to tensor
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
+
+      auto in_dtype = tensor->type();
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
+        out_vars[i]->Clear();
+        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
+      }
+    }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 4bce4eba22..2d8e6ca854 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
 
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
+#include <string>
+
+#include "paddle/fluid/operators/load_op.h"
 
 namespace paddle {
 namespace operators {
 
-class LoadOp : public framework::OperatorBase {
+class LoadOp : public framework::OperatorWithKernel {
  public:
-  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    auto out_var_name = Output("Out");
-    auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Output variable %s cannot be found in scope %p",
-                   out_var_name, &scope);
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 
-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, platform::CPUPlace());
+    return kt;
   }
 };
 
@@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "file.");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu
new file mode 100644
index 0000000000..90f78110f8
--- /dev/null
+++ b/paddle/fluid/operators/load_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h
new file mode 100644
index 0000000000..3bf3c6bed2
--- /dev/null
+++ b/paddle/fluid/operators/load_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    auto filename = ctx.Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    auto out_var_name = ctx.Outputs("Out").data();
+    auto *out_var = ctx.OutputVar("Out");
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
+                   out_var_name);
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable cannot be found ");
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var, ctx);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
+
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var,
+                     const framework::ExecutionContext &ctx) const {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    DeserializeFromStream(fin, tensor, dev_ctx);
+
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto in_dtype = tensor->type();
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
+    }
+  }
+
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 166952fe23..0a43ac0c52 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
 
 class LoDRankTableInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o : op_desc.Output("Out")) {
-      block->FindRecursiveOrCreateVar(o).SetType(
-          framework::proto::VarType::LOD_RANK_TABLE);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o : ctx->Output("Out")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 7c8fe5fbd7..a814c365d7 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -32,7 +32,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_GT(level0.size(), 1,
                         "If Input(Y) not provided, the target lod should be "
                         "specified by attribute `target_lod`.");
+    } else {
+      ctx->ShareLoD("Y", "Out");
     }
+
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
 
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 9b91cf5260..61e3427370 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
 
 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
     }
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 0029932bc0..8d1ebe6b1c 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -147,22 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 
 class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
new file mode 100644
index 0000000000..8cc24200d3
--- /dev/null
+++ b/paddle/fluid/operators/math.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+#include "math.h"  // NOLINT
+
+namespace paddle {
+namespace operators {
+
+inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
+  return static_cast<platform::float16>(::expf(static_cast<float>(x)));
+}
+
+inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
+
+inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }
+
+inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
+  return static_cast<platform::float16>(::logf(static_cast<float>(x)));
+}
+
+inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
+
+inline HOSTDEVICE double real_log(double x) { return ::log(x); }
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index cb200ec8d6..44cbdf2e98 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -20,17 +21,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-namespace {
-
-__device__ __forceinline__ float real_log(float x) { return logf(x); }
-
-__device__ __forceinline__ double real_log(double x) { return log(x); }
-
-__device__ __forceinline__ platform::float16 real_log(
-    const platform::float16& val) {
-  return static_cast<platform::float16>(logf(static_cast<float>(val)));
-}
-
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
@@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
     Y[blockIdx.x] = -val;
   }
 }
-}  // namespace
 
 template <typename T>
 class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index 0ad57c51be..66ce57594a 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -30,17 +30,16 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
     return;
   }
   if (relu) {
-    auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
-                                    platform::CPUPlace>::Cache()
-                       .At(N);
+    auto compute =
+        jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
+            N);
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
       compute(B, dst, dst, N);
     }
   } else {
-    auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
-                                    platform::CPUPlace>::Cache()
-                       .At(N);
+    auto compute =
+        jit::KernelFuncs<jit::VAddTuple<T>, platform::CPUPlace>::Cache().At(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index 035e10dcbe..1b43306790 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                    "The numel of 'pad_value' can only be 1 or be equal to the "
                    "'step_width'.");
 
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
-      pad_tensor->Resize(pad_tensor_dims);
-      return;
-    }
-
     const int kBlockSize = 512;
 
     /* At least use 32 threads to copy sequence_width elements,
@@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
-
+    /*
     if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
       TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
       seq_tensor->Resize(seq_tensor_dims);
       return;
     }
+    */
 
     const int kBlockSize = 512;
 
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 2a47502614..7af44f2b2c 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -256,8 +256,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
           static_cast<int>(input.numel() / input.dims()[0]),
           jit::SeqPoolType::kSum);
       auto seqpool =
-          jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
-              attr);
+          jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache()
+              .At(attr);
       for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
         attr.h = static_cast<int>(lod[i + 1] - lod[i]);
         seqpool(src, dst, &attr);
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index a1cb3f9728..d77b6712c5 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -82,8 +82,7 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
     const int kClassDim = 1;
     // 2D data. Batch x C
     auto compute_softmax =
-        jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>,
-                         platform::CPUPlace>::Cache()
+        jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
             .At(in_dims[kClassDim]);
     compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
   }
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 242a1b9ae9..f182827452 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel {
                                      context->Attrs().Get<bool>("transpose_Y"));
 
     PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
-    PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
-                   mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    if (context->IsRuntime()) {
+      PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
+                     mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    }
     std::vector<int64_t> dim_out;
     if (mat_dim_x.batch_size_ != 0) {
       dim_out = framework::vectorize(dim_x);
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 54c6a71111..97387af92f 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
 }
 
 static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
-                                                const mkldnn::engine& engine) {
-  constexpr auto data_type = mkldnn::memory::f32;
+                                                const mkldnn::engine& engine,
+                                                const memory::data_type& dt) {
   const auto dims = paddle::framework::vectorize2int(input.dims());
   const auto format = input.format();
-  auto description = memory::desc(dims, data_type, format);
+  auto description = memory::desc(dims, dt, format);
   auto mem_prim_desc = memory::primitive_desc(description, engine);
   return mem_prim_desc;
 }
 
+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
+}
+
 static platform::CPUPlace GetCpuPlace(
     const paddle::framework::ExecutionContext& ctx) {
   auto place = ctx.GetPlace();
@@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine(
   return dev_ctx.GetEngine();
 }
 
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const std::vector<const Tensor*> multi_input,
+                      const int64_t& concat_axis, const memory::data_type& dt) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  for (size_t i = 0; i < multi_input.size(); i++) {
+    platform::MKLDNNHandler::AppendKeyDims(
+        &key, paddle::framework::vectorize2int(multi_input[i]->dims()));
+  }
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out"));
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+  return key;
+}
+
 template <typename T>
 class ConcatPrimitiveFactory {
  public:
   concat::primitive_desc CreateConcatPrimDescriptor(
       const std::vector<const Tensor*> multi_input, Tensor* output,
-      int concat_axis, const mkldnn::engine& mkldnn_engine) {
-    CreateSourcesDescriptors(multi_input, mkldnn_engine);
-    auto dst_desc = CreateDstMemDescriptor(output);
+      int concat_axis, const mkldnn::engine& mkldnn_engine,
+      const memory::data_type& dt = memory::data_type::f32) {
+    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
+    auto dst_desc = CreateDstMemDescriptor(output, dt);
     return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
   }
 
@@ -79,23 +101,39 @@ class ConcatPrimitiveFactory {
     return concat(concat_pd, inputs, dst_mem.get());
   }
 
+  void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
+                               void* handler) {
+    srcs[i].set_data_handle(handler);
+  }
+
+  void SetDstDataHandle(const memory& dst_mem, void* handler) {
+    dst_mem.set_data_handle(handler);
+  }
+
+  std::vector<memory> GetSrcs() { return srcs; }
+
+  memory GetDst() { return dst_mem.get(); }
+
  private:
-  memory::desc CreateDstMemDescriptor(Tensor* output) {
+  memory::desc CreateDstMemDescriptor(Tensor* output,
+                                      const memory::data_type& dt) {
     auto dst_dims = paddle::framework::vectorize2int(output->dims());
-    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
-                        memory::format::any);
+    return memory::desc(dst_dims, dt, memory::format::any);
   }
 
   mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
-                                 Tensor* output, platform::CPUPlace place) {
+                                 Tensor* output,
+                                 const platform::CPUPlace& place) {
     return memory(concat_pd.dst_primitive_desc(),
                   output->mutable_data<T>(place));
   }
 
   void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
-                                const mkldnn::engine& mkldnn_engine) {
+                                const mkldnn::engine& mkldnn_engine,
+                                const memory::data_type& dt) {
     for (size_t i = 0; i < multi_input.size(); i++) {
-      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      auto mem_prim_desc =
+          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
       srcs_pd.push_back(mem_prim_desc);
       srcs.push_back(
           memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
@@ -120,21 +158,59 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto place = GetCpuPlace(ctx);
-    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
-
     auto multi_input = ctx.MultiInput<Tensor>("X");
     EnforceLayouts(multi_input);
     Tensor* output = ctx.Output<Tensor>("Out");
     int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    auto place = GetCpuPlace(ctx);
+
+    memory::data_type dt =
+        paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
 
     ConcatPrimitiveFactory<T> prim_creator;
-    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
-        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
-    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
-    stream(stream::kind::eager).submit({concat}).wait();
+    std::string key = CreateKey(ctx, multi_input, concat_axis, dt);
+    const std::string key_prim = key + "@concat_p";
+    const std::string key_concat_pd = key + "@concat_pd";
+    const std::string key_srcs = key + "@concat_srcs";
+    const std::string key_dst = key + "@concat_dst";
+
+    std::shared_ptr<concat::primitive_desc> concat_pd;
+    std::shared_ptr<std::vector<memory>> srcs;
+    std::shared_ptr<memory> dst_mem;
+    auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
+
+    if (concat_p == nullptr) {
+      const auto& mkldnn_engine = dev_ctx.GetEngine();
+      concat_pd = std::make_shared<concat::primitive_desc>(
+          prim_creator.CreateConcatPrimDescriptor(multi_input, output,
+                                                  static_cast<int>(concat_axis),
+                                                  mkldnn_engine, dt));
+      concat_p = std::make_shared<concat>(
+          prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
+      srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
+      dst_mem = std::make_shared<memory>(prim_creator.GetDst());
+      dev_ctx.SetBlob(key_prim, concat_p);
+      dev_ctx.SetBlob(key_concat_pd, concat_pd);
+      dev_ctx.SetBlob(key_srcs, srcs);
+      dev_ctx.SetBlob(key_dst, dst_mem);
+    } else {
+      srcs = std::static_pointer_cast<std::vector<memory>>(
+          dev_ctx.GetBlob(key_srcs));
+      dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
+      concat_pd = std::static_pointer_cast<concat::primitive_desc>(
+          dev_ctx.GetBlob(key_concat_pd));
+      for (size_t i = 0; i < multi_input.size(); i++) {
+        prim_creator.SetSrcDataHandleByIndex(
+            *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
+      }
+      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
+    }
+
+    stream(stream::kind::eager).submit({*concat_p}).wait();
 
-    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
+    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
   }
 };
 }  // namespace operators
@@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConcatMKLDNNOpKernel<float>)
+                   ops::ConcatMKLDNNOpKernel<float>,
+                   ops::ConcatMKLDNNOpKernel<int8_t>,
+                   ops::ConcatMKLDNNOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 14ca3e8073..8d96ae7e42 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
                                                  &dst_memory_p);
         } else {
+          need_s8_to_u8 = fuse_relu;
           platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
                                                 &dst_memory_p);
         }
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 3a926a716f..69c0486eb6 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
 
@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
 
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
+
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
     auto dst_memory = mem.dst(output_data);
@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
 
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const T* input_data = input->data<T>();
+
+    const Tensor* w = ctx.Input<Tensor>("W");
+    const T* w_data = w->data<T>();
+
     if (input_grad) {
+      input_grad->Resize(input->dims());
       input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
     }
     if (w_grad) {
+      w_grad->Resize(w->dims());
       w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
     }
 
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const T* input_data = input->data<T>();
-
-    const Tensor* w = ctx.Input<Tensor>("W");
-    const T* w_data = w->data<T>();
-
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     const T* out_grad_data = out_grad->data<T>();
 
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index e41bfb80df..4debc7ca5e 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> axis_int8 = {0, 2, 3, 1};
+    if (axis.size() != 1) {
+      PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size());
+      for (size_t i = 0; i < axis.size(); i++) {
+        PADDLE_ENFORCE_EQ(axis[i], axis_int8[i],
+                          "Current INT8 MKLDNN Transpose kernel only surpport "
+                          "axis with [0, 2, 3, 1] due to MKL-DNN kernel "
+                          "implementation.");
+      }
+    }
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    output->ShareDataWith(*input);
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(input->format());
+  }
+};
+
 template <typename T>
 class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>);
+                   ops::TransposeMKLDNNOpKernel<float>,
+                   ops::TransposeINT8MKLDNNOpKernel<uint8_t>,
+                   ops::TransposeINT8MKLDNNOpKernel<int8_t>);
+
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNOpKernel<float>);
 
diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc
index 0018139cb0..6a0ae0dede 100644
--- a/paddle/fluid/operators/nccl/nccl_op.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cc
@@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase {
 
 class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Communicator").front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output("Communicator").front();
+    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
   }
 };
 
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 256da34912..fa7cc58c08 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 
 class NCEOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front();
 
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "nce_op_grad op " << weight_grad << " and "
               << " is set to SelectedRows";
-      block->Var(weight_grad)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "nce_op_grad op " << weight_grad << " and "
               << " is set to LoDTensor";
-      block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
+    ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index 41037d9039..9f73bbc1fd 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -29,7 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
@@ -42,130 +41,199 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
   for (int i = 0; i < dims.size(); ++i) {
     int k = dims[i];
     k = k == 0 ? 1 : k;
-    sp.push_back(k);
+    sp.emplace_back(k);
   }
   return sp;
 }
 
+static framework::DDim Shape2Ddim(const ngraph::Shape& shape) {
+  std::vector<int64_t> dims;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    int64_t k = shape[i];
+    dims.emplace_back(k);
+  }
+  return framework::make_ddim(dims);
+}
+
 static std::map<framework::proto::VarType::Type, ngraph::element::Type>
     pd2ng_type_map = {
         {framework::proto::VarType::FP32, ngraph::element::f32},
         {framework::proto::VarType::FP64, ngraph::element::f64},
         {framework::proto::VarType::INT32, ngraph::element::i32},
         {framework::proto::VarType::INT64, ngraph::element::i64},
-        {framework::proto::VarType::BOOL, ngraph::element::boolean},
-};
-
-std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphEngine::func_cache_ = {};
+        {framework::proto::VarType::BOOL, ngraph::element::boolean}};
+
+static std::map<ngraph::element::Type, framework::proto::VarType::Type>
+    ng2pd_type_map = {
+        {ngraph::element::f32, framework::proto::VarType::FP32},
+        {ngraph::element::f64, framework::proto::VarType::FP64},
+        {ngraph::element::i32, framework::proto::VarType::INT32},
+        {ngraph::element::i64, framework::proto::VarType::INT64},
+        {ngraph::element::boolean, framework::proto::VarType::BOOL}};
+
+std::vector<std::string> NgraphEngine::feed_vars = {};
+std::vector<std::string> NgraphEngine::fetch_vars = {};
+framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
+const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
+
+std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
+std::unordered_map<std::string,
+                   std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+    NgraphEngine::t_in_cache_ = {};
 
 std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 static std::vector<std::vector<int>> NgraphOpIntervals(
-    framework::BlockDesc* block) {
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
+  NgraphEngine::feed_vars.clear();
+  NgraphEngine::fetch_vars.clear();
   std::vector<std::vector<int>> intervals;
-  auto ops = block->AllOps();
-  int size = ops.size();
+
+  int size = ops->size();
   int left = 0;
-  while (left < size && ops.at(left)->Type() != framework::kFeedOpType) {
+  while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
+         ops->at(left)->Type() != framework::kFetchOpType) {
     ++left;
   }
-  if (left == size) {
-    return intervals;
-  }
-  while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
+
+  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
+    for (auto& var_name_item : ops->at(left)->Outputs()) {
+      for (auto& var_name : var_name_item.second) {
+        NgraphEngine::feed_vars.emplace_back(var_name);
+      }
+    }
     ++left;
   }
 
   int right = left;
-  while (right < size && ops.at(right)->Type() != framework::kFetchOpType) {
+  while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
     ++right;
   }
-  if (right == size) {
-    return intervals;
+
+  int index = right;
+  while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
+    for (auto& var_name_item : ops->at(index)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        NgraphEngine::fetch_vars.emplace_back(var_name);
+      }
+    }
+    ++index;
+  }
+
+  if (left == size || ops->at(left)->Type() == framework::kFetchOpType) {
+    left = 0;
   }
-  if (left >= right) return intervals;
 
   // (left, right - 1) represents indices between feed and fetch
   int pivot = left;
   while (pivot < right) {
-    auto op_type = ops.at(pivot)->Type();
+    auto op_type = ops->at(pivot)->Type();
     if (NgraphBridge::isRegister(op_type)) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
+             (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) {
         ++pivot;
         ++end;
       }
       std::vector<int> interval = {start, end};
-      intervals.push_back(interval);
+      intervals.emplace_back(interval);
     }
   }  // end while
   return intervals;
 }
 
-static void SubstituteNgraphOp(framework::BlockDesc* block,
-                               std::string block_str,
-                               std::vector<int> interval) {
-  framework::ProgramDesc program;
-  block->RemoveOp(interval.at(0), interval.at(1));
-  auto* ng_op = block->InsertOp(interval.at(0));
-  ng_op->SetType("ngraph_engine");
-  ng_op->SetAttr("interval", interval);
-  ng_op->SetAttr("graph", block_str);
+static void SubstituteNgraphOp(
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops,
+    std::string engine_key, std::string block_str, std::vector<int> interval) {
+  framework::OpDesc ng_op_desc(nullptr);
+  ng_op_desc.SetType("ngraph_engine");
+  ng_op_desc.SetAttr("interval", interval);
+  ng_op_desc.SetAttr("engine_key", engine_key);
+  ng_op_desc.SetAttr("graph", block_str);
+
+  ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]);
+  ops->insert(ops->begin() + interval[0],
+              framework::OpRegistry::CreateOp(ng_op_desc));
 }
 
-// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089
-void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) {
-#ifdef PADDLE_WITH_NGRAPH
-  VLOG(4) << "use_ngraph=True";
-  for (size_t bid = 0; bid < program.Size(); ++bid) {
-    // TODO(baojun-nervana): Remove the const_cast
-    auto* block =
-        const_cast<framework::ProgramDesc&>(program).MutableBlock(bid);
-    std::string block_str = block->Proto()->SerializeAsString();
-    auto intervals = NgraphOpIntervals(block);
-    for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-      SubstituteNgraphOp(block, block_str, *it);
-    }
+std::string SerializedBlock(const std::vector<framework::OpDesc*>& op_descs) {
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+
+  for (auto* op_desc : op_descs) {
+    auto* op = block_desc.AppendOp();
+    *op->Proto() = *op_desc->Proto();
+  }
+  return block_desc.Proto()->SerializeAsString();
+}
+
+std::string GenerateEngineKey(const framework::BlockDesc& bdesc) {
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+
+  for (auto& op_desc : bdesc.AllOps()) {
+    auto* op = block_desc.AppendOp();
+    *op->Proto() = *op_desc->Proto();
+  }
+  auto engine_key = std::to_string(
+      std::hash<std::string>()(block_desc.Proto()->SerializeAsString()));
+  return engine_key;
+}
+
+std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
+                              const std::vector<std::string>& engine_outputs,
+                              int size) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  engine_hash_key += std::to_string(size);
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
+
+void NgraphEngine::FuseNgraphOps(
+    const framework::BlockDesc& block_desc,
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
+  NgraphEngine::p_bdesc = &block_desc;
+  auto intervals = NgraphOpIntervals(ops);
+  std::string engine_key =
+      GenerateEngineKey(feed_vars, fetch_vars, ops->size());
+  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+    SubstituteNgraphOp(ops, engine_key, "", *it);
   }
-#else
-  LOG(WARNING)
-      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
-#endif
 }
 
 NgraphEngine::NgraphEngine(const framework::Scope& scope,
                            const platform::Place& place,
-                           const std::string& serialized_graph,
-                           const std::vector<int>& interval)
+                           const framework::ExecutionContext& ctx)
     : scope_(scope), place_(place) {
+  std::string serialized_graph = ctx.Attr<std::string>("graph");
+  auto interval = ctx.Attr<std::vector<int>>("interval");
+  std::string engine_key = ctx.Attr<std::string>("engine_key");
+
   var_in_node_map_ = std::make_shared<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
 
   var_node_map_ = std::make_shared<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
 
-  func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) +
-                    serialized_graph;
-
-  framework::proto::BlockDesc bdesc;
-  bdesc.ParseFromString(serialized_graph);
-  framework::BlockDesc block(nullptr, &bdesc);
-
-  Prepare(block, interval);
-
-  BuildNgIO();
-
-  GetNgFunction();
+  GetNgFunction(engine_key, interval);
 }
 
-void NgraphEngine::Prepare(const framework::BlockDesc& block,
-                           const std::vector<int>& interval) {
-  for (auto& var : block.AllVars()) {
+void NgraphEngine::Prepare(const std::vector<int>& interval) {
+  bool has_fetch = false, is_full = false;
+  for (auto& var : p_bdesc->AllVars()) {
     if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
@@ -192,125 +260,79 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block,
     }
   }
 
-  auto ops_desc = block.AllOps();
-  int idx = interval[0];
-  while (idx < interval[1]) {
-    auto op_desc = ops_desc.at(idx);
-    auto op = framework::OpRegistry::CreateOp(*op_desc);
-    fused_ops_.push_back(std::move(op));
-    ++idx;
-  }
-
-  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
-    auto op_desc = ops_desc.at(idx);
-    for (auto& var_name_item : op_desc->Inputs()) {
-      for (auto& var_name : var_name_item.second) {
-        post_op_inputs_.insert(var_name);
-      }
+  std::vector<paddle::framework::OpDesc*> ops_desc;
+  for (auto op_desc : p_bdesc->AllOps()) {
+    ops_desc.emplace_back(op_desc);
+    if (op_desc->Type() == framework::kFetchOpType) {
+      has_fetch = true;
     }
-    ++idx;
-  }
-
-  while (idx < static_cast<int>(ops_desc.size()) &&
-         ops_desc.at(idx)->Type() == framework::kFetchOpType) {
-    std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
-    fetches_.insert(fetch_target_name);
-    ++idx;
   }
 
-  if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
-      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
-    ng_op_state_ = OpState::FULL;
-  }
-
-  for (auto* op_desc : ops_desc) {
+  for (auto op_desc : ops_desc) {
     if (op_desc->Type().find("_grad") != std::string::npos) {
-      ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN
-                                                   : OpState::PARTIAL_TRAIN;
+      this->is_test_ = false;
       break;
     }
   }
 
-  if (ng_op_state_ != OpState::FULL_TRAIN &&
-      ng_op_state_ != OpState::PARTIAL_TRAIN) {
-    ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST
-                                                 : OpState::PARTIAL_TEST;
+  if (interval[0] > 0 &&
+      ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
+      interval[1] < static_cast<int>(ops_desc.size()) &&
+      ops_desc.at(interval[1])->Type() == framework::kFetchOpType) {
+    is_full = true;
   }
-}
 
-void NgraphEngine::GetNgInputShape(
-    std::shared_ptr<framework::OperatorBase> op) {
-  framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
-  op->RuntimeInferShape(scope_, place_, ctx);
-  for (auto& var_name_item : op->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto* var = scope_.FindVar(var_name);
-      if (var && var->IsType<framework::LoDTensor>()) {
-        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-        auto sp = Ddim2Shape(tensor_pd->dims());
-        if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
-            var_in_.end()) {
-          if (var_node_map_->find(var_name) == var_node_map_->end()) {
-            // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
-            auto ng_type = var_type_map_.at(var_name);
-            auto prm =
-                std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
-            (*var_node_map_)[var_name] = prm;
-            (*var_in_node_map_)[var_name] = prm;
-          }
-        }
-      }
-    }
+  if (is_full) {
+    this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
+  } else {
+    this->op_state_ =
+        this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN;
   }
-}
 
-void NgraphEngine::BuildNgNodes() {
-  for (auto& op : fused_ops_) {
-    for (auto& var_name_item : op->Outputs()) {
+  int idx = interval[0];
+  while (idx < interval[1]) {
+    this->fused_ops_.emplace_back(
+        framework::OpRegistry::CreateOp(*(ops_desc[idx])));
+    ++idx;
+  }
+  while (idx < static_cast<int>(ops_desc.size()) &&
+         ops_desc.at(idx)->Type() != framework::kFetchOpType) {
+    auto op_desc = ops_desc.at(idx);
+    for (auto& var_name_item : op_desc->Inputs()) {
       for (auto& var_name : var_name_item.second) {
-        if (var_node_map_->find(var_name) == var_node_map_->end()) {
-          auto* var = scope_.FindVar(var_name);
-          if (var && var->IsType<framework::LoDTensor>()) {
-            auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-            auto& ddim = tensor_pd->dims();
-            auto ng_shape = Ddim2Shape(ddim);
-            auto ng_type = var_type_map_.at(var_name);
-            auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
-                                                               ng_shape, true);
-            (*var_node_map_)[var_name] = prm;
-          }
-        }
+        this->post_op_inputs_.insert(var_name);
       }
     }
+    ++idx;
   }
-  NgraphBridge ngb(var_node_map_);
-  for (auto& op : fused_ops_) {
-    ngb.BuildNgNode(op);
+
+  if (!has_fetch) {
+    op_state_ = OpState::UNKNOWN;
   }
+
+  BuildNgIO(ops_desc, interval);
 }
 
-void NgraphEngine::BuildNgIO() {
+void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
+                             const std::vector<int>& interval) {
   std::unordered_set<std::string> inputs;
   std::unordered_set<std::string> outputs;
-
-  for (auto& op : fused_ops_) {
+  for (int i = interval[0]; i < interval[1]; ++i) {
+    auto op = ops_desc[i];
     for (auto& var_name_item : op->Inputs()) {
       for (auto& var_name : var_name_item.second) {
         inputs.insert(var_name);
         const bool is_output = outputs.find(var_name) != outputs.end();
         if (!is_output &&
             std::find(var_in_.begin(), var_in_.end(), var_name) ==
-                var_in_.end()) {
+                var_in_.end() &&
+            scope_.FindVar(var_name)) {
           // fill var_in here to keep lhs and rhs order
-          var_in_.push_back(var_name);
+          this->var_in_.emplace_back(var_name);
         }
       }
     }
 
-    if (op->Type() != "fill_constant") {
-      GetNgInputShape(op);
-    }
-
     for (auto& var_name_item : op->Outputs()) {
       PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
                         "op %s has more than 1 output - Not handling yet",
@@ -322,172 +344,279 @@ void NgraphEngine::BuildNgIO() {
   }
 
   // var_out.clear();
-  for (auto& op : fused_ops_) {
+  for (int i = interval[0]; i < interval[1]; ++i) {
+    auto op = ops_desc[i];
     for (auto& var_name_item : op->Outputs()) {
       PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
                         "op %s has more than 1 output - Not handling yet",
                         op->Type());
       for (auto& var_name : var_name_item.second) {
-        switch (ng_op_state_) {
+        switch (this->op_state_) {
           case OpState::PARTIAL_TEST:
             if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
-                fetches_.find(var_name) != fetches_.end()) {
-              var_out_.push_back(var_name);
+                find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                    fetch_vars.end()) {
+              this->var_out_.emplace_back(var_name);
             }
             break;
           case OpState::FULL_TEST:
-            if (fetches_.find(var_name) != fetches_.end()) {
-              var_out_.push_back(var_name);
+            if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                fetch_vars.end()) {
+              this->var_out_.emplace_back(var_name);
             }
             break;
           case OpState::PARTIAL_TRAIN:
-            if (fetches_.find(var_name) != fetches_.end() ||
+            if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                    fetch_vars.end() ||
                 post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
                 persistables_.find(var_name) != persistables_.end()) {
-              var_out_.push_back(var_name);
+              this->var_out_.emplace_back(var_name);
             }
             break;
           case OpState::FULL_TRAIN:
-            if (fetches_.find(var_name) != fetches_.end() ||
+            if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                    fetch_vars.end() ||
                 persistables_.find(var_name) != persistables_.end()) {
-              var_out_.push_back(var_name);
+              this->var_out_.emplace_back(var_name);
             }
             break;
           default:
-            var_out_.push_back(var_name);
+            this->var_out_.emplace_back(var_name);
         }
       }
     }
   }
+
+  for (size_t i = 0; i < var_in_.size(); ++i) {
+    auto var_name = var_in_[i];
+    if (persistables_.find(var_name) == persistables_.end()) {
+      var_in_updates_.emplace_back(i);
+    }
+  }
 }
 
-void NgraphEngine::BuildNgFunction() {
+void NgraphEngine::GetNgInputShape() {
+  for (auto& var_name : var_in_) {
+    auto* var = scope_.FindVar(var_name);
+    if (var && var->IsType<framework::LoDTensor>()) {
+      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      auto sp = Ddim2Shape(tensor_pd->dims());
+      auto ng_type = var_type_map_[var_name];
+      auto prm = std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
+      (*var_node_map_)[var_name] = prm;
+      (*var_in_node_map_)[var_name] = prm;
+    }
+  }
+}
+
+void NgraphEngine::BuildNgNodes() {
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Outputs()) {
+      for (auto& var_name : var_name_item.second) {
+        if (var_node_map_->find(var_name) == var_node_map_->end()) {
+          auto* var = scope_.FindVar(var_name);
+          if (var && var->IsType<framework::LoDTensor>()) {
+            auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+            auto& ddim = tensor_pd->dims();
+            auto ng_shape = Ddim2Shape(ddim);
+            auto ng_type = var_type_map_[var_name];
+            auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
+                                                               ng_shape, true);
+            (*var_node_map_)[var_name] = prm;
+          }
+        }
+      }
+    }
+  }
+
+  NgraphBridge ngb(var_node_map_);
+  for (auto& op : fused_ops_) {
+    ngb.BuildNgNode(op);
+  }
+}
+
+void NgraphEngine::RunInferShape() {
+  for (auto& op : fused_ops_) {
+    framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+    op->RuntimeInferShape(scope_, place_, ctx);
+  }
+}
+
+void NgraphEngine::BuildNgFunction(const std::vector<int>& interval) {
+  Prepare(interval);
+  RunInferShape();
+  GetNgInputShape();
   BuildNgNodes();
   ngraph_function_ = nullptr;
   ngraph::NodeVector func_outputs;
   ngraph::ParameterVector func_inputs;
 
   for (auto& vo : var_out_) {
-    func_outputs.push_back(var_node_map_->at(vo));
+    func_outputs.emplace_back(var_node_map_->at(vo));
   }
 
   for (auto& vi : var_in_) {
     std::shared_ptr<ngraph::op::Parameter> prm =
         std::dynamic_pointer_cast<ngraph::op::Parameter>(
             var_in_node_map_->at(vi));
-    func_inputs.push_back(prm);
+    func_inputs.emplace_back(prm);
   }
 
   ngraph_function_ =
       std::make_shared<ngraph::Function>(func_outputs, func_inputs);
 }
 
-void NgraphEngine::GetNgFunction() {
-  bool cache_on = true;
-  if (cache_on) {
-    std::string input_shape_str;
-    for (auto& var_name : var_in_) {
-      auto shape = var_node_map_->at(var_name)->get_shape();
-      for (size_t i = 0; i < shape.size(); ++i) {
-        input_shape_str += std::to_string(shape.at(i));
+void NgraphEngine::GetNgFunction(std::string engine_key,
+                                 const std::vector<int>& interval) {
+  bool use_cache = true;
+  if (use_cache) {
+    this->func_cache_key_ = "";
+    for (int i = 0; i < std::min(static_cast<int>(feed_vars.size()), 10); ++i) {
+      auto* var = scope_.FindVar(feed_vars[i]);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto dims = tensor_pd->dims();
+        for (int j = 0; j < dims.size(); ++j) {
+          func_cache_key_ += std::to_string(dims[j]);
+        }
       }
     }
-    func_cache_key_ = input_shape_str + func_cache_key_;
-    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
-      ngraph_function_ = func_cache_.at(func_cache_key_);
-    } else {
-      BuildNgFunction();
-      func_cache_[func_cache_key_] = ngraph_function_;
+    func_cache_key_ += std::to_string(interval[0]) + "_" +
+                       std::to_string(interval[1]) + engine_key;
+    func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
+
+    if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
+      if (engine_cache[func_cache_key_].persistables.size() == 0) {
+        engine_cache.clear();
+        t_in_cache_.clear();
+      } else {
+        auto var_name = engine_cache[func_cache_key_].persistables.begin();
+        framework::Variable* var = scope_.FindVar(*var_name);
+        if (var != pre_var_ptr) {
+          engine_cache.clear();
+          t_in_cache_.clear();
+        }
+        pre_var_ptr = var;
+      }
+    }
+
+    if (engine_cache.find(func_cache_key_) == engine_cache.end()) {
+      BuildNgFunction(interval);
+      engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_;
+      engine_cache[func_cache_key_].persistables = this->persistables_;
+      engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
+      engine_cache[func_cache_key_].var_in = this->var_in_;
+      engine_cache[func_cache_key_].var_out = this->var_out_;
+      engine_cache[func_cache_key_].is_test = this->is_test_;
     }
   } else {
-    BuildNgFunction();
+    BuildNgFunction(interval);
   }
 }
 
 void NgraphEngine::Run(const framework::Scope& scope,
                        const platform::Place& place) const {
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
+  std::shared_ptr<ngraph::Function> ng_func;
+  const std::set<std::string>* p_persistables;
+  const std::vector<size_t>* p_var_in_updates;
+  const std::vector<std::string>* p_var_in;
+  const std::vector<std::string>* p_var_out;
+  bool is_test;
+
+  bool use_cache = true;
+  if (use_cache) {
+    PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(),
+                   "Cannot find cached data to run ngraph function");
+    ng_func = engine_cache[func_cache_key_].ngraph_function;
+    p_persistables = &(engine_cache[func_cache_key_].persistables);
+    p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
+    p_var_in = &(engine_cache[func_cache_key_].var_in);
+    p_var_out = &(engine_cache[func_cache_key_].var_out);
+    is_test = engine_cache[func_cache_key_].is_test;
+  } else {
+    ng_func = ngraph_function_;
+    p_persistables = &this->persistables_;
+    p_var_in_updates = &this->var_in_updates_;
+    p_var_in = &this->var_in_;
+    p_var_out = &this->var_out_;
+    is_test = this->is_test_;
+  }
 
-  for (size_t i = 0; i < var_in_.size(); ++i) {
-    auto vi = var_in_.at(i);
-    auto sp = var_node_map_->at(vi)->get_shape();
-    std::shared_ptr<ngraph::runtime::Tensor> ti;
-    auto* var = scope.FindVar(vi);
-    if (var && var->IsType<framework::LoDTensor>()) {
-      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
-                     "Ensure ngraph tensor layout align with paddle tensor");
-      auto ng_type = var_type_map_.at(vi);
-      if (ng_type == ngraph::element::f32) {
-        auto pd_arr = tensor_pd->mutable_data<float>(place);
-        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i32) {
-        const int* arr = tensor_pd->data<int>();
-        ti = backend_->create_tensor(ngraph::element::i32, sp,
-                                     const_cast<int*>(arr));
-      } else if (ng_type == ngraph::element::i64) {
-        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
-        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
-      } else if (ng_type == ngraph::element::f64) {
-        auto pd_arr = tensor_pd->mutable_data<double>(place);
-        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
-      } else if (ng_type == ngraph::element::boolean) {
-        auto pd_arr = tensor_pd->mutable_data<bool>(place);
-        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
+
+  auto m_parameters = ng_func->get_parameters();
+  auto m_results = ng_func->get_results();
+  if (is_test && use_cache &&
+      t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
+    p_t_in = &(t_in_cache_[func_cache_key_]);
+    for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
+      int index = p_var_in_updates->at(i);
+      auto vi = p_var_in->at(index);
+      auto sp = m_parameters[index]->get_shape();
+      auto ng_type = m_parameters[index]->get_element_type();
+      std::shared_ptr<ngraph::runtime::Tensor> ti;
+      auto* var = scope.FindVar(vi);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
+        (*p_t_in)[index] = ti;
       } else {
-        PADDLE_THROW("Data type not handling for var %s", vi);
+        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
       }
+    }
+  } else {
+    if (is_test && use_cache) {
+      p_t_in = &(t_in_cache_[func_cache_key_]);
     } else {
-      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+      p_t_in = &t_in;
     }
-    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
-                    ng_op_state_ == OpState::FULL_TEST)
-                       ? true
-                       : false;
-    bool is_persistable =
-        (persistables_.find(vi) != persistables_.end()) ? true : false;
-    if (is_test && is_persistable) {
-      ti->set_stale(false);
+
+    for (size_t i = 0; i < p_var_in->size(); ++i) {
+      auto vi = p_var_in->at(i);
+      auto sp = m_parameters[i]->get_shape();
+      auto ng_type = m_parameters[i]->get_element_type();
+      std::shared_ptr<ngraph::runtime::Tensor> ti;
+      auto* var = scope.FindVar(vi);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+        PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                       "Ensure ngraph tensor layout align with paddle tensor");
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+      }
+      bool is_persistable =
+          (p_persistables->find(vi) != p_persistables->end()) ? true : false;
+      if (is_test && is_persistable) {
+        ti->set_stale(false);
+      }
+      (*p_t_in).emplace_back(ti);
     }
-    t_in.push_back(ti);
   }
 
-  for (size_t i = 0; i < var_out_.size(); ++i) {
-    auto vo = var_out_[i];
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
+  for (size_t i = 0; i < p_var_out->size(); ++i) {
+    auto vo = p_var_out->at(i);
     auto* var = scope.FindVar(vo);
-    std::shared_ptr<ngraph::runtime::Tensor> to;
     if (var && var->IsType<framework::LoDTensor>()) {
+      auto sp = m_results[i]->get_shape();
+      var->GetMutable<framework::LoDTensor>()->Resize(Shape2Ddim(sp));
       auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-      auto dd = tensor_pd->dims();
-      ngraph::Shape sp = Ddim2Shape(dd);
-      auto ng_type = var_type_map_.at(vo);
-      if (ng_type == ngraph::element::f32) {
-        auto pd_arr = tensor_pd->mutable_data<float>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i64) {
-        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i32) {
-        auto pd_arr = tensor_pd->mutable_data<int>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::f64) {
-        auto pd_arr = tensor_pd->mutable_data<double>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::boolean) {
-        auto pd_arr = tensor_pd->mutable_data<bool>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else {
-        PADDLE_THROW("Data type not handled in for var %s", vo);
-      }
-      t_out.push_back(to);
+      auto ng_type = m_results[i]->get_element_type();
+      void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+      std::shared_ptr<ngraph::runtime::Tensor> to =
+          backend_->create_tensor(ng_type, sp, pd_arr);
+      t_out.emplace_back(to);
     } else {
       PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
     }
   }
 
-  auto handle = backend_->compile(ngraph_function_);
-  handle->call_with_validate(t_out, t_in);
+  auto handle = backend_->compile(ng_func);
+  handle->call_with_validate(t_out, *p_t_in);
 }  // NgraphEngine::Run
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
index bf5ff2a743..b6532519e9 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
+#include <memory>
+#include <set>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
 
 #include "ngraph/ngraph.hpp"
 
@@ -29,33 +35,50 @@ enum class OpState {                /* nGraph support state on ops          */
                      PARTIAL_TRAIN, /* Support partial ops for train        */
                      FULL_TEST,     /* Support full list of ops for test    */
                      PARTIAL_TEST,  /* Support partial list of ops for test */
-                     FULL,          /* All ops supported from feed to fetch */
                      UNKNOWN        /* Output all for debug purpose         */
 };
 
+// cache engine repetitives
+struct EngineCache {
+  std::shared_ptr<ngraph::Function> ngraph_function;
+  std::set<std::string> persistables;
+  std::vector<std::string> var_in;
+  std::vector<std::string> var_out;
+  std::vector<size_t> var_in_updates;
+  bool is_test = true;
+};
+
 // perform graph build through bridge and execute computation
 class NgraphEngine {
  public:
   explicit NgraphEngine(const framework::Scope& scope,
                         const platform::Place& place,
-                        const std::string& serialized_graph,
-                        const std::vector<int>& interval);
+                        const framework::ExecutionContext& ctx);
 
   void Run(const framework::Scope& scope, const platform::Place& place) const;
 
-  static void EnableNgraph(const framework::ProgramDesc& program);
+  static const framework::BlockDesc* p_bdesc;
+  static std::vector<std::string> feed_vars, fetch_vars;
+
+  static void FuseNgraphOps(
+      const framework::BlockDesc& prog,
+      std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
 
  private:
-  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache_;
+  static std::unordered_map<std::string, EngineCache> engine_cache;
+  static std::unordered_map<
+      std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+      t_in_cache_;
+  static framework::Variable* pre_var_ptr;
+
   const framework::Scope& scope_;
   const platform::Place& place_;
   std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
   std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
-  std::unordered_set<std::string> persistables_;
-  std::unordered_set<std::string> fetches_;
+  std::set<std::string> persistables_;
   std::unordered_set<std::string> post_op_inputs_;
-  OpState ng_op_state_ = OpState::UNKNOWN;
+  OpState op_state_ = OpState::UNKNOWN;
+  bool is_test_{true};
   std::string func_cache_key_;
 
   // ngraph backend eg. CPU
@@ -66,6 +89,8 @@ class NgraphEngine {
   std::vector<std::string> var_in_;
   // var_name of outputs from  fetch in order
   std::vector<std::string> var_out_;
+  // non-persitable var_in
+  std::vector<size_t> var_in_updates_;
   // map input vars to nodes
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
@@ -74,19 +99,21 @@ class NgraphEngine {
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
       var_node_map_;
-  // prepare info for nraph engine
-  void Prepare(const framework::BlockDesc& block,
-               const std::vector<int>& interval);
+  // prepare info for ngraph engine need
+  void Prepare(const std::vector<int>& interval);
+  // get ngraph engine input and output list
+  void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
+                 const std::vector<int>& interval);
   // get ngraph input and define ngraph input parameters
-  void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
+  void GetNgInputShape();
   // Call ngraph bridge to map ops
   void BuildNgNodes();
-  // get the ngraph input and output var list
-  void BuildNgIO();
+  // run paddle RuntimeInferShape to get the tensor shape
+  void RunInferShape();
   // build ngraph function call
-  void BuildNgFunction();
+  void BuildNgFunction(const std::vector<int>& interval);
   // Check cache for ngraph function or otherwise build the function
-  void GetNgFunction();
+  void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
index 3051ca123b..479c95ba08 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDispensable();
     AddOutput("Ys", "A list of outputs").AsDispensable();
     AddAttr<std::string>("graph", "the graph.");
+    AddAttr<std::string>("engine_key", "the engine hash key.");
     AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
     AddComment("ngraph engine operator.");
   }
@@ -36,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 
 class NgraphEngineInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
index 2f194a9b87..c9b2a3970e 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& scope = ctx.scope();
     auto place = ctx.GetPlace();
-    std::string serialized_graph = ctx.Attr<std::string>("graph");
-    auto interval = ctx.Attr<std::vector<int>>("interval");
 
-    NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
+    NgraphEngine ngraph_engine(scope, place, ctx);
     ngraph_engine.Run(scope, place);
   }
 };
diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
index be36b9d21e..c92ebb7e96 100644
--- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -27,13 +27,9 @@ namespace paddle {
 namespace operators {
 namespace ngraphs {
 
-void BuildCrossEntropyNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetCrossEntropy(
+    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
+    const bool is_soft_label, int ignore_index) {
   auto label_shape = label->get_shape();
   auto x_shape = x->get_shape();
   auto label_rank = label_shape.size();
@@ -46,18 +42,16 @@ void BuildCrossEntropyNode(
     label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
   }
   if (x_rank > 2) {
-    x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1);
-    x_2d = paddle::platform::NgReshaper(x, x_2d_shape);
+    x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1);
+    x_2d = platform::NgReshaper(x, x_2d_shape);
   }
 
   auto batch_size = x_2d_shape.at(0);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
 
   std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
   if (!is_soft_label) {
-    auto label_1d = paddle::platform::NgReshaper(
-        label_2d, ngraph::Shape{label_2d_shape.at(0)});
+    auto label_1d =
+        platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)});
     node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
   }
   if (x->get_element_type() != node_1_hot->get_element_type()) {
@@ -76,11 +70,9 @@ void BuildCrossEntropyNode(
   auto node_sum =
       std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
   auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
-  auto xe =
-      paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
+  auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
 
   if (!is_soft_label) {
-    auto ignore_index = op_attrs.Get<int>("ignore_index");
     auto ignore_node = ngraph::op::Constant::create(
         label->get_element_type(), label_2d_shape, {ignore_index});
     auto not_equal_node =
@@ -89,21 +81,13 @@ void BuildCrossEntropyNode(
                                                       xe->get_element_type());
     xe = xe * mask;
   }
-
-  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+  return xe;
 }
 
-void BuildCrossEntropyGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetCrossEntropyGrad(
+    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
+    std::shared_ptr<ngraph::Node> dy, const bool is_soft_label,
+    int ignore_index) {
   auto x_shape = x->get_shape();
   auto rank = x_shape.size();
 
@@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode(
   if (!is_soft_label) {
     auto label_shape = label->get_shape();
     label_shape.pop_back();
-    label = paddle::platform::NgReshaper(label, label_shape);
+    label = platform::NgReshaper(label, label_shape);
 
-    auto ignore_index = op_attrs.Get<int>("ignore_index");
     auto ignore_node = ngraph::op::Constant::create(
         label->get_element_type(), label_shape, {ignore_index});
     auto not_equal_node =
@@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode(
 
   auto dy_shape = dy->get_shape();
   dy_shape.pop_back();
-  auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape);
+  auto dy_reshape = platform::NgReshaper(dy, dy_shape);
   auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
       dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
   if (x->get_element_type() != label->get_element_type()) {
@@ -140,7 +123,35 @@ void BuildCrossEntropyGradNode(
   if (!is_soft_label) {
     xe_grad = xe_grad * mask;
   }
+  return xe_grad;
+}
 
+void BuildCrossEntropyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  int ignore_index = op_attrs.Get<int>("ignore_index");
+  auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index);
+  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+}
+
+void BuildCrossEntropyGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  int ignore_index = op_attrs.Get<int>("ignore_index");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index);
   paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
 }
 }  // namespace ngraphs
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h
index 7d5720c460..174b7a91a8 100644
--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -27,12 +27,7 @@ namespace paddle {
 namespace operators {
 namespace ngraphs {
 
-void BuildSoftmaxNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetSoftmax(std::shared_ptr<ngraph::Node> x) {
   auto x_shape = x->get_shape();
   int rank = x_shape.size();
   auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1);
@@ -47,16 +42,11 @@ void BuildSoftmaxNode(
           -64., x_shifted);
   auto softmax =
       std::make_shared<ngraph::op::Softmax>(x_clipped, ngraph::AxisSet{1});
-  paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
+  return softmax;
 }
 
-void BuildSoftmaxGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+std::shared_ptr<ngraph::Node> GetSoftmaxGrad(
+    std::shared_ptr<ngraph::Node> out, std::shared_ptr<ngraph::Node> dout) {
   auto out_shape = out->get_shape();
   int rank = out_shape.size();
   auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1);
@@ -70,6 +60,27 @@ void BuildSoftmaxGradNode(
   auto node_bcast = std::make_shared<ngraph::op::Broadcast>(
       node_sum, out_2d_shape, ngraph::AxisSet{1});
   auto dx = (dout - node_bcast) * out;
+  return dx;
+}
+
+void BuildSoftmaxNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto softmax = GetSoftmax(x);
+  paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
+}
+
+void BuildSoftmaxGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto dx = GetSoftmaxGrad(out, dout);
   paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
 }
 }  // namespace ngraphs
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
new file mode 100644
index 0000000000..a6bdf4de95
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
@@ -0,0 +1,90 @@
+/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/cross_entropy_op.h"
+#include "paddle/fluid/operators/ngraph/ops/softmax_op.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildSoftmaxWithCrossEntropyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto logits = paddle::platform::GetInputNode(op, "Logits", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto softmax = paddle::operators::ngraphs::GetSoftmax(logits);
+
+  auto op_attrs = framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  int ignore_index = op_attrs.Get<int>("ignore_index");
+  auto xe = paddle::operators::ngraphs::GetCrossEntropy(
+      softmax, label, is_soft_label, ignore_index);
+
+  paddle::platform::SetOutputNode(op, "Softmax", softmax, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Loss", xe, ngb_node_map);
+}
+
+void BuildSoftmaxWithCrossEntropyGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto softmax = paddle::platform::GetInputNode(op, "Softmax", ngb_node_map);
+  auto loss_grad =
+      paddle::platform::GetInputNode(op, "Loss@GRAD", ngb_node_map);
+  auto softmax_shape = softmax->get_shape();
+  auto rank = softmax_shape.size();
+  if (!is_soft_label) {
+    auto label_shape = label->get_shape();
+    label_shape.pop_back();
+    label = platform::NgReshaper(label, label_shape);
+
+    label =
+        std::make_shared<ngraph::op::OneHot>(label, softmax_shape, rank - 1);
+  }
+
+  auto loss_grad_shape = loss_grad->get_shape();
+  loss_grad_shape.pop_back();
+  auto loss_grad_reshape = platform::NgReshaper(loss_grad, loss_grad_shape);
+  auto loss_grad_bcast = std::make_shared<ngraph::op::Broadcast>(
+      loss_grad_reshape, softmax_shape, ngraph::AxisSet{rank - 1});
+  if (softmax->get_element_type() != label->get_element_type()) {
+    label = std::make_shared<ngraph::op::Convert>(label,
+                                                  softmax->get_element_type());
+  }
+
+  auto logits_grad = loss_grad_bcast * (softmax - label);
+  paddle::platform::SetOutputNode(op, "Logits@GRAD", logits_grad, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(softmax_with_cross_entropy, BuildSoftmaxWithCrossEntropyNode);
+REGISTER_NG_OP(softmax_with_cross_entropy_grad,
+               BuildSoftmaxWithCrossEntropyGradNode);
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 09255f60e6..6262ef0c2d 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
 #include <Eigen/Dense>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -311,17 +312,17 @@ struct SparseAdamFunctor<T, CPUAdam> {
     T beta1_pow = *beta1_pow_;
     T beta2_pow = *beta2_pow_;
     lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-    size_t row_count = numel / row_numel_;
+    int64_t row_count = static_cast<int64_t>(numel / row_numel_);
 
-    for (size_t i = 0U, j = 0U; i != row_count; ++i) {
+    for (int64_t i = 0, j = 0; i != row_count; ++i) {
       if (i == *(rows_ + j)) {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
           T g = grad_[j * row_numel_ + k];
           adam_update(i * row_numel_ + k, g);
         }
         ++j;
       } else {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
           T mom1 = moment1_[i * row_numel_ + k];
           T mom2 = moment2_[i * row_numel_ + k];
           T p = param_[i * row_numel_ + k];
@@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
         }
       }
 
-      framework::SelectedRows cpu_grad_merge;
+      framework::SelectedRows tmp_grad_merge;
       const framework::SelectedRows* grad_merge_ptr;
       if (is_strict_sorted) {
         grad_merge_ptr = &grad;
       } else {
         // merge duplicated rows if any.
         // The rows of grad_merge have been sorted inside MergeAdd functor
-        framework::SelectedRows* grad_merge_var;
         scatter::MergeAdd<DeviceContext, T> merge_func;
-        if (platform::is_cpu_place(ctx.GetPlace())) {
-          grad_merge_var = &cpu_grad_merge;
-        } else {
-          // FIXME(qiao): GPU also need to fix this
-          grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
-                               .Var()
-                               ->GetMutable<framework::SelectedRows>();
-        }
         merge_func(ctx.template device_context<DeviceContext>(), grad,
-                   grad_merge_var, true);
-        grad_merge_ptr = grad_merge_var;
+                   &tmp_grad_merge, true);
+        grad_merge_ptr = &tmp_grad_merge;
       }
 
       auto& grad_merge = *grad_merge_ptr;
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
-      const int64_t* rows = nullptr;
-// When compiled without CUDA, the CUDAData() interface should not be
-// provided.
-#if defined(PADDLE_WITH_CUDA)
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = grad_merge.rows().data();
-#if defined(PADDLE_WITH_CUDA)
-      }
-#endif
+      const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       if (platform::is_cpu_place(ctx.GetPlace())) {
@@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
           }
         }
 #ifndef _WIN32
-        else if (FLAGS_inner_op_parallelism > 1 &&
+        else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
                  min_row_size_to_use_multithread > 0 &&
                  param.dims()[0] > min_row_size_to_use_multithread) {
           VLOG(3) << "use multi thread, inner_op_parallelism="
@@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
           for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
             int64_t start = i * line_in_each_thread;
             int64_t end = (i + 1) * line_in_each_thread;
-            if (start >= param_row_count) {
+            if (start >= static_cast<int64_t>(param_row_count)) {
               break;
             }
-            if (end > param_row_count) {
-              end = param_row_count;
+            if (end > static_cast<int64_t>(param_row_count)) {
+              end = static_cast<int64_t>(param_row_count);
             }
             fs.push_back(
                 framework::Async([&functor, &row_id_to_grad_row_offset,
@@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
           }
           for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
         }
-#endif  // !_WIN32
-        else {
+#endif          // !_WIN32
+        else {  // NOLINT
           functor(param.numel());
         }
       } else if (platform::is_gpu_place(ctx.GetPlace())) {
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
index 574a03680b..126b665dd4 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
 weight using a local learning rate:
 
 $$
-local\_lr = \eta  * 
+local\_lr = \eta  *
     \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
-velocity = mu * velocity + 
+velocity = mu * velocity +
     local\_lr * (grad + \beta * param) \\
 param = param - velocity. \\
 $$
@@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS.
 
 class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext* ctx) const override {}
 };
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc
index cde238c076..7cf218c20f 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op.cc
@@ -21,18 +21,14 @@ using Tensor = framework::Tensor;
 
 class MomentumOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto input_var = op_desc.Input("Param")[0];
-    for (auto& out_var : op_desc.Output("ParamOut")) {
-      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
-          framework::proto::VarType::SELECTED_ROWS) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::SELECTED_ROWS);
-      } else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& input_var = ctx->Input("Param")[0];
+    for (auto& out_var : ctx->Output("ParamOut")) {
+      if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) {
+        ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
+      } else if (ctx->GetType(input_var) ==
                  framework::proto::VarType::LOD_TENSOR) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::LOD_TENSOR);
+        ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR);
       } else {
         PADDLE_THROW(
             "Only support LodTensor and SelectedRows, Unexpected Input Type.");
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 3ed1bff5ff..29a2ae6755 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
@@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel<T> {
         VLOG(3) << "Grad SelectedRows contains no data!";
         return;
       }
-      auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
-                              .Var()
-                              ->GetMutable<framework::SelectedRows>();
+
+      framework::SelectedRows tmp_merged_grad;
+      framework::SelectedRows* merged_grad = &tmp_merged_grad;
       math::scatter::MergeAdd<DeviceContext, T> merge_func;
       merge_func(ctx.template device_context<DeviceContext>(), *grad,
                  merged_grad);
 
-      const int64_t* rows = nullptr;
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
+      const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
       int64_t row_numel =
           merged_grad->value().numel() / merged_grad->rows().size();
       platform::ForRange<DeviceContext> for_range(
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
index 389c84d246..4550052b2d 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel<T> {
       }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto &grad = grad_var->Get<framework::SelectedRows>();
-      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
-                              .Var()
-                              ->GetMutable<framework::SelectedRows>();
-
+      framework::SelectedRows tmp_merged_grad;
+      framework::SelectedRows *merged_grad = &tmp_merged_grad;
       math::scatter::MergeAdd<DeviceContext, T> merge_func;
       merge_func(dev_ctx, grad, merged_grad);
 
       platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-      const int64_t *rows;
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
+      const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
+
       auto &merged_tensor = merged_grad->value();
       int64_t row_count = merged_grad->rows().size();
       int64_t row_numel = merged_tensor.numel() / row_count;
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index 690381a67f..34e99a14ff 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel {
 
 class SGDOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto input_var_n = op_desc.Input("Param")[0];
-    auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &input_var_n = ctx->Input("Param")[0];
+    auto in_var_type = ctx->GetType(input_var_n);
     PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
                        in_var_type == framework::proto::VarType::LOD_TENSOR,
                    "The input Var's type should be LoDtensor or SelectedRows,"
                    " but the received var(%s)'s type is %s",
                    input_var_n, in_var_type);
 
-    for (auto &out_var_n : op_desc.Output("ParamOut")) {
-      auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
-      if (out_var.GetType() != in_var_type) {
-        out_var.SetType(in_var_type);
+    for (auto &out_var_n : ctx->Output("ParamOut")) {
+      if (ctx->GetType(out_var_n) != in_var_type) {
+        ctx->SetType(out_var_n, in_var_type);
       }
     }
   }
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index c9c9f530fe..5dd5f67e00 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
         T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
 
         auto sgd =
-            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
+                attr);
         sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
       } else if (grad_var->IsType<framework::SelectedRows>()) {
         // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
 
         auto sgd =
-            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
+                attr);
         sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
       } else {
         PADDLE_THROW("Unsupported Variable Type of Grad");
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 0a0ece162c..7963c27a01 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pool_op.h"
+#include <unordered_map>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
@@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 53eff2de3e..5300e80747 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -14,8 +14,11 @@
 
 #include "paddle/fluid/operators/py_func_op.h"
 
+#include <memory>
 #include <set>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable,
   }
 }
 
-class PyFuncOpVarTypInference : public framework::VarTypeInference {
+class PyFuncOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op,
-                  framework::BlockDesc *block) const override {
-    auto &outs = op.Outputs();
-    bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty());
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty());
 
-    auto &ins = op.Inputs();
-    bool has_in = (ins.count("X") > 0 && !ins.at("X").empty());
+    bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty());
 
     /**
      * X or Out can be empty, so that py_func can be more flexible
@@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
      */
     PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
 
-    PADDLE_ENFORCE_GE(boost::get<int>(op.GetAttr(kForwardPythonCallableId)), 0,
-                      "Function id cannot be less than 0");
+    PADDLE_ENFORCE_GE(boost::get<int>(ctx->GetAttr(kForwardPythonCallableId)),
+                      0, "Function id cannot be less than 0");
 
     if (!has_out) return;
 
@@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
      * the corresponding forward variable
      */
     const std::string kGradVarSuffix = framework::kGradVarSuffix;
-    auto &out_var_names = outs.at("Out");
+    auto &out_var_names = ctx->Output("Out");
     for (auto &out_var_name : out_var_names) {
       if (out_var_name == framework::kEmptyVarName ||
           out_var_name.size() < kGradVarSuffix.size()) {
@@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
       size_t len = out_var_name.size() - kGradVarSuffix.size();
       if (out_var_name.substr(len) == kGradVarSuffix) {
         auto fwd_var_name = out_var_name.substr(0, len);
-        auto *out_var_desc = block->FindVarRecursive(out_var_name);
-        auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name);
-        PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found",
-                                out_var_name);
-        PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found",
-                                fwd_var_name);
+        PADDLE_ENFORCE(ctx->HasVar(out_var_name),
+                       "Backward variable %s not found", out_var_name);
+        PADDLE_ENFORCE(ctx->HasVar(fwd_var_name),
+                       "Backward variable %s not found", fwd_var_name);
         VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
                  << fwd_var_name << ")";
-        out_var_desc->SetShape(fwd_var_desc->GetShape());
-        out_var_desc->SetDataType(fwd_var_desc->GetDataType());
-        out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel());
-        out_var_desc->SetType(fwd_var_desc->GetType());
+
+        ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name));
+        ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name));
+        ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name));
+        ctx->SetType(out_var_name, ctx->GetType(fwd_var_name));
       }
     }
   }
@@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
-                  ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference,
+                  ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference,
                   ops::PyFuncOpGradDescMaker);
diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
new file mode 100644
index 0000000000..ee8c68fd00
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/range_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RangeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->HasInput("Start")) {
+      auto s_dims = ctx->GetInputDim("Start");
+      PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
+                     "The shape of Input(Start) should be [1].");
+    }
+    if (ctx->HasInput("End")) {
+      auto e_dims = ctx->GetInputDim("End");
+      PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
+                     "The shape of Input(End) should be [1].");
+    }
+    if (ctx->HasInput("Step")) {
+      auto step_dims = ctx->GetInputDim("Step");
+      PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
+                     "The shape of Input(Step) should be [1].");
+    }
+    ctx->SetOutputDim("Out", {-1});
+  }
+};
+
+class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Start",
+             "Start of interval. The interval includes this value. It is a "
+             "tensor with shape=[1].");
+    AddInput("End",
+             "End of interval. The interval does not include this value, "
+             "except in some cases where step is not an integer and floating "
+             "point round-off affects the length of out. It is a tensor with "
+             "shape=[1].");
+    AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
+REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
+                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
+                       ops::CPURangeKernel<int64_t>);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
new file mode 100644
index 0000000000..e2c03716d5
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cu
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/range_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+class CUDARangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* start_t = context.Input<framework::Tensor>("Start");
+    auto* end_t = context.Input<framework::Tensor>("End");
+    auto* step_t = context.Input<framework::Tensor>("Step");
+    auto* out = context.Output<framework::Tensor>("Out");
+
+    framework::Tensor n;
+    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+    T start = n.data<T>()[0];
+    framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
+    T end = n.data<T>()[0];
+    framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
+    T step = n.data<T>()[0];
+
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    auto stream = context.cuda_device_context().stream();
+    int block = 512;
+    int grid = (size + block - 1) / block;
+    RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
+                        ops::CUDARangeKernel<int64_t>,
+                        ops::CUDARangeKernel<float>,
+                        ops::CUDARangeKernel<double>);
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
new file mode 100644
index 0000000000..fce58b45c9
--- /dev/null
+++ b/paddle/fluid/operators/range_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+void GetSize(T start, T end, T step, int64_t* size) {
+  PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
+                 "The step of range op should not be 0.");
+  PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
+                 "The step should be greater than 0 while start < end. And the "
+                 "step should be less than 0 while start > end.");
+  *size = std::is_integral<T>::value
+              ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
+              : std::ceil(std::abs((end - start) / step));
+}
+
+template <typename T>
+class CPURangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T end = context.Input<framework::Tensor>("End")->data<T>()[0];
+    T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    T value = start;
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = value;
+      value += step;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 7c284312df..5ee1206175 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -17,7 +17,9 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
+cc_library(py_reader SRCS py_reader.cc DEPS reader)
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+
 reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
@@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
-reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
+reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 
 if (NOT WIN32 AND NOT ON_INFER)
     cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
@@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
 # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
 
-op_library(read_op)
+op_library(read_op DEPS py_reader buffered_reader)
 
 foreach(src ${LOCAL_READER_LIBS})
     set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 51b980acb5..78d238aa61 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -16,6 +16,7 @@
 
 #include <condition_variable>  // NOLINT
 #include <deque>
+#include <utility>
 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -34,7 +35,7 @@ class BlockingQueue {
   explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
       : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
     PADDLE_ENFORCE_GT(
-        capacity_, 0,
+        capacity_, static_cast<size_t>(0),
         "The capacity of a reader::BlockingQueue must be greater than 0.");
   }
 
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 134807092d..c24e9aedc4 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -30,8 +30,10 @@ BufferedReader::~BufferedReader() {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaStreamDestroy(stream));
-    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
+    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+    for (auto &event : events_) {
+      PADDLE_ENFORCE(cudaEventDestroy(event));
+    }
   }
 #endif
 }
@@ -46,15 +48,15 @@ BufferedReader::BufferedReader(
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    compute_stream =
+    compute_stream_ =
         ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
                                              .Get(place_)))
             ->stream();
-    events.resize(buffer_size);
-    for (auto &event : events) {
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     }
-    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
   }
 #endif
   cpu_buffer_.resize(buffer_size);
@@ -73,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+    PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
   }
 #endif
   position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
@@ -91,7 +93,7 @@ void BufferedReader::ReadAsync(size_t i) {
     // commands from different streams cannot run concurrently.
     if (platform::is_gpu_place(place_)) {
       platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
       TensorVec &gpu = gpu_buffer_[i];
       gpu.resize(cpu.size());
       platform::RecordEvent record_event("BufferedReader:MemoryCopy");
@@ -106,12 +108,14 @@ void BufferedReader::ReadAsync(size_t i) {
         if (platform::is_cuda_pinned_place(cpu_place)) {
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CUDAPinnedPlace>(cpu_place),
-                       cpu_ptr, size, stream);
+                       cpu_ptr, size, stream_);
         } else if ((platform::is_gpu_place(cpu_place))) {
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
-                       size, stream);
+                       size, stream_);
         } else {
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
           // TODO(zcd): The default stream should not be used here.
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
@@ -119,7 +123,7 @@ void BufferedReader::ReadAsync(size_t i) {
         }
         gpu[i].set_lod(cpu[i].lod());
       }
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     }
 #endif
     return i;
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 87680da01a..5f8b2d47c2 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <list>
+#include <memory>
 #include <queue>
 #include <vector>
 #include "ThreadPool.h"
@@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader {
   std::vector<TensorVec> gpu_buffer_;
   size_t prev_pos_{-1UL};
 #ifdef PADDLE_WITH_CUDA
-  cudaStream_t stream;
-  cudaStream_t compute_stream;
-  std::vector<cudaEvent_t> events;
+  cudaStream_t stream_;
+  cudaStream_t compute_stream_;
+  std::vector<cudaEvent_t> events_;
 #endif
 };
 
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
index 85394b336f..fdc7b0f6a0 100644
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
     AddComment(R"DOC(
       CreateCustomReader Operator
 
-      A custom reader can be used for input data preprocessing. 
-      A custom reader holds its own sub-block, which will be executed in CPU 
-      in its 'ReadNext()' function. Users can configurate their own 
-      preprocessing pipelines by inserting operators into custom reader's 
+      A custom reader can be used for input data preprocessing.
+      A custom reader holds its own sub-block, which will be executed in CPU
+      in its 'ReadNext()' function. Users can configurate their own
+      preprocessing pipelines by inserting operators into custom reader's
       sub-block.
     )DOC");
   }
@@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase {
 
 class CustomReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
-    PADDLE_ENFORCE_NOT_NULL(out_reader);
-    out_reader->SetType(framework::proto::VarType::READER);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& out_var_name = ctx->Output("Out")[0];
+    PADDLE_ENFORCE(ctx->HasVar(out_var_name));
+    ctx->SetType(out_var_name, framework::proto::VarType::READER);
 
     auto sink_var_names =
-        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+        boost::get<std::vector<std::string>>(ctx->GetAttr("sink_var_names"));
     const auto* sub_block =
-        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+        boost::get<framework::BlockDesc*>(ctx->GetAttr("sub_block"));
     std::vector<framework::proto::VarType::Type> res_data_types;
     for (const std::string& var_name : sink_var_names) {
       framework::VarDesc* var = sub_block->FindVar(var_name);
       PADDLE_ENFORCE_NOT_NULL(var);
       res_data_types.emplace_back(var->GetDataType());
     }
-    out_reader->SetDataTypes(res_data_types);
+    ctx->SetDataTypes(out_var_name, res_data_types);
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 901a92ab5b..4a6581bbbd 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -12,37 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
 
-class PyReader : public framework::FileReader {
- public:
-  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
-      : framework::FileReader() {
-    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    queue_ = queue;
-  }
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    bool success;
-    *out = queue_->Pop(&success);
-    if (!success) out->clear();
-  }
-
-  ~PyReader() { queue_->Close(); }
-
-  void Shutdown() override { queue_->Close(); }
-
-  void Start() override { queue_->ReOpen(); }
-
- private:
-  std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-
 class CreatePyReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc
new file mode 100644
index 0000000000..155ae859de
--- /dev/null
+++ b/paddle/fluid/operators/reader/py_reader.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include <memory>
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
+    : framework::FileReader() {
+  PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
+  queue_ = queue;
+}
+
+void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  bool success;
+  *out = queue_->Pop(&success);
+  if (!success) out->clear();
+}
+
+PyReader::~PyReader() { queue_->Close(); }
+
+void PyReader::Shutdown() { queue_->Close(); }
+
+void PyReader::Start() { queue_->ReOpen(); }
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h
new file mode 100644
index 0000000000..4307907514
--- /dev/null
+++ b/paddle/fluid/operators/reader/py_reader.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class PyReader : public framework::FileReader {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+  ~PyReader();
+
+  void Shutdown() override;
+
+  void Start() override;
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 846b2ed77e..33a69ad5fe 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase {
 
 class ReadInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    bool infer_out = boost::get<bool>(ctx->GetAttr("infer_out"));
     if (infer_out) {
-      std::string reader_name = op_desc.Input("Reader")[0];
-      std::vector<std::string> out_names = op_desc.Output("Out");
-      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-      auto dtypes = reader->GetDataTypes();
+      std::string reader_name = ctx->Input("Reader")[0];
+      std::vector<std::string> out_names = ctx->Output("Out");
+      auto dtypes = ctx->GetDataTypes(reader_name);
       PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
       for (size_t i = 0; i < dtypes.size(); ++i) {
-        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
-        out.SetType(framework::proto::VarType::LOD_TENSOR);
-        out.SetDataType(dtypes[i]);
+        ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR);
+        ctx->SetDataType(out_names[i], dtypes[i]);
       }
     }
   }
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 3921eedf94..64a1f6b687 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
   }
 }
 
-void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
-                                        framework::BlockDesc* block) const {
-  std::string reader_name = op_desc.Output("Out")[0];
-  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-  reader->SetType(framework::proto::VarType::READER);
+void FileReaderInferVarType::operator()(
+    framework::InferVarTypeContext* ctx) const {
+  std::string reader_name = ctx->Output("Out")[0];
+  ctx->SetType(reader_name, framework::proto::VarType::READER);
 }
 
 void DecoratedReaderInferShape::operator()(
@@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()(
 }
 
 void DecoratedReaderInferVarType::operator()(
-    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
-  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
-  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
-  std::string out_reader_name = op_desc.Output("Out")[0];
-  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
-  out_reader->SetType(framework::proto::VarType::READER);
-  out_reader->SetDataTypes(in_reader->GetDataTypes());
+    framework::InferVarTypeContext* ctx) const {
+  const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0];
+  const std::string& out_reader_name = ctx->Output("Out")[0];
+  ctx->SetType(out_reader_name, framework::proto::VarType::READER);
+  ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name));
 }
 
 void DecoratedReaderMakerBase::Make() {
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index 25c3e7d77b..795a580605 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -14,7 +14,9 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
@@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase {
 
 class FileReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override;
+  void operator()(framework::InferVarTypeContext* ctx) const override;
 };
 
 // general infershape for decorated reader
@@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase {
 // general var type inference for decorated reader
 class DecoratedReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override;
+  void operator()(framework::InferVarTypeContext* ctx) const override;
 };
 
 class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 88c968a0ea..2898a62ddb 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -282,7 +282,9 @@ class RecurrentOp : public RecurrentBase {
 
       // Every inputs are linked now, execute!
       executor.Run(*program, &cur_scope, block->ID(),
-                   false /*create_local_scope*/);
+                   false /*create_local_scope*/, true /*create_vars*/,
+                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
+                   true /*force_disable_gc*/);
 
       // get device context from pool
       platform::DeviceContextPool &pool =
@@ -398,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase {
       VLOG(5) << "Recurrent memory linking finished ";
       // Run step block with cur_scope
       executor.Run(*program, &cur_scope, block->ID(),
-                   false /*create_local_scope*/);
+                   false /*create_local_scope*/, true /*create_vars*/,
+                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
+                   true /*force_disable_gc*/);
 
       VLOG(5) << "executor.Run finished ";
 
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 37f69426b6..2b429380fb 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -219,14 +219,6 @@ class ReshapeKernel {
           std::vector<int>(shape_data, shape_data + shape_tensor->numel());
       out_dims = ReshapeOp::ValidateShape(shape, in->dims());
     }
-    if (!in->lod().empty()) {
-      PADDLE_ENFORCE_EQ(
-          out_dims[0], in->dims()[0],
-          "Reshape operator cannot reshape an input sequence batch "
-          "into an output sequence batch that has a different "
-          "number of time steps. Please consider using "
-          "sequence_reshape op.");
-    }
 
     out->mutable_data(ctx.GetPlace(), in->type());
     framework::TensorCopy(
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index d0edcc170f..62b1e09737 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdint.h>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+#include <string>
+
+#include "paddle/fluid/operators/save_combine_op.h"
 
 namespace paddle {
 namespace operators {
 
-class SaveCombineOp : public framework::OperatorBase {
+class SaveCombineOp : public framework::OperatorWithKernel {
  public:
-  SaveCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-
-    bool is_present = FileExists(filename);
-    if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto inp_var_names = Inputs("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
-      auto *var = scope.FindVar(inp_var_names[i]);
-
-      PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
-                     inp_var_names[i]);
-
-      auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensors one by one
-
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = tensor.type();
-      auto out_dtype =
-          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
-      } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
-      }
-    }
-    fout.close();
-  }
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 };
 
 class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
@@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 SaveCombine operator
 
-This operator will serialize and write a list of input LoDTensor variables 
+This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
     AddAttr<bool>("overwrite",
@@ -134,3 +65,10 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
                   ops::SaveCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu
new file mode 100644
index 0000000000..bc4478b51b
--- /dev/null
+++ b/paddle/fluid/operators/save_combine_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
new file mode 100644
index 0000000000..4ee82e17dd
--- /dev/null
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SaveCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto &inp_var_names = ctx.Inputs("X");
+    auto &inp_vars = ctx.MultiInputVar("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      PADDLE_ENFORCE(inp_vars[i] != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(inp_vars[i]->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
+      // Serialize tensors one by one
+
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = tensor.type();
+      auto out_dtype =
+          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
+    }
+    fout.close();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 4743e0d949..5594de16b6 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
-USE_NO_KERNEL_OP(save_combine);
-USE_NO_KERNEL_OP(load_combine);
+USE_CPU_ONLY_OP(save_combine);
+USE_CPU_ONLY_OP(load_combine);
 
 template <typename T, typename U>
 T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
index ccaea0eef2..d277198a2f 100644
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
-USE_NO_KERNEL_OP(save);
-USE_NO_KERNEL_OP(load);
+USE_CPU_ONLY_OP(save);
+USE_CPU_ONLY_OP(load);
 
 TEST(SaveLoadOp, CPU) {
   paddle::framework::Scope scope;
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index fcc598f4f1..338e2fbb5d 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -15,118 +15,24 @@ limitations under the License. */
 #include <stdint.h>
 #include <fstream>
 #include <numeric>
+#include <string>
+#include <vector>
 
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/operators/save_op.h"
 
 namespace paddle {
 namespace operators {
-
-// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
-// to directory specified.
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
-
-class SaveOp : public framework::OperatorBase {
+class SaveOp : public framework::OperatorWithKernel {
  public:
-  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto iname = Input("X");
-    auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
-                   iname);
-
-    if (var->IsType<framework::LoDTensor>()) {
-      SaveLodTensor(place, var);
-    } else if (var->IsType<framework::SelectedRows>()) {
-      SaveSelectedRows(scope, place, var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
-          iname);
-    }
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void SaveLodTensor(const platform::Place &place,
-                     framework::Variable *var) const {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-
-    if (FileExists(filename) && !overwrite) {
-      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &tensor = var->Get<framework::LoDTensor>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-    auto in_dtype = tensor.type();
-    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor out;
-      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-      // copy LoD info to the new tensor
-      out.set_lod(tensor.lod());
-      framework::SerializeToStream(fout, out, dev_ctx);
-    } else {
-      framework::SerializeToStream(fout, tensor, dev_ctx);
-    }
-    fout.close();
-  }
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 
-  void SaveSelectedRows(const framework::Scope &scope,
-                        const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
-    PADDLE_ENFORCE(
-        lt_var != nullptr,
-        "Can not find variable kLookupTablePath for SaveSelectedRows");
-    std::string filename = lt_var->data();
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &selectedRows = var->Get<framework::SelectedRows>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    framework::SerializeToStream(fout, selectedRows, dev_ctx);
-    fout.close();
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -154,17 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
                          "The \"file_path\" where the variable will be saved.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddOutput(LOOKUP_TABLE_PATH,
+              "(string)"
+              "for pserver: The \"kLookupTablePath\" where checkpoint notify "
+              "to save lookup table variables"
+              " to directory specified.")
+        .AsDispensable();
   }
 };
 
 class SaveOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
     auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+    ctx->SetType(LOOKUP_TABLE_PATH, var_type);
   }
 };
 
@@ -172,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {}
 };
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference,
-                  ops::SaveOpShapeInference);
+REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker,
+                  ops::SaveOpVarTypeInference, ops::SaveOpShapeInference);
+
+REGISTER_OP_CPU_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu
new file mode 100644
index 0000000000..0a778a694e
--- /dev/null
+++ b/paddle/fluid/operators/save_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
new file mode 100644
index 0000000000..642235aad5
--- /dev/null
+++ b/paddle/fluid/operators/save_op.h
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
+// to directory specified.
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
+template <typename DeviceContext, typename T>
+class SaveOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+
+    auto *input_var = ctx.InputVar("X");
+    auto iname = ctx.Inputs("X").data();
+    PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+
+    if (input_var->IsType<framework::LoDTensor>()) {
+      SaveLodTensor(ctx, place, input_var);
+    } else if (input_var->IsType<framework::SelectedRows>()) {
+      SaveSelectedRows(ctx, place, input_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
+          iname);
+    }
+  }
+
+  void SaveLodTensor(const framework::ExecutionContext &ctx,
+                     const platform::Place &place,
+                     const framework::Variable *var) const {
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+
+    if (FileExists(filename) && !overwrite) {
+      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+    auto in_dtype = tensor.type();
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      // copy LoD info to the new tensor
+      out.set_lod(tensor.lod());
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+
+  void SaveSelectedRows(const framework::ExecutionContext &ctx,
+                        const platform::Place &place,
+                        const framework::Variable *var) const {
+    framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH);
+    PADDLE_ENFORCE(
+        out_put_var != nullptr,
+        "Can not find variable kLookupTablePath for SaveSelectedRows");
+    auto *lt_var = out_put_var->GetMutable<std::string>();
+
+    std::string filename = lt_var->data();
+    VLOG(4) << "SaveSelectedRows get File name: " << filename;
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    auto &selectedRows = var->Get<framework::SelectedRows>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    framework::SerializeToStream(fout, selectedRows, dev_ctx);
+    fout.close();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 4ea77ed30d..4e4a015e18 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
 
+#include <memory>
 #include <string>
 
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$
 
 class ScaleOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto &in_var_name = op_desc.Input("X").front();
-    auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
-
-    auto out_var_name = op_desc.Output("Out").front();
-    auto *out_var = block->FindVarRecursive(out_var_name);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &in_var_name = ctx->Input("X").front();
+    auto out_var_name = ctx->Output("Out").front();
 
     if (in_var_name != out_var_name) {
-      out_var->SetType(in_var.GetType());
-      out_var->SetDataType(in_var.GetDataType());
+      ctx->SetType(out_var_name, ctx->GetType(in_var_name));
+      ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
     }
   }
 };
diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h
index bdb506885c..b2fc834c42 100644
--- a/paddle/fluid/operators/selu_op.h
+++ b/paddle/fluid/operators/selu_op.h
@@ -15,13 +15,12 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/for_range.h"
+
 namespace paddle {
 namespace operators {
 
-static HOSTDEVICE float real_exp(float x) { return expf(x); }
-static HOSTDEVICE float real_exp(double x) { return exp(x); }
-
 template <typename T>
 struct SeluFunctor {
   SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index d3dcd1f96a..cc4eedbf4d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
     PADDLE_ENFORCE(
         ctx->HasInput("X"),
         "Input(X) of SequecceEnumerate operator should not be null.");
@@ -33,13 +30,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
         "Output(X) of SequenceEnumerate operator should not be null.");
 
     const auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        x_dims.size(), 2,
-        "Input(X) of SequenceEnumerate operator's rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1,
-                      "Input(X) of SequenceEnumerate operator's 2nd "
-                      "dimension should be 1.");
-
     const auto win_size = ctx->Attrs().Get<int>("win_size");
     ctx->SetOutputDim("Out", {x_dims[0], win_size});
     ctx->ShareLoD("X", "Out");
@@ -62,6 +52,9 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
         });
     AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
         .SetDefault(0);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
+        .SetDefault(true);
     AddComment(R"DOC(
 Sequence Enumerate Operator.
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
index 18da69993b..6a1eb6e625 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
@@ -27,30 +27,47 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
     int win_size = context.Attr<int>("win_size");
-    int pad_value = context.Attr<int>("pad_value");
+    auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
 
     auto in_dims = in->dims();
-    auto in_lod = in->lod();
-
+    auto lod0 = in->lod()[0];
     PADDLE_ENFORCE_EQ(
-        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        static_cast<uint64_t>(in_dims[0]), lod0.back(),
         "The actual input data's size mismatched with LoD information.");
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), 2UL,
+        "Input(X) of SequenceEnumerate operator's rank should be 2.");
+    PADDLE_ENFORCE_EQ(in_dims[1], 1,
+                      "Input(X) of SequenceEnumerate operator's 2nd "
+                      "dimension should be 1.");
 
     // Generate enumerate sequence set
-    auto lod0 = in_lod[0];
     auto in_data = in->data<T>();
     out->Resize({in_dims[0], win_size});
+    out->set_lod(in->lod());
     auto out_data = out->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < lod0.size() - 1; ++i) {
-      for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
-        for (int word_idx = 0; word_idx < win_size; ++word_idx) {
-          size_t word_pos = idx + word_idx;
-          out_data[win_size * idx + word_idx] =
-              word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
+      int start = lod0[i];
+      int end = lod0[i + 1];
+      int copy_size = win_size < end - start + 1 ? win_size : end - start + 1;
+      int mid = end + 1 - copy_size;
+      int pad_num = win_size - copy_size;
+      copy_size *= sizeof(T);
+      for (int idx = start; idx < mid; ++idx) {
+        std::memcpy(out_data, in_data + idx, copy_size);
+        out_data += win_size;
+      }
+      for (int idx = mid; idx < end; ++idx) {
+        copy_size -= sizeof(T);
+        pad_num++;
+        std::memcpy(out_data, in_data + idx, copy_size);
+        T* pdata = out_data + copy_size / sizeof(T);
+        for (int i = 0; i < pad_num; ++i) {
+          pdata[i] = pad_value;
         }
+        out_data += win_size;
       }
     }
-    out->set_lod(in->lod());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
index cc5e982190..a9dc0a4fda 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <cub/cub.cuh>  // NOLINT
+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
 
 namespace paddle {
@@ -21,9 +22,6 @@ namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 
-__device__ __forceinline__ float real_exp(float x) { return expf(x); }
-__device__ __forceinline__ double real_exp(double x) { return exp(x); }
-
 template <typename T, int BlockDim>
 using BlockReduce = cub::BlockReduce<T, BlockDim>;
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index 2a4570ef5c..aea69de643 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "cub/cub.cuh"
+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/hostdevice.h"
@@ -21,11 +22,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-static HOSTDEVICE float real_exp(float x) { return expf(x); }
-static HOSTDEVICE float real_exp(double x) { return exp(x); }
-static HOSTDEVICE float real_log(float x) { return logf(x); }
-static HOSTDEVICE float real_log(double x) { return log(x); }
-
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
 
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
index 5efecb78d1..24a564f9ef 100644
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/device_vector.h>
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/slice_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <size_t D>
+__global__ void Padding(const paddle::platform::float16* d_out,
+                        const int* out_dims, const int* in_dims,
+                        const int* offsets, int64_t n,
+                        paddle::platform::float16* d_in) {
+  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
+    int coords[D] = {0};
+    for (int i = D - 1; i >= 0; --i) {
+      coords[i] = out_idx_tmp % out_dims[i];
+      out_idx_tmp /= out_dims[i];
+      coords[i] += offsets[i];
+    }
+
+    int64_t in_idx = 0;
+    for (int i = 0; i < D; ++i) {
+      in_idx = in_idx * in_dims[i] + coords[i];
+    }
+
+    d_in[in_idx] = d_out[out_idx];
+  }
+}
+
+template <>
+class SliceGradKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>
+    : public framework::OpKernel<paddle::platform::float16> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
+    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
+
+    auto out_dims = d_out->dims();
+    auto in_dims = d_in->dims();
+    int rank = out_dims.size();
+    std::vector<int> offsets(rank, 0);
+    auto axes = ctx.Attr<std::vector<int>>("axes");
+    auto starts = ctx.Attr<std::vector<int>>("starts");
+
+    for (size_t i = 0; i < starts.size(); ++i) {
+      if (starts[i] < 0) {
+        starts[i] += in_dims[axes[i]];
+      }
+      offsets[axes[i]] = std::max(starts[i], 0);
+    }
+
+    math::SetConstant<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>
+        set_zero;
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::CUDADeviceContext>();
+    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
+
+    int64_t numel = d_out->numel();
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
+    auto stream = ctx.cuda_device_context().stream();
+
+    auto out_shape = framework::vectorize2int(out_dims);
+    thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
+    auto in_shape = framework::vectorize2int(in_dims);
+    thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
+    thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
+    const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
+    const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
+    const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
+
+    switch (rank) {
+      case 1:
+        Padding<1><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 2:
+        Padding<2><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 3:
+        Padding<3><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 4:
+        Padding<4><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 5:
+        Padding<5><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 6:
+        Padding<6><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     slice_grad,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 7754d2bfeb..fda971b20e 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -187,7 +188,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetType("softmax_with_cross_entropy_grad");
     grad_op->SetInput("Label", Input("Label"));
     grad_op->SetInput("Softmax", Output("Softmax"));
-    grad_op->SetInput("Loss", Output("Loss"));
     grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
     grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
     grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 52b8dcc681..89aaac4cbe 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+    framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
+                          context.device_context(), logit_grad);
     T* logit_grad_data = logit_grad->data<T>();
 
     const int batch_size = logit_grad->dims()[0];
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index 0e7b1463d1..88dfebc0cf 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/split_selected_rows_op.h"
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
@@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
 
 class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
     }
   }
 };
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index e389c6a65e..dc15df2c3c 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
                         "tensor's rank.");
     }
 
-    auto out_dims = GetOutputShape(axes, x_dims);
+    auto out_dims = GetOutputShape(axes, x_dims, false);
     ctx->SetOutputDim("Out", out_dims);
     if (x_dims[0] == out_dims[0]) {
       // Only pass LoD when the first dimension of output and Input(X)
@@ -50,7 +50,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
   }
 
   static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
-                                        const framework::DDim &in_dims) {
+                                        const framework::DDim &in_dims,
+                                        bool is_runtime) {
     size_t num_squeeze_dims = squeeze_dims.size();
     int cnt_squeezed_dims = 0;
     bool should_squeeze[9] = {false};
@@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
         // Check current index, the upper limit has beed checked in line 36.
         PADDLE_ENFORCE(current >= 0,
                        "Invalid axis, the negative axis is out of range.");
-        PADDLE_ENFORCE(in_dims[current] == 1,
-                       "Invalid axis index, the axis that will be squeezed "
-                       "should be equal to 1.");
+
+        if (is_runtime) {
+          PADDLE_ENFORCE(in_dims[current] == 1,
+                         "Invalid axis index, the axis that will be squeezed "
+                         "should be equal to 1.");
+        }
 
         if (!(should_squeeze[current])) {
           ++cnt_squeezed_dims;
@@ -94,6 +98,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
   }
 };
 
+// TODO(paddle-dev): Should use OpKernel.
 class SqueezeOp : public framework::OperatorBase {
  public:
   using OperatorBase::OperatorBase;
@@ -103,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto &axes = Attr<std::vector<int>>("axes");
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
-    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
+    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true);
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
@@ -223,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto &axes = Attr<std::vector<int>>("axes");
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
-    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
+    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true);
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 7abfbbd3cb..1391148ccf 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/sum_op.h"
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -159,24 +160,20 @@ the LoD information with the first input.
 
 class SumOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto& inputs = op_desc.Input("X");
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& inputs = ctx->Input("X");
     auto var_type = framework::proto::VarType::SELECTED_ROWS;
-    for (auto& name : op_desc.Input("X")) {
-      VLOG(10) << name << " "
-               << block->FindRecursiveOrCreateVar(name).GetType();
+    for (auto& name : ctx->Input("X")) {
+      VLOG(10) << name << " " << ctx->GetType(name);
     }
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string& name) {
-          return block->FindRecursiveOrCreateVar(name).GetType() ==
-                 framework::proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [ctx](const std::string& name) {
+          return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR;
         });
 
-    auto is_tensor_array = [block](const std::string& name) {
-      return block->FindRecursiveOrCreateVar(name).GetType() ==
-             framework::proto::VarType::LOD_TENSOR_ARRAY;
+    auto is_tensor_array = [ctx](const std::string& name) {
+      return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY;
     };
 
     bool any_input_is_tensor_array =
@@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
       if (!all_inputs_are_tensor_array) {
         std::ostringstream os;
         for (auto& each : inputs) {
-          os << "    " << each << " type is "
-             << block->FindRecursiveOrCreateVar(each).GetType() << "\n";
+          os << "    " << each << " type is " << ctx->GetType(each) << "\n";
         }
         PADDLE_ENFORCE(all_inputs_are_tensor_array,
                        "Not all inputs are tensor array:\n%s", os.str());
@@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
       var_type = framework::proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    out_var.SetType(var_type);
-    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
-    out_var.SetDataType(in_var.GetDataType());
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, var_type);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front()));
   }
 };
 
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc
new file mode 100644
index 0000000000..d6cf27fd77
--- /dev/null
+++ b/paddle/fluid/operators/sync_batch_norm_op.cc
@@ -0,0 +1,20 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/batch_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
+REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu
new file mode 100644
index 0000000000..a5984bfaaa
--- /dev/null
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu
@@ -0,0 +1,452 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  for (int k = blockIdx.x; k < C; k += gridDim.x) {
+    T x_sum = 0;
+    T x2_sum = 0;
+    for (int i = threadIdx.x; i < N * M; i += BlockDim) {
+      int id = layout == framework::DataLayout::kNCHW
+                   ? (i / M) * C * M + k * M + i % M
+                   : i * C + k;
+      T x_in = x[id];
+      x_sum += x_in;
+      x2_sum += x_in * x_in;
+    }
+    __syncthreads();
+    T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum());
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      mean_var[k] = out / (N * M);
+    }
+    out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum());
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      mean_var[k + C] = out / (N * M);
+    }
+  }
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    mean_var[2 * C] = static_cast<T>(1.0);
+  }
+}
+
+template <typename T>
+__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev,
+                                     const int C, const T momentum,
+                                     const double epsilon, T *sv_mean_data,
+                                     T *sv_inv_var_data, T *moving_means,
+                                     T *moving_variances) {
+  // sync stats across multi-devices
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < C; i += stride) {
+    T mean = means[i] / (*num_dev);
+    T var = variances[i] / (*num_dev);
+    var = var - mean * mean;
+
+    // sync stats
+    sv_mean_data[i] = mean;
+    sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon);
+    variances[i] = var;
+
+    // moving stats
+    moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum);
+    moving_variances[i] =
+        moving_variances[i] * momentum + var * (1. - momentum);
+  }
+}
+
+template <typename T, framework::DataLayout layout>
+static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias,
+                                    const T *mean, const T *variance,
+                                    const double epsilon, const int C,
+                                    const int M, const int num, T *y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
+    y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c];
+  }
+}
+
+template <typename DeviceContext, typename T>
+class SyncBatchNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const std::string layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout layout = framework::StringToDataLayout(layout_str);
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    PADDLE_ENFORCE(
+        !use_global_stats,
+        "sync_batch_norm doesn't support to set use_global_stats True. ",
+        "Please use batch_norm in this case.");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    int N, C, H, W, D;
+    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
+    int x_numel = x->numel();
+
+    const T *x_d = x->data<T>();
+    const T *s_d = ctx.Input<Tensor>("Scale")->data<T>();
+    const T *b_d = ctx.Input<Tensor>("Bias")->data<T>();
+
+    auto *y = ctx.Output<Tensor>("Y");
+    T *y_d = y->mutable_data<T>(ctx.GetPlace());
+
+    const T *mean_data = nullptr;
+    const T *var_data = nullptr;
+
+    auto &dev_ctx = ctx.cuda_device_context();
+    auto stream = dev_ctx.stream();
+    auto *comm = dev_ctx.nccl_comm();
+    const int block = 512;
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+
+    paddle::memory::AllocationPtr alloc_ptr{nullptr};
+
+    if (is_test) {
+      const auto *est_mean = ctx.Input<Tensor>("Mean");
+      const auto *est_var = ctx.Input<Tensor>("Variance");
+      mean_data = est_mean->data<T>();
+      var_data = est_var->data<T>();
+    } else {
+      auto &allocator =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+      // x, x^2, 1, here 1 is used to calc device num
+      // device num also can be got from platform::DeviceContextPool
+      const int bytes = (C * 2 + 1) * sizeof(T);
+      alloc_ptr = allocator.Allocate(bytes);
+
+      T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());
+      const int threads = 256;
+      int grid = std::min(C, (max_threads + threads - 1) / threads);
+      if (layout == framework::DataLayout::kNCHW) {
+        KeLocalStats<
+            T, threads,
+            framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
+            x_d, N, H * W * D, C, stats);
+      } else {
+        KeLocalStats<
+            T, threads,
+            framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
+            x_d, N, H * W * D, C, stats);
+      }
+
+      Tensor c_g_st;
+      T *c_g_st_d = c_g_st.mutable_data<T>({2 * C + 1}, platform::CPUPlace());
+      auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+      memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);
+
+      int dtype = platform::ToNCCLDataType(x->type());
+      // In-place operation
+      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+          stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+          comm, stream));
+
+      // moving mean/variance
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      T *est_mean_data = mean_out->mutable_data<T>(ctx.GetPlace());
+      T *est_var_data = variance_out->mutable_data<T>(ctx.GetPlace());
+
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
+      T *sv_mean_data = saved_mean->mutable_data<T>(ctx.GetPlace());
+      T *sv_inv_var_data = saved_inv_variance->mutable_data<T>(ctx.GetPlace());
+
+      // Note, Input('Mean')/Input('Variance') share variable with
+      // Output('MeanOut')/Output('VarianceOut')
+      KeSyncAndMovingStats<T><<<(C + block - 1) / block, block, 0, stream>>>(
+          stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data,
+          sv_inv_var_data, est_mean_data, est_var_data);
+
+      mean_data = sv_mean_data;
+      var_data = stats + C;
+    }
+
+    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
+    if (layout == framework::DataLayout::kNCHW) {
+      KeNormAffine<T,
+                   framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+          x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
+          y_d);
+    } else {
+      KeNormAffine<T,
+                   framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+          x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
+          y_d);
+    }
+  }
+};
+
+template <typename T, const int BlockDim, framework::DataLayout layout>
+__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means,
+                                     int N, int M, int C, T *sum_dy_prod) {
+  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  for (int k = blockIdx.x; k < C; k += gridDim.x) {
+    T sum1 = 0;
+    T sum2 = 0;
+    T mean = means[k];
+    for (int i = threadIdx.x; i < N * M; i += blockDim.x) {
+      int id = layout == framework::DataLayout::kNCHW
+                   ? (i / M) * C * M + k * M + i % M
+                   : i * C + k;
+      T g = dy[id];
+      sum1 += g;
+      sum2 += g * (x[id] - mean);
+    }
+
+    __syncthreads();
+    T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum());
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      sum_dy_prod[k] = out;
+    }
+    out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum());
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      sum_dy_prod[k + C] = out;
+    }
+  }
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    sum_dy_prod[2 * C] = static_cast<T>(1.0);
+  }
+}
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x,
+                                             const T *mean,
+                                             const T *inv_variance,
+                                             const double epsilon, const int N,
+                                             const int C, const int HxW,
+                                             T *dscale, T *dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T ds_sum = static_cast<T>(0);
+    T db_sum = static_cast<T>(0);
+
+    T inv_var_i = inv_variance[i];
+    T mean_i = mean[i];
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int id = layout == framework::DataLayout::kNCHW
+                         ? ((j / HxW) * C + i) * HxW + (j % HxW)
+                         : j * outer_size + i;
+      ds_sum += dy[id] * (x[id] - mean_i);
+      db_sum += dy[id];
+    }
+    __syncthreads();
+    double os = BlockReduce(temp_storage)
+                    .Reduce(static_cast<double>(ds_sum), cub::Sum());
+    __syncthreads();
+    double ob = BlockReduce(temp_storage)
+                    .Reduce(static_cast<double>(db_sum), cub::Sum());
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      dscale[i] = static_cast<T>(os * inv_var_i);
+      dbias[i] = static_cast<T>(ob);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, framework::DataLayout layout>
+static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta,
+                                        const T *mean, const T *inv_variance,
+                                        const T *g_sum_dy,
+                                        const T *g_sum_dy_prod,
+                                        const T *num_dev, const double epsilon,
+                                        const int C, const int HxW,
+                                        const int num, T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  T scale = static_cast<T>(C) / num;
+  T dev_num = num_dev[0];
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    T inv_var = inv_variance[c];
+    T s_d = beta[c];
+    T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var *
+             (inv_var * inv_var);
+    T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var;
+
+    dx[i] =
+        dy[i] * s_d * inv_var + gmean * scale + gvar * scale * (x[i] - mean[c]);
+  }
+}
+
+// Deriving the Gradient for the Backward Pass of Batch Normalization
+// https://kevinzakka.github.io/2016/09/14/batch_normalization/
+template <typename DeviceContext, typename T>
+class SyncBatchNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    const std::string layout_str = ctx.Attr<std::string>("data_layout");
+
+    const DataLayout layout = framework::StringToDataLayout(layout_str);
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+
+    const auto &x_dims = x->dims();
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    int N, C, H, W, D;
+    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    if (d_scale && d_bias) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      d_bias->mutable_data<T>(ctx.GetPlace());
+    }
+    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
+
+    std::vector<int> dims;
+    std::vector<int> strides;
+    if (layout == DataLayout::kNCHW) {
+      dims = {N, C, H, W, D};
+      strides = {C * H * W * D, H * W * D, W * D, D, 1};
+    } else {
+      dims = {N, C, H, W, D};
+      strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    }
+
+    const T *x_d = x->data<T>();
+    const T *dy_d = d_y->data<T>();
+
+    auto &dev_ctx = ctx.cuda_device_context();
+    auto stream = dev_ctx.stream();
+    auto *comm = dev_ctx.nccl_comm();
+
+    const T *saved_mean = ctx.Input<Tensor>("SavedMean")->data<T>();
+    const T *saved_inv_var = ctx.Input<Tensor>("SavedVariance")->data<T>();
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    const int bytes = (C * 2 + 1) * sizeof(T);
+    auto alloc_ptr = allocator.Allocate(bytes);
+    T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());
+
+    const int threads = 256;
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    int grid = std::min(C, (max_threads + threads - 1) / threads);
+    int x_numel = x->numel();
+    int fsize = H * W * D;
+
+    if (layout == framework::DataLayout::kNCHW) {
+      KeBackwardLocalStats<
+          T, threads,
+          framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
+          dy_d, x_d, saved_mean, N, fsize, C, stats);
+    } else {
+      KeBackwardLocalStats<
+          T, threads,
+          framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
+          dy_d, x_d, saved_mean, N, fsize, C, stats);
+    }
+    int dtype = platform::ToNCCLDataType(x->type());
+    // In-place operation
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
+
+    const int block = 512;
+    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
+    if (layout == framework::DataLayout::kNCHW) {
+      if (d_scale && d_bias) {
+        KeBNBackwardScaleBias<
+            T, threads,
+            framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
+            dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
+            d_scale->data<T>(), d_bias->data<T>());
+      }
+      if (d_x) {
+        KeBNBackwardData<
+            T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+            dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
+            stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
+            d_x->data<T>());
+      }
+    } else {
+      if (d_scale && d_bias) {
+        KeBNBackwardScaleBias<
+            T, threads,
+            framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
+            dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
+            d_scale->data<T>(), d_bias->data<T>());
+      }
+      if (d_x) {
+        KeBNBackwardData<
+            T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+            dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
+            stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
+            d_x->data<T>());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm_grad,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 58a74ec2c1..2b83c42f20 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
 class LoDTensorArray2TensorGradInferVarType
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output(framework::GradVarName("X"))) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
     }
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index a8c86de9f9..6cf3e65e00 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 
 class TensorRTEngineInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 9e77f7252d..db763a051d 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
     PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
-    PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
-                      "input must have >= k columns");
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
+                        "input must have >= k columns");
+    }
 
     framework::DDim dims = input_dims;
     dims[dims.size() - 1] = k;
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index e3132ae76f..bb6a1c5b16 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max].
 
 class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Out").front();
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output("Out").front();
     auto var_data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
+        boost::get<int>(ctx->GetAttr("dtype")));
 
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) {
-      out_var.SetType(framework::proto::VarType::LOD_TENSOR);
+    if (ctx->GetType(out_var_name) !=
+        framework::proto::VarType::SELECTED_ROWS) {
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    out_var.SetDataType(var_data_type);
+    ctx->SetDataType(out_var_name, var_data_type);
   }
 };
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 920b43b2b1..d54a3e8670 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool(
   for (auto& p : places) {
     set.insert(p);
   }
-
   for (auto& p : set) {
     if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
@@ -317,6 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_stream_.reset();
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+#if !defined(_WIN32)
+  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+#endif
 }
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index d376f90ad5..9dbc72f561 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#if !defined(__APPLE__) && !defined(_WIN32)
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 
@@ -265,6 +268,14 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
+#if !defined(_WIN32)
+  /*! \brief  Return nccl communicators. */
+  ncclComm_t nccl_comm() const { return nccl_comm_; }
+
+  /*! \brief  Set nccl communicators. */
+  void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
+#endif
+
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
     callback();
@@ -289,6 +300,15 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
 
+#if !defined(_WIN32)
+  // NCCL communicator (single process version) for NCCL collective operations.
+  // NCCL collective operations provides fast collectives over multiple GPUs
+  // both within and across nodes.
+  // But, this collectives is used for collectives over multiple GPUs within
+  // nodes.
+  ncclComm_t nccl_comm_{nullptr};
+#endif
+
   int compute_capability_;
   int runtime_version_;
   int driver_version_;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index b084f1a649..8458b17f82 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_tracer.h"
 
 #include <deque>
 #include <forward_list>
@@ -30,6 +29,8 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
                                       stream_id, correlation_id, bytes});
   }
 
+  void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                        const Place &place, const std::string &alloc_in,
+                        const std::string &free_in, int64_t thread_id) {
+    if (0 == start_ns || 0 == end_ns) {
+      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
+      return;
+    }
+    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
+        nullptr;
+    if (local_mem_info_record == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      mem_info_record_.emplace_front();
+      local_mem_info_record = &mem_info_record_.front();
+    }
+    local_mem_info_record->emplace_front(MemInfoRecord{
+        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
+  }
+
   void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
                             uint64_t end_ns, int64_t device_id,
                             int64_t thread_id, uint32_t correlation_id) {
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_.clear();
     for (auto &tmp : correlations_pairs) tmp.clear();
     for (auto &tmp : cpu_records_) tmp.clear();
+    for (auto &tmp : mem_info_record_) tmp.clear();
     for (auto &tmp : active_kind_records_) tmp.clear();
   }
 
@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
-    if (correlations_.empty())
-      for (auto &tmp : correlations_pairs)
+    if (correlations_.empty()) {
+      for (auto &tmp : correlations_pairs) {
         for (auto &pair : tmp) correlations_[pair.first] = pair.second;
+      }
+    }
+
     for (const KernelRecord &r : kernel_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_device_id(r.device_id);
     }
     VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
+
     for (auto &tmp : cpu_records_) {
       for (const CPURecord &r : tmp) {
         auto *event = profile_pb.add_events();
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
         event->set_device_id(r.device_id);
       }
     }
+
     for (auto &tmp : active_kind_records_) {
       for (const ActiveKindRecord &r : tmp) {
         auto *event = profile_pb.add_events();
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
     VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
+
+    for (auto &tmp : mem_info_record_) {
+      for (const auto &r : tmp) {
+        auto *event = profile_pb.add_mem_events();
+        event->set_device_id(0);
+        if (platform::is_cpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CPUPlace);
+        } else if (platform::is_gpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPlace);
+          event->set_device_id(
+              boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
+        } else if (platform::is_cuda_pinned_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPinnedPlace);
+        } else {
+          PADDLE_THROW("The current place is not supported.");
+        }
+        event->set_alloc_in(r.alloc_in);
+        event->set_free_in(r.free_in);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_bytes(r.bytes);
+        event->set_thread_id(r.thread_id);
+      }
+    }
+
     std::ofstream profile_f;
     profile_f.open(profile_path,
                    std::ios::out | std::ios::trunc | std::ios::binary);
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
   std::forward_list<KernelRecord> kernel_records_;
   std::forward_list<MemRecord> mem_records_;
   std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
   std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
   std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
       correlations_pairs;
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
   return annotation_stack.back();
 }
 std::string CurAnnotationName() {
-  if (annotation_stack.empty()) return "";
+  if (annotation_stack.empty()) return "Unknown";
   return annotation_stack.back()->name();
 }
 
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index a8f1d89383..85168a046f 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
@@ -47,6 +48,7 @@ class DeviceTracer {
     int64_t stream_id;
     uint32_t correlation_id;
   };
+
   struct CPURecord {
     std::string name;
     uint64_t start_ns;
@@ -54,6 +56,7 @@ class DeviceTracer {
     int64_t device_id;
     int64_t thread_id;
   };
+
   struct MemRecord {
     std::string name;
     uint64_t start_ns;
@@ -63,6 +66,17 @@ class DeviceTracer {
     uint32_t correlation_id;
     uint64_t bytes;
   };
+
+  struct MemInfoRecord {
+    uint64_t start_ns;
+    uint64_t end_ns;
+    size_t bytes;
+    Place place;
+    int64_t thread_id;
+    std::string alloc_in;
+    std::string free_in;
+  };
+
   struct ActiveKindRecord {
     std::string name;
     uint64_t start_ns;
@@ -71,6 +85,7 @@ class DeviceTracer {
     int64_t thread_id;
     uint32_t correlation_id;
   };
+
   virtual ~DeviceTracer() {}
   // Needs to be called once before use.
   virtual void Enable() = 0;
@@ -97,6 +112,12 @@ class DeviceTracer {
                                     int64_t thread_id,
                                     uint32_t correlation_id) = 0;
 
+  virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
+                                size_t bytes, const Place& place,
+                                const std::string& alloc_in,
+                                const std::string& free_in,
+                                int64_t thread_id) = 0;
+
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.
   virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h
index 2dcf966754..e9bdb82a50 100644
--- a/paddle/fluid/platform/event.h
+++ b/paddle/fluid/platform/event.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
 #endif
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -64,5 +66,36 @@ class Event {
 #endif
 #endif
 };
+
+class MemEvent {
+ public:
+  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
+           Place place, int64_t thread_id, const std::string& annotation)
+      : type_(type),
+        start_ns_(start_ns),
+        end_ns_(end_ns),
+        bytes_(bytes),
+        place_(place),
+        thread_id_(thread_id),
+        annotation_(annotation) {}
+
+  const EventType& type() const { return type_; }
+  uint64_t start_ns() const { return start_ns_; }
+  uint64_t end_ns() const { return end_ns_; }
+  size_t bytes() const { return bytes_; }
+  Place place() const { return place_; }
+  int64_t thread_id() const { return thread_id_; }
+  const std::string& annotation() const { return annotation_; }
+
+ private:
+  EventType type_;
+  uint64_t start_ns_ = 0;
+  uint64_t end_ns_ = 0;
+  size_t bytes_;
+  Place place_;
+  int64_t thread_id_;
+  std::string annotation_;
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 400a6d7bfa..47cca879b4 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/gpu_info.h"
-
 #include <algorithm>
 #include <cstdlib>
 #include <string>
@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
 DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "Allocate a trunk of gpu memory that is this fraction of the "
               "total gpu memory size. Future memory usage will be allocated "
@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a trunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "truck. If the trunk doesn't have enough gpu memory, additional "
+    "trunks of the gpu memory will be requested from gpu with size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional trunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
+
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+
+size_t GpuInitAllocSize() {
+  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
+    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
+    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
+  }
+
+  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
+  // fraction
   size_t total = 0;
   size_t available = 0;
 
   GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
 
-  // Reserve the rest for page tables, etc.
-  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuReallocSize() {
+  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
+    // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
+    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
+  }
+
+  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
+  // by fraction
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
 }
 
 size_t GpuMinChunkSize() {
@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() {
   GpuMemoryUsage(&available, &total);
   VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
            << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(0.05 * total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
       std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
                total - reserving);
 
-  // Reserving the rest memory for page tables, etc.
-
-  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
-                                          (total - reserving));
+  size_t allocating = GpuMaxAllocSize();
 
   PADDLE_ENFORCE_LE(allocating, available,
                     "Insufficient GPU memory to allocation.");
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 1e1ab2503f..d4be7ac97b 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
+//! Get the initial allocation size of current GPU device.
+size_t GpuInitAllocSize();
+
+//! Get the re-allocation size of current GPU device.
+size_t GpuReallocSize();
+
 //! Get the minimum chunk size for GPU buddy allocator.
 size_t GpuMinChunkSize();
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 4dcf7e7904..d53a4029e1 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string.h>  // for strdup
 #include <algorithm>
+#include <memory>
+#include <set>
 #include <stdexcept>
 #include <string>
 
@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
   platform::DeviceTemporaryAllocator::Init();
+
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 6ae21ee829..b8b14b3d15 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -16,10 +16,13 @@
 #pragma once
 
 #include <stdio.h>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
 #include <typeindex>
+#include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -77,6 +80,7 @@ struct NCCLContext {
       : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
 
   cudaStream_t stream() const { return ctx_->stream(); }
+  ncclComm_t comm() const { return comm_; }
 
   int device_id() const {
     return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
@@ -101,9 +105,6 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
 
-    if (places.size() <= 1 && num_trainers == 1) {
-      return;
-    }
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
@@ -123,8 +124,8 @@ struct NCCLContextMap {
           } else {
             rank = trainer_id;
           }
-          VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
-                   << "gpu id: " << gpu_id;
+          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks
+                  << " gpu id: " << gpu_id;
           PADDLE_ENFORCE(cudaSetDevice(gpu_id));
           PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
               comms.get() + i, nranks, *nccl_id, rank));
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 9a285a6b53..6d055a4421 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
-
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -21,6 +20,8 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <random>
 #include <string>
+#include <vector>
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 namespace paddle {
 namespace platform {
 
-struct EventList;
-
 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
 std::mutex profiler_mu;
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
 // The global mutex
 static std::mutex g_all_event_lists_mutex;
 // The total event lists of all threads
-static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
 // The thread local event list only can be accessed by the specific thread
-static thread_local std::shared_ptr<EventList> g_event_list;
-
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(Event);
-  constexpr static size_t kEventAlign = alignof(Event);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  Event* Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-    return &event_blocks.front().back();
-  }
-
-  std::vector<Event> Reduce() {
-    std::vector<Event> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
+static thread_local std::shared_ptr<EventList<Event>> g_event_list;
 
-  void Clear() { event_blocks.clear(); }
-
-  std::forward_list<std::vector<Event>> event_blocks;
-};
+static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
+static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
+static std::mutex g_all_mem_event_lists_mutex;
+static thread_local int32_t g_mem_thread_id;
+static uint32_t g_mem_next_thread_id = 0;
 
 inline uint64_t GetTimeInNsec() {
   using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
   cpu_ns_ = GetTimeInNsec();
 }
 
-const EventType& Event::type() const { return type_; }
+const EventType &Event::type() const { return type_; }
 
-double Event::CpuElapsedMs(const Event& e) const {
+double Event::CpuElapsedMs(const Event &e) const {
   return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
 }
 
-double Event::CudaElapsedMs(const Event& e) const {
+double Event::CudaElapsedMs(const Event &e) const {
 #ifdef PADDLE_WITH_CUPTI
   return gpu_ns_ / 1000000.0;
 #else
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
 #endif
 }
 
-inline EventList& GetEventList() {
+inline EventList<MemEvent> &GetMemEventList() {
+  if (!g_mem_event_list) {
+    g_mem_event_list = std::make_shared<EventList<MemEvent>>();
+    std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+    g_mem_thread_id = g_mem_next_thread_id++;
+    g_all_mem_event_lists.emplace_front(g_mem_event_list);
+  }
+  return *g_mem_event_list;
+}
+
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place &place, const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
+                           place, g_mem_thread_id, annotation);
+}
+
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place &place, const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
+                           g_mem_thread_id, annotation);
+}
+
+inline EventList<Event> &GetEventList() {
   if (!g_event_list) {
     std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-    g_event_list = std::make_shared<EventList>();
+    g_event_list = std::make_shared<EventList<Event>>();
     g_thread_id = g_next_thread_id++;
     g_all_event_lists.emplace_front(g_event_list);
     RecoreCurThreadId(g_thread_id);
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
   return *g_event_list;
 }
 
-void Mark(const std::string& name) {
+void Mark(const std::string &name) {
   GetEventList().Record(EventType::kMark, name, g_thread_id);
 }
 
-Event* PushEvent(const std::string& name) {
+Event *PushEvent(const std::string &name) {
   return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
 }
 
-void PopEvent(const std::string& name) {
+void PopEvent(const std::string &name) {
   GetEventList().Record(EventType::kPopRange, name, g_thread_id);
 }
 
-RecordEvent::RecordEvent(const std::string& name)
+RecordEvent::RecordEvent(const std::string &name)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
   // lock is not needed, the code below is thread-safe
 
   is_enabled_ = true;
   name_ = name;
-  Event* e = PushEvent(name_);
+  Event *e = PushEvent(name_);
   // Maybe need the same push/pop behavior.
   SetCurAnnotation(e);
 }
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
 RecordEvent::~RecordEvent() {
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   // lock is not needed, the code below is thread-safe
-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                           BlockDepth(), g_thread_id);
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
   PopEvent(name_);
 }
 
-RecordRPCEvent::RecordRPCEvent(const std::string& name) {
+MemEvenRecorder MemEvenRecorder::recorder;
+
+void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
+                                    size_t size) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto &events = address_memevent_[place];
+  PADDLE_ENFORCE(events.count(ptr) == 0, "");
+  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
+                          new MemEvenRecorder::RecordMemEvent(place, size)));
+}
+
+void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto &events = address_memevent_[place];
+  auto iter = events.find(ptr);
+  // The ptr maybe not in address_memevent
+  if (iter != events.end()) {
+    events.erase(iter);
+  }
+}
+
+void MemEvenRecorder::Flush() {
+  std::lock_guard<std::mutex> guard(mtx_);
+  address_memevent_.clear();
+}
+
+MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
+                                                size_t bytes)
+    : place_(place),
+      bytes_(bytes),
+      start_ns_(PosixInNsec()),
+      alloc_in_(CurAnnotationName()) {
+  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
+}
+
+MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
+  DeviceTracer *tracer = GetDeviceTracer();
+  end_ns_ = PosixInNsec();
+
+  auto annotation_free = CurAnnotationName();
+  if (tracer) {
+    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
+                             annotation_free, g_mem_thread_id);
+  }
+  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
+}
+
+RecordRPCEvent::RecordRPCEvent(const std::string &name) {
   if (FLAGS_enable_rpc_profiler) {
     event_.reset(new platform::RecordEvent(name));
   }
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
 RecordBlock::~RecordBlock() {
   // lock is not needed, the code below is thread-safe
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
   if (tracer) {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
 void ResetProfiler() {
   SynchronizeAllDevice();
   GetDeviceTracer()->Reset();
+  MemEvenRecorder::Instance().Flush();
   std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
   for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
        ++it) {
     (*it)->Clear();
   }
+  for (auto it = g_all_mem_event_lists.begin();
+       it != g_all_mem_event_lists.end(); ++it) {
+    (*it)->Clear();
+  }
 }
 
 std::vector<std::vector<Event>> GetAllEvents() {
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
   return result;
 }
 
+std::vector<std::vector<MemEvent>> GetMemEvents() {
+  std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+  std::vector<std::vector<MemEvent>> result;
+  for (auto &it : g_all_mem_event_lists) {
+    result.emplace_back((*it).Reduce());
+  }
+  return result;
+}
+
 // The information of each event given in the profiling report
 struct EventItem {
   std::string name;
@@ -263,8 +319,8 @@ struct EventItem {
 };
 
 // Print results
-void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
-                   const std::string& sorted_domain, const size_t name_width,
+void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
+                   const std::string &sorted_domain, const size_t name_width,
                    const size_t data_width, bool merge_thread) {
   // Output header information
   std::cout << "\n------------------------->"
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
             << std::setw(data_width) << "Ratio." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
     for (size_t j = 0; j < events_table[i].size(); ++j) {
-      const EventItem& event_item = events_table[i][j];
+      const EventItem &event_item = events_table[i][j];
       std::cout << std::setw(name_width) << event_item.name
                 << std::setw(data_width) << event_item.calls
                 << std::setw(data_width) << event_item.total_time;
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
 }
 
 // Parse the event list and output the profiling report
-void ParseEvents(const std::vector<std::vector<Event>>& events,
+void ParseEvents(const std::vector<std::vector<Event>> &events,
                  bool merge_thread,
                  EventSortingKey sorted_by = EventSortingKey::kDefault) {
   if (g_state == ProfilerState::kDisabled) return;
   if (merge_thread && events.size() < 2) return;
 
   std::string sorted_domain;
-  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
+  std::function<bool(const EventItem &, const EventItem &)> sorted_func;
   switch (sorted_by) {
     case EventSortingKey::kCalls:
       sorted_domain = "number of calls";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.calls > b.calls;
       };
       break;
     case EventSortingKey::kTotal:
       sorted_domain = "total time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.total_time > b.total_time;
       };
       break;
     case EventSortingKey::kMin:
       sorted_domain = "minimum time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.min_time > b.min_time;
       };
       break;
     case EventSortingKey::kMax:
       sorted_domain = "maximum time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.max_time > b.max_time;
       };
       break;
     case EventSortingKey::kAve:
       sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.ave_time > b.ave_time;
       };
       break;
     case EventSortingKey::kGPUTime:
       sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.gpu_time > b.gpu_time;
       };
       break;
     case EventSortingKey::kCPUTime:
       sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
         return a.cpu_time > b.cpu_time;
       };
       break;
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
       sorted_domain = "event first end time";
   }
 
-  const std::vector<std::vector<Event>>* analyze_events;
+  const std::vector<std::vector<Event>> *analyze_events;
   std::vector<std::vector<Event>> merged_events_list;
   if (merge_thread) {
     std::vector<Event> merged_events;
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
       }
     }
     // average time
-    for (auto& item : event_items) {
+    for (auto &item : event_items) {
       item.ave_time = item.total_time / item.calls;
       item.ratio = item.total_time / total;
     }
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
                 merge_thread);
 }
 
+struct MemoryProfierReport {
+  size_t alloc_times{0};
+  size_t alloc_size{0};
+  size_t free_times{0};
+  size_t free_size{0};
+};
+
+// Print results
+void PrintMemProfiler(
+    const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+        &annotation_report,
+    const size_t name_width, const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "    Memory Profiling Report     "
+            << "<-------------------------\n\n";
+
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Alloc Calls" << std::setw(data_width) << "Size(MB)"
+            << std::setw(data_width) << "Free Calls" << std::setw(data_width)
+            << "Size(MB)" << std::endl;
+
+  for (auto &tmp : annotation_report) {
+    for (auto &e : tmp.second) {
+      auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
+      std::cout << std::setw(name_width) << event_name;
+      std::cout << std::setw(data_width) << e.second.alloc_times;
+      std::cout << std::setw(data_width)
+                << e.second.alloc_size / (1024.0 * 1024.0);
+      std::cout << std::setw(data_width) << e.second.free_times;
+      std::cout << std::setw(data_width)
+                << e.second.free_size / (1024.0 * 1024.0) << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
+
+// parse memory events
+void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
+  if (g_state == ProfilerState::kDisabled) return;
+  // place, annotation, alloc times,  alloc size
+  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+      annotation_report;
+
+  for (auto &tmp : events) {
+    for (auto &e : tmp) {
+      if (e.type() == EventType::kPushRange) {
+        annotation_report[e.place()][e.annotation()].alloc_times += 1;
+        annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
+      } else if (e.type() == EventType::kPopRange) {
+        annotation_report[e.place()][e.annotation()].free_times += 1;
+        annotation_report[e.place()][e.annotation()].free_size += e.bytes();
+      }
+    }
+  }
+  PrintMemProfiler(annotation_report, 55, 18);
+}
+
 void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string& profile_path) {
+                     const std::string &profile_path) {
   SynchronizeAllDevice();
+  MemEvenRecorder::Instance().Flush();
+
   std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_");
 
-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
   if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
   std::vector<std::vector<Event>> all_events = GetAllEvents();
   ParseEvents(all_events, true, sorted_key);
   ParseEvents(all_events, false, sorted_key);
+  if (VLOG_IS_ON(5)) {
+    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
+    ParseMemEvents(all_mem_events);
+  }
+
   ResetProfiler();
   g_state = ProfilerState::kDisabled;
   should_send_profile_state = true;
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index aec0ae3429..8d11855b70 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -15,10 +15,17 @@ limitations under the License. */
 #pragma once
 #include <forward_list>
 #include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -34,8 +41,41 @@ enum ProfilerState {
 
 void Mark(const std::string& name);
 
-Event* PushEvent(const std::string& name);
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place& place);
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place& place);
+
+struct MemEvenRecorder {
+ public:
+  void PushMemRecord(const void* ptr, const Place& place, size_t size);
+  void PopMemRecord(const void* ptr, const Place& place);
+  void Flush();
+  static MemEvenRecorder& Instance() { return recorder; }
 
+ private:
+  struct RecordMemEvent {
+    RecordMemEvent(const Place& place, size_t bytes);
+    ~RecordMemEvent();
+
+    Place place_;
+    size_t bytes_;
+    uint64_t start_ns_;
+    uint64_t end_ns_;
+    std::string alloc_in_;
+    std::string free_in_;
+  };
+
+  static MemEvenRecorder recorder;
+  std::map<Place,
+           std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
+      address_memevent_;
+  std::mutex mtx_;
+  MemEvenRecorder() {}
+  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
+};
+
+Event* PushEvent(const std::string& name);
 void PopEvent(const std::string& name);
 
 struct RecordEvent {
@@ -87,6 +127,41 @@ enum EventSortingKey {
   kGPUTime
 };
 
+template <typename T>
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(T);
+  constexpr static size_t kEventAlign = alignof(T);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  T* Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+    return &event_blocks.front().back();
+  }
+
+  std::vector<T> Reduce() {
+    std::vector<T> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<T>> event_blocks;
+};
+
 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);
 
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index e761d7b266..cfa3c6906f 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -34,8 +34,25 @@ message Event {
   optional string detail_info = 9;
 }
 
+message MemEvent {
+  enum Place {
+    CUDAPlace = 0;
+    CPUPlace = 1;
+    CUDAPinnedPlace = 2;
+  }
+  optional uint64 start_ns = 1;
+  optional uint64 end_ns = 2;
+  optional uint64 bytes = 3;
+  optional Place place = 4;
+  optional uint64 thread_id = 5;
+  optional uint32 device_id = 6;
+  optional string alloc_in = 7;
+  optional string free_in = 8;
+}
+
 message Profile {
   repeated Event events = 1;
   optional uint64 start_ns = 2;
   optional uint64 end_ns = 3;
+  repeated MemEvent mem_events = 4;
 }
\ No newline at end of file
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 4ac5b83c56..0991eff0fd 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,11 +1,11 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
   feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-  tracer analysis_predictor)
+  tracer analysis_predictor imperative_profiler)
 
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
 
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index aeabed19ab..e9ed4e1644 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -13,10 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/imperative.h"
+
+#include <pybind11/chrono.h>
+#include <pybind11/complex.h>
+#include <pybind11/functional.h>
+#include <pybind11/stl.h>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+#include "paddle/fluid/pybind/pybind_boost_headers.h"
+
 namespace paddle {
 namespace pybind {
 
@@ -30,21 +38,23 @@ void BindTracer(pybind11::module* m) {
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
               const imperative::VarBasePtrMap& inputs,
-              const imperative::VarBasePtrMap& outputs,
-              framework::BlockDesc* block,
+              imperative::VarBasePtrMap* outputs,
+              framework::AttributeMap attrs_map,
               const platform::CPUPlace expected_place,
               const bool stop_gradient = false) {
-             return self.Trace(op, inputs, outputs, block, expected_place,
+             pybind11::gil_scoped_release release;
+             return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                                stop_gradient);
            })
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
               const imperative::VarBasePtrMap& inputs,
-              const imperative::VarBasePtrMap& outputs,
-              framework::BlockDesc* block,
+              imperative::VarBasePtrMap* outputs,
+              framework::AttributeMap attrs_map,
               const platform::CUDAPlace expected_place,
               const bool stop_gradient = false) {
-             return self.Trace(op, inputs, outputs, block, expected_place,
+             pybind11::gil_scoped_release release;
+             return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                                stop_gradient);
            })
       .def("py_trace", &imperative::Tracer::PyTrace,
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
index 8c48b2a715..8496cbfcb1 100644
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
+#include <string>
 #include <vector>
 #include "paddle/fluid/imperative/layer.h"
 #include "pybind11/pybind11.h"
@@ -36,6 +37,8 @@ class Layer : public imperative::Layer {
 class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase {
  public:
   using imperative::OpBase::OpBase;  // Inherit constructors
+
+  PyOpBase(const std::string& name) : OpBase(name) {}
 };
 
 class PyVarBase : public imperative::VarBase {
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 68f74a8531..c69ccd5072 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -54,12 +55,14 @@ void BindGraph(py::module *m) {
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
       "`paddle::ir::Graph` for details.")
       .def(py::init<const ProgramDesc &>())
+      .def("clone", &Graph::Clone)
       .def("has", &Graph::Has)
       .def("get_int", &Graph::Get<int>)
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
-      .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
+      .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>,
+           return_value_policy::reference)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
       .def("set",
@@ -103,7 +106,8 @@ void BindGraph(py::module *m) {
       .def("retrieve_node", &Graph::RetrieveNode,
            return_value_policy::reference)
       .def("resolve_hazard", &Graph::ResolveHazard)
-      .def("origin_program_desc", &Graph::OriginProgram);
+      .def("origin_program_desc", &Graph::OriginProgram,
+           return_value_policy::reference);
 }
 
 void BindNode(py::module *m) {
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index e729be4a95..7b5e417504 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -23,97 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 
-// Cast boost::variant for PyBind.
-// Copy from
-// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
-namespace pybind11 {
-namespace detail {
-
-#if !defined(PYBIND11_HIDDEN)
-#ifdef _WIN32
-#define PYBIND11_HIDDEN __declspec(dllexport)
-#else
-#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
-#endif
-#endif
-
-// Can be replaced by a generic lambda in C++14
-struct PYBIND11_HIDDEN paddle_variant_caster_visitor
-    : public boost::static_visitor<handle> {
-  return_value_policy policy;
-  handle parent;
-
-  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
-      : policy(policy), parent(parent) {}
-
-  template <class T>
-  handle operator()(T const &src) const {
-    return make_caster<T>::cast(src, policy, parent);
-  }
-};
-
-template <class Variant>
-struct paddle_variant_caster;
-
-template <template <class...> class V, class... Ts>
-struct paddle_variant_caster<V<Ts...>> {
-  using Type = V<Ts...>;
-
-  template <typename T>
-  typename std::enable_if<
-      !std::is_same<T, boost::detail::variant::void_>::value, bool>::type
-  try_load(handle src, bool convert) {
-    auto caster = make_caster<T>();
-    if (!load_success_ && caster.load(src, convert)) {
-      load_success_ = true;
-
-      if (std::is_same<T, std::vector<float>>::value) {
-        auto caster_ints = make_caster<std::vector<int64_t>>();
-        if (caster_ints.load(src, convert)) {
-          VLOG(4) << "This value are floats and int64_ts satisfy "
-                     "simultaneously, will set it's type to "
-                     "std::vector<int64_t>";
-          value = cast_op<std::vector<int64_t>>(caster_ints);
-          return true;
-        }
-      }
-
-      value = cast_op<T>(caster);
-      return true;
-    }
-    return false;
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
-                          bool>::type
-  try_load(handle src, bool convert) {
-    return false;
-  }
-
-  bool load(handle src, bool convert) {
-    auto unused = {false, try_load<Ts>(src, convert)...};
-    (void)(unused);
-    return load_success_;
-  }
-
-  static handle cast(Type const &src, return_value_policy policy,
-                     handle parent) {
-    paddle_variant_caster_visitor visitor(policy, parent);
-    return boost::apply_visitor(visitor, src);
-  }
-
-  PYBIND11_TYPE_CASTER(Type, _("Variant"));
-  bool load_success_{false};
-};
-
-// Add specialization for concrete variant type
-template <class... Args>
-struct type_caster<boost::variant<Args...>>
-    : paddle_variant_caster<boost::variant<Args...>> {};
-
-}  // namespace detail
-}  // namespace pybind11
+#include "paddle/fluid/pybind/pybind_boost_headers.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index cf59ff6d3b..cef95de2ef 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/profiler.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -54,6 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
+#include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 
@@ -94,6 +96,14 @@ bool IsCompiledWithMKLDNN() {
 #endif
 }
 
+bool IsCompiledWithNGRAPH() {
+#ifndef PADDLE_WITH_NGRAPH
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -119,6 +129,11 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
   return paddle::platform::Place(p1) == paddle::platform::Place(p2);
 }
 
+template <typename PlaceType>
+static inline int PlaceIndex(const PlaceType &p) {
+  return static_cast<int>(paddle::platform::Place(p).which());
+}
+
 PYBIND11_MODULE(core, m) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
@@ -148,9 +163,20 @@ PYBIND11_MODULE(core, m) {
   m.def("print_mem_usage",
         []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
 
+  m.def("start_imperative_gperf_profiler",
+        []() { imperative::StartProfile(); });
+
+  m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
+
   py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
-      // .def(py::init<>())
-      .def(py::init<bool>(), py::arg("stop_gradient") = false)
+      .def(
+          py::init<const std::string &, paddle::framework::proto::VarType::Type,
+                   const std::vector<int64_t>, const paddle::platform::CPUPlace,
+                   bool, bool>())
+      .def(
+          py::init<const std::string &, paddle::framework::proto::VarType::Type,
+                   const std::vector<int64_t>,
+                   const paddle::platform::CUDAPlace, bool, bool>())
       .def("_run_backward",
            [](imperative::VarBase &self) { self.RunBackward(); })
       .def("_grad_name", &imperative::VarBase::GradName)
@@ -177,51 +203,21 @@ PYBIND11_MODULE(core, m) {
            py::return_value_policy::take_ownership)
       .def("value", [](const imperative::VarBase &self) { return self.var_; },
            py::return_value_policy::reference)
-      .def_property("name",
-                    [](const imperative::VarBase &self) { return self.name_; },
-                    [](imperative::VarBase &self, const std::string &name) {
-                      self.name_ = name;
-                    })
-      .def_property("block",
-                    [](const imperative::VarBase &self) { return self.block_; },
-                    [](imperative::VarBase &self, framework::BlockDesc *block) {
-                      self.block_ = block;
-                    },
-                    py::return_value_policy::reference)
-      .def_property(
-          "persistable",
-          [](const imperative::VarBase &self) { return self.persistable_; },
-          [](imperative::VarBase &self, const bool persistable) {
-            self.persistable_ = persistable;
-          })
-      .def_property(
-          "desc",
-          [](const imperative::VarBase &self) { return self.var_desc_; },
-          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
-            self.var_desc_ = var_desc;
-          },
-          py::return_value_policy::reference)
-      .def_property(
-          "stop_gradient",
-          [](const imperative::VarBase &self) { return self.IsStopGradient(); },
-          [](imperative::VarBase &self, bool stop_gradient) {
-            self.SetStopGradient(stop_gradient);
-          });
+      .def_property("name", &imperative::VarBase::Name,
+                    &imperative::VarBase::SetName)
+      .def_property_readonly("shape", &imperative::VarBase::Shape)
+      .def_property_readonly("dtype", &imperative::VarBase::DataType)
+      .def_property("persistable", &imperative::VarBase::IsPersistable,
+                    &imperative::VarBase::SetPersistable)
+      .def_property("stop_gradient", &imperative::VarBase::IsStopGradient,
+                    &imperative::VarBase::SetStopGradient);
 
   py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
-      .def(py::init<>())
+      .def(py::init<const std::string &>())
       .def("register_backward_hooks",
            [](imperative::OpBase &self, const py::object &callable) {
              self.RegisterBackwardHooks(callable);
            })
-      .def_property(
-          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
-          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
-            if (op_desc) {
-              self.op_desc_ = op_desc;
-            }
-          },
-          py::return_value_policy::reference)
       .def_property("_trace_id",
                     [](const imperative::OpBase &self) {
                       pybind11::gil_scoped_release release;
@@ -260,7 +256,17 @@ PYBIND11_MODULE(core, m) {
           "apply",
           [](int func_id, const std::vector<imperative::VarBase *> &inputs)
               -> std::vector<imperative::VarBase *> {
-                return imperative::PyLayer::Apply(func_id, inputs);
+                auto ret_vars = imperative::PyLayer::Apply(func_id, inputs);
+                std::vector<imperative::VarBase *> outputs;
+                outputs.reserve(ret_vars.size());
+                for (size_t i = 0U; i != ret_vars.size(); ++i) {
+                  framework::Variable *v = ret_vars[i];
+                  // TODO(minqiyang): use unique_name generator to set a name
+                  outputs.emplace_back(
+                      new imperative::VarBase("", v, nullptr, true));
+                }
+
+                return outputs;
               },
           py::return_value_policy::take_ownership)
       .def_static("register_func",
@@ -531,6 +537,7 @@ PYBIND11_MODULE(core, m) {
 
 All parameter, weight, gradient are variables in Paddle.
 )DOC")
+      .def(py::init<>())
       .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
       .def("set_int",
            [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
@@ -572,14 +579,13 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference);
 
-  py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("start", &framework::ReaderHolder::Start)
-      .def("reset", &framework::ReaderHolder::ResetAll);
+  BindReader(&m);
 
   using LoDTensorBlockingQueue =
       ::paddle::operators::reader::LoDTensorBlockingQueue;
   using LoDTensorBlockingQueueHolder =
       ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
+
   py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
       m, "LoDTensorBlockingQueue", "")
       .def("push",
@@ -776,6 +782,7 @@ All parameter, weight, gradient are variables in Paddle.
              PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
 #endif
            })
+      .def("_type", &PlaceIndex<platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
@@ -785,6 +792,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
       .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
@@ -800,6 +808,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
              new (&self) platform::CUDAPinnedPlace();
            })
+      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
@@ -811,16 +820,25 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("is_cpu_place",
+           [](platform::Place &self) { return platform::is_cpu_place(self); })
+      .def("is_cuda_pinned_place",
+           [](platform::Place &self) {
+             return platform::is_cuda_pinned_place(self);
+           })
       .def("gpu_device_id",
            [](platform::Place &self) {
              return boost::get<platform::CUDAPlace>(self).device;
            })
+      .def("set_place", [](platform::Place &self,
+                           const platform::Place &other) { self = other; })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;
@@ -876,9 +894,11 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<const platform::Place &>())
       .def("close", &Executor::Close)
       .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
-                     int block_id, bool create_local_scope, bool create_vars) {
+                     int block_id, bool create_local_scope, bool create_vars,
+                     const std::vector<std::string> &fetch_vars) {
         pybind11::gil_scoped_release release;
-        self.Run(prog, scope, block_id, create_local_scope, create_vars);
+        self.Run(prog, scope, block_id, create_local_scope, create_vars,
+                 fetch_vars);
       });
 
   m.def("init_gflags", framework::InitGflags);
@@ -886,6 +906,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices",
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
+  m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
@@ -1233,6 +1254,21 @@ All parameter, weight, gradient are variables in Paddle.
                       it will save GPU memory and may make the execution faster.
                       This options is only available in GPU devices.
                       Default False)DOC")
+      .def_property(
+          "sync_batch_norm",
+          [](const BuildStrategy &self) { return self.sync_batch_norm_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
+            self.sync_batch_norm_ = b;
+          },
+          R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
+                synchronous batch normalization which synchronizes the mean
+                and variance through multi-devices in training phase.
+
+                Current implementation doesn't support FP16 training and CPU.
+                And only synchronous on one machine, not all machines.
+
+                Default False)DOC")
       .def_property(
           "memory_optimize",
           [](const BuildStrategy &self) { return self.memory_optimize_; },
@@ -1245,6 +1281,10 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_inplace",
           [](const BuildStrategy &self) { return self.enable_inplace_; },
           [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
+      .def_property(
+          "fuse_all_reduce_ops",
+          [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; },
+          [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
       .def("_finalize_strategy_and_create_passes",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
              return self.CreatePassesFromStrategy(true);
@@ -1254,7 +1294,7 @@ All parameter, weight, gradient are variables in Paddle.
                 cannot be updated after being finalized.)DOC");
 
   pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &, const std::string &,
+                  const std::vector<std::string> &, const std::string &,
                   Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
                   const BuildStrategy &, ir::Graph *>())
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h
new file mode 100644
index 0000000000..70c3136d09
--- /dev/null
+++ b/paddle/fluid/pybind/pybind_boost_headers.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/platform/variant.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+// Cast boost::variant for PyBind.
+// Copy from
+// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
+namespace pybind11 {
+namespace detail {
+
+#if !defined(PYBIND11_HIDDEN)
+#ifdef _WIN32
+#define PYBIND11_HIDDEN __declspec(dllexport)
+#else
+#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
+#endif
+#endif
+
+// Can be replaced by a generic lambda in C++14
+struct PYBIND11_HIDDEN paddle_variant_caster_visitor
+    : public boost::static_visitor<handle> {
+  return_value_policy policy;
+  handle parent;
+
+  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
+      : policy(policy), parent(parent) {}
+
+  template <class T>
+  handle operator()(T const &src) const {
+    return make_caster<T>::cast(src, policy, parent);
+  }
+};
+
+template <class Variant>
+struct paddle_variant_caster;
+
+template <template <class...> class V, class... Ts>
+struct paddle_variant_caster<V<Ts...>> {
+  using Type = V<Ts...>;
+
+  template <typename T>
+  typename std::enable_if<
+      !std::is_same<T, boost::detail::variant::void_>::value, bool>::type
+  try_load(handle src, bool convert) {
+    auto caster = make_caster<T>();
+    if (!load_success_ && caster.load(src, convert)) {
+      load_success_ = true;
+
+      if (std::is_same<T, std::vector<float>>::value) {
+        auto caster_ints = make_caster<std::vector<int64_t>>();
+        if (caster_ints.load(src, convert)) {
+          VLOG(4) << "This value are floats and int64_ts satisfy "
+                     "simultaneously, will set it's type to "
+                     "std::vector<int64_t>";
+          value = cast_op<std::vector<int64_t>>(caster_ints);
+          return true;
+        }
+      }
+
+      value = cast_op<T>(caster);
+      return true;
+    }
+    return false;
+  }
+
+  template <typename T>
+  typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
+                          bool>::type
+  try_load(handle src, bool convert) {
+    return false;
+  }
+
+  bool load(handle src, bool convert) {
+    auto unused = {false, try_load<Ts>(src, convert)...};
+    (void)(unused);
+    return load_success_;
+  }
+
+  static handle cast(Type const &src, return_value_policy policy,
+                     handle parent) {
+    paddle_variant_caster_visitor visitor(policy, parent);
+    return boost::apply_visitor(visitor, src);
+  }
+
+  PYBIND11_TYPE_CASTER(Type, _("Variant"));
+  bool load_success_{false};
+};
+
+// Add specialization for concrete variant type
+template <class... Args>
+struct type_caster<boost::variant<Args...>>
+    : paddle_variant_caster<boost::variant<Args...>> {};
+
+}  // namespace detail
+}  // namespace pybind11
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
new file mode 100644
index 0000000000..af7d30552e
--- /dev/null
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -0,0 +1,161 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/reader_py.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include "paddle/fluid/platform/place.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+class MultiDeviceFeedReader {
+ public:
+  using ResultDictList =
+      std::vector<std::unordered_map<std::string, framework::LoDTensor>>;
+
+  MultiDeviceFeedReader(
+      const std::shared_ptr<operators::reader::LoDTensorBlockingQueue> &queue,
+      const std::vector<std::string> &names,
+      const std::vector<platform::Place> &dst_places, bool use_double_buffer)
+      : queue_(queue),
+        names_(names),
+        pool_(new ::ThreadPool(dst_places.size())) {
+    std::shared_ptr<framework::ReaderBase> reader(
+        new operators::reader::PyReader(queue));
+
+    readers_.reserve(dst_places.size());
+    for (auto &p : dst_places) {
+      auto *holder = new framework::ReaderHolder();
+      if (use_double_buffer) {
+        holder->Reset(
+            framework::MakeDecoratedReader<operators::reader::BufferedReader>(
+                reader, p, 2));
+      } else {
+        if (platform::is_gpu_place(p)) {
+          PADDLE_THROW(
+              "Place cannot be CUDAPlace when use_double_buffer is False");
+        }
+        holder->Reset(reader);
+      }
+      readers_.emplace_back(holder);
+    }
+
+    futures_.resize(dst_places.size());
+    ret_.resize(dst_places.size());
+    ReadAsync();
+  }
+
+  ResultDictList ReadNext() {
+    bool success = WaitFutures();
+
+    if (!success) {
+      return {};
+    }
+
+    ResultDictList result(ret_.size());
+    for (size_t i = 0; i < ret_.size(); ++i) {
+      for (size_t j = 0; j < names_.size(); ++j) {
+        result[i].emplace(names_[j], std::move(ret_[i][j]));
+      }
+    }
+    ReadAsync();
+    return result;
+  }
+
+  void Reset() {
+    Shutdown();
+    Start();
+    ReadAsync();
+  }
+
+  ~MultiDeviceFeedReader() {
+    queue_->Close();
+    pool_.reset();
+  }
+
+ private:
+  bool WaitFutures() {
+    bool success = true;
+    for (auto &f : futures_) {
+      success &= f.get();
+    }
+    return success;
+  }
+
+  void Shutdown() {
+    for (auto &r : readers_) r->Shutdown();
+  }
+
+  void Start() {
+    for (auto &r : readers_) r->Start();
+  }
+
+  void ReadAsync() {
+    for (size_t i = 0; i < readers_.size(); ++i) {
+      futures_[i] = pool_->enqueue([this, i] {
+        readers_[i]->ReadNext(&ret_[i]);
+        return !ret_[i].empty();
+      });
+    }
+  }
+
+  std::shared_ptr<operators::reader::LoDTensorBlockingQueue> queue_;
+  std::vector<std::string> names_;
+  std::unique_ptr<::ThreadPool> pool_;
+
+  std::vector<std::unique_ptr<framework::ReaderHolder>> readers_;
+
+  std::vector<std::future<bool>> futures_;
+  std::vector<std::vector<framework::LoDTensor>> ret_;
+};
+
+namespace py = pybind11;
+
+void BindReader(py::module *module) {
+  auto &m = *module;
+
+  namespace reader = ::paddle::operators::reader;
+
+  py::class_<framework::ReaderHolder>(m, "Reader", "")
+      .def("start", &framework::ReaderHolder::Start)
+      .def("reset", &framework::ReaderHolder::ResetAll);
+
+  py::class_<MultiDeviceFeedReader>(m, "MultiDeviceFeedReader", "")
+      .def("read_next", &MultiDeviceFeedReader::ReadNext,
+           py::call_guard<py::gil_scoped_release>())
+      .def("reset", &MultiDeviceFeedReader::Reset,
+           py::call_guard<py::gil_scoped_release>());
+
+  m.def("create_py_reader",
+        [](const std::shared_ptr<operators::reader::LoDTensorBlockingQueue>
+               &queue,
+           const std::vector<std::string> &names,
+           const std::vector<platform::Place> &dst_places,
+           bool use_double_buffer) {
+          return new MultiDeviceFeedReader(queue, names, dst_places,
+                                           use_double_buffer);
+        },
+        py::return_value_policy::take_ownership);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/pybind/reader_py.h
similarity index 53%
rename from paddle/fluid/inference/utils/visualizer.h
rename to paddle/fluid/pybind/reader_py.h
index be532f92cf..472ff65368 100644
--- a/paddle/fluid/inference/utils/visualizer.h
+++ b/paddle/fluid/pybind/reader_py.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,29 +14,12 @@
 
 #pragma once
 
-#include <string>
-#include "paddle/fluid/inference/analysis/argument.h"
+#include "pybind11/pybind11.h"
 
 namespace paddle {
-namespace inference {
-namespace utils {
+namespace pybind {
 
-using paddle::inference::analysis::Argument;
+void BindReader(pybind11::module *module);
 
-class Visualizer final {
- public:
-  Visualizer() = default;
-  ~Visualizer() = default;
-  Visualizer(const Visualizer &) = delete;
-  Visualizer &operator=(const Visualizer &) = delete;
-
-  void SetArgument(Argument *);
-  bool Run();
-
- private:
-  Argument *argument_;
-};
-
-}  // namespace utils
-}  // namespace inference
+}  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9899eee884..025528e85c 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -453,19 +453,24 @@ function assert_api_spec_approvals() {
       echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
       if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
           # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
+          # approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
           if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
             APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
+            if [ "${APPROVALS}" == "TRUE" ];then
+              APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+              python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
+            fi
           else
             APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
           fi
           echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
           if [ "${APPROVALS}" == "FALSE" ]; then
             if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
-              echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
+              echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
             else
-              echo "You must have panyx0718 approval for the api change! ${API_FILE}"
+              echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
             fi
             exit 1
           fi
@@ -475,10 +480,10 @@ function assert_api_spec_approvals() {
     HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
     if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
         APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have panyx0718 approval for the const_cast"
+            echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
             exit 1
         fi
     fi
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index e91fa92924..614a358615 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("initial_gpu_memory_in_mb");
+  envs.push_back("reallocate_gpu_memory_in_mb");
   envs.push_back("allocator_strategy");
 #elif __clang__
   envs.push_back("use_mkldnn");
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 57c5e83c82..5728a37fc3 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -46,9 +46,9 @@ import six
 from six.moves import cPickle as pickle
 __all__ = ['train', 'test', 'valid']
 
-DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
-LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
-SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
 DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index adc0c1aac8..450f159f9d 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -15,7 +15,7 @@
 WMT14 dataset.
 The original WMT14 dataset is too large and a small set of data for set is
 provided. This module will download dataset from
-http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse training set and test set into paddle reader creators.
 
 """
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 8102732c55..63b7b28948 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -125,14 +125,16 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
     sysstr = platform.system()
     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph',
+        'check_nan_inf', 'benchmark', 'eager_delete_scope',
         'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
         'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
-        'fast_eager_deletion_mode', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
-        'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism',
-        'enable_parallel_graph', 'multiple_of_cupti_buffer_size',
-        'enable_subgraph_optimize'
+        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
+        'allocator_strategy', 'reader_queue_speed_test_mode',
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
+        'inner_op_parallelism', 'enable_parallel_graph',
+        'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size',
+        'enable_subgraph_optimize', 'fuse_parameter_memory_size',
+        'tracer_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -143,6 +145,9 @@ def __bootstrap__():
     if core.is_compiled_with_mkldnn():
         read_env_flags.append('use_mkldnn')
 
+    if core.is_compiled_with_ngraph():
+        read_env_flags.append('use_ngraph')
+
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
         read_env_flags.append('rpc_server_profile_path')
@@ -158,7 +163,8 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
             'sync_nccl_allreduce', 'limit_of_tmp_allocation',
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 4441481093..ac2a40a7c2 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -17,9 +17,10 @@ import os
 import six
 import sys
 from .. import compat as cpt
+from . import framework
+from .framework import cuda_places, cpu_places
 
 from . import core
-from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -44,21 +45,6 @@ def _is_pserver_mode(main_program):
     return False
 
 
-def get_available_places(use_cuda):
-    if use_cuda:
-        gpus_env = os.getenv("FLAGS_selected_gpus")
-        if gpus_env:
-            gpus = [int(s) for s in gpus_env.split(",")]
-        else:
-            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
-        places = [core.CUDAPlace(i) for i in gpus]
-    else:
-        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-    assert places, "no place for execution"
-    return places
-
-
 class CompiledProgram(object):
     """
     Compiles to Graph for execution.
@@ -117,7 +103,8 @@ class CompiledProgram(object):
                            loss_name=None,
                            build_strategy=None,
                            exec_strategy=None,
-                           share_vars_from=None):
+                           share_vars_from=None,
+                           places=None):
         """Configs the program to run in data parallel way.
 
         Args:
@@ -132,10 +119,18 @@ class CompiledProgram(object):
                 threads are used, how many iterations to clean up the temp
                 variables. For more information, please refer
                 to fluid.ExecutionStrategy. Default None.
-            share_vars_from(CompiledProgram): If provide, this CompiledProgram
+            share_vars_from(CompiledProgram): If provided, this CompiledProgram
                 will share variables from `share_vars_from`. `share_vars_from`
                 must be run by the executor before this CompiledProgram so that
                 vars are ready.
+            places(list(CUDAPlace)|list(CPUPlace)|None): If provided, only compile
+                program in the given places. Otherwise, the places used when compiled 
+                is determined by the Executor, and the places used are controlled 
+                by environment variables: FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES
+                if using GPU; or CPU_NUM if using CPU. For example, if you want to 
+                run on GPU 0 and 1, set places=[fluid.CUDAPlace(0), fluid.CUDAPlace(1)].
+                If you want to run on 2 CPU cores, set places=[fluid.CPUPlace()]*2.  
+
         Returns:
             self
         """
@@ -150,6 +145,12 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        if places is not None:
+            if not isinstance(places, (list, tuple)):
+                places = [places]
+            self._places = places
+        else:
+            self._places = None
         self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
@@ -192,7 +193,15 @@ class CompiledProgram(object):
             self._local_scopes = []
 
         self._exec_strategy.use_cuda = use_cuda
-        self._places = get_available_places(self._exec_strategy.use_cuda)
+        has_set_place = (self._places is not None)
+        if has_set_place:
+            for p in self._places:
+                assert p._type() == self._place._type(), \
+                    "Place type not match. You may set the wrong type of places"
+        else:
+            self._places = cuda_places(
+            ) if self._exec_strategy.use_cuda else cpu_places()
+        assert self._places, "no place for execution"
 
         if self._exec_strategy.num_threads == 0:
             if self._exec_strategy.use_cuda:
@@ -200,9 +209,7 @@ class CompiledProgram(object):
                 # performance. Worth tunning for other models in the future.
                 self._exec_strategy.num_threads = len(self._places) * 4
             else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                self._exec_strategy.num_threads = cpu_num * 2
+                self._exec_strategy.num_threads = len(self._places) * 2
 
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
@@ -223,22 +230,27 @@ class CompiledProgram(object):
                 tps), "num_trainers == len(end_points)"
             self._build_strategy.trainers_endpoints = tps
 
+        if self._build_strategy.sync_batch_norm:
+            self._build_strategy.enable_sequential_execution = True
+
         self._persistable_vars = []
-        for block_id in range(self._program_desc.num_blocks()):
-            bdesc = self._program_desc.block(block_id)
-            self._persistable_vars.extend([
-                cpt.to_text(v.name()) for v in bdesc.all_vars()
-                if v.persistable() and v.type() != core.VarDesc.VarType.RAW
-            ])
+        for node in self._graph.nodes():
+            if node.is_var() and node.var() is not None and node.var().persistable() and \
+                    node.var().type() != core.VarDesc.VarType.RAW:
+                self._persistable_vars.append(cpt.to_text(node.name()))
 
         places = list(map(_place_obj, self._places))
-
-        return core.ParallelExecutor(places,
-                                     set(self._persistable_vars),
-                                     cpt.to_text(self._loss_name)
-                                     if self._loss_name else six.u(''), scope,
-                                     self._local_scopes, self._exec_strategy,
-                                     self._build_strategy, self._graph)
+        # ParallelExecutor would broadcast all the parameters during initializing.
+        # The parameters of each process should be in the same ordered for the data-parallelism
+        # distributed training to keep the broadcast correct.
+        self._persistable_vars = list(set(self._persistable_vars))
+        self._persistable_vars.sort()
+
+        return core.ParallelExecutor(
+            places, self._persistable_vars,
+            cpt.to_text(self._loss_name)
+            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            self._exec_strategy, self._build_strategy, self._graph)
 
     def _compile_inference(self):
         return core.create_paddle_predictor(self._infer_config)
diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md
index 460ae393f1..55a21ed1c5 100644
--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -45,28 +45,41 @@ You can load INT8 model by load_inference_model [API](https://github.com/PaddleP
 ```
 
 ## 3. Result
-We provide the results of accuracy measurd on [Intel® Xeon® Platinum Gold Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake6148).
+We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 (single core).
+
+**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
 
 | Model  | Dataset  | FP32 Accuracy  | INT8 Accuracy  | Accuracy Diff  |
-| ------------ | ------------ | ------------ | ------------ | ------------ |
-| ResNet-50  | Small  | 72.00%  | 72.00%  |  0.00% |
-| MobileNet-V1  | Small  | 62.00%  | 62.00%  | 0.00%  |
-| ResNet-50  | Full ImageNet Val  |  76.63%  | 76.17%  | 0.46% |
-| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.49%  | 0.29%  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  76.63%  | 76.23%  | 0.40% |
+| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.47%  | 0.31%  |
+
+**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
+
+| Model  | Dataset  | FP32 Throughput  | INT8 Throughput  |  Ratio(INT8/FP32)  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  11.54 images/s | 32.2 images/s | 2.79 |
+| MobileNet-V1 | Full ImageNet Val  | 49.21 images/s | 108.37 images/s | 2.2  |
 
-Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset"). 
+Please note that [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset") can be downloaded by script `test_calibration.py` with `DATASET=full`. 
 
 Notes:
 * The accuracy measurement requires the model with `label`.
-* The INT8 theoretical speedup is ~1.33X on Intel® Xeon® Skylake Server (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in  [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")).
+* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `providing a theoretical peak compute gain of 4x int8 OPS over fp32 OPS` in  [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). However, the actual test results at the model level will be less than 4X, and in general the average is about 2X. In addition, the calculation library optimization of batch size 1 is not as good as the large batch size.
 
 ## 4. How to reproduce the results
-* Small dataset
+* Small dataset (Single core)
 ```bash
 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
 
-* Full dataset
+* Full dataset (Single core)
 ```bash
 FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
+
+* Full dataset (Multi-core)
+```bash
+FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+```
+> Notes: This is an example command with 20 cores by using set `OMP_NUM_THREADS` value.
diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
index 032d0353ea..8eddf18cec 100644
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
@@ -84,7 +84,8 @@ class QuantizeTranspiler(object):
                  activation_bits=8,
                  activation_quantize_type='abs_max',
                  weight_quantize_type='abs_max',
-                 window_size=10000):
+                 window_size=10000,
+                 moving_rate=0.9):
         """
         Convert and rewrite the fluid Program according to weight and
         activation quantization type.
@@ -117,23 +118,27 @@ class QuantizeTranspiler(object):
         """
         self.weight_bits = weight_bits
         self.activation_bits = activation_bits
-        quant_type = ['abs_max', 'range_abs_max']
+        quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
+                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
+                str(weight_quantize_type))
         if activation_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown activation_quantize_type : '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
+                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
+                str(activation_quantize_type))
 
         self.weight_quantize_type = weight_quantize_type
         self.activation_quantize_type = activation_quantize_type
 
         self.window_size = window_size
+        self.moving_rate = moving_rate
         self.helper = LayerHelper(self.__class__.__name__)
         self.fake_quant_op_types = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max'
         ]
         self.fake_dequant_op_types = ['fake_dequantize_max_abs']
         self.is_test = None
@@ -168,6 +173,7 @@ class QuantizeTranspiler(object):
             block_id = block.idx
             # insert quant op and dequant op
             for name in op.input_arg_names:
+                #if share input between ops
                 if name in dequanted_vars[block_id]:
                     dequant_var = dequanted_vars[block_id][name]
                 else:
@@ -261,6 +267,7 @@ class QuantizeTranspiler(object):
             max_range = None
             scale_var = None
             for name in op.input_arg_names:
+                #rename input name of the op to the input name of last op which has be removed
                 if name in op_in_rename_map[block_id]:
                     op._rename_input(name, op_in_rename_map[block_id][name])
 
@@ -272,8 +279,7 @@ class QuantizeTranspiler(object):
                     max_range = param_range * act_range / scale_v
                 else:
                     assert isinstance(scale_v, Variable)
-                    scale_var = var_scale_map[block_id][_original_var_name(
-                        name)]
+                    scale_var = scale_v
 
             if len(op.output_arg_names) != 1:
                 raise ValueError("Only support one output, but op %s has"
@@ -309,7 +315,7 @@ class QuantizeTranspiler(object):
                 op_type = op.type
 
                 # insert dequant_op after fc/conv, need to rename
-                # input of the followed ops
+                # input of the followed ops(of fc/conv) to the dquant_op
                 for name in op.input_arg_names:
                     if name in op_out_rename_map[block_id]:
                         op._rename_input(name,
@@ -389,8 +395,8 @@ class QuantizeTranspiler(object):
             for op in block.ops:
                 args += op.input_arg_names
                 args += op.output_arg_names
-            args = list(set(args))
-            var_names = block.vars.keys()
+            args = list(set(args))  #vals of all left ops
+            var_names = block.vars.keys()  # all vals
             sub_block_remove_vars = []
             for var in var_names:
                 if var not in args:
@@ -471,6 +477,61 @@ class QuantizeTranspiler(object):
 
         return quant_var, scale
 
+    def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
+                                                quant_bits):
+        """Insert fake_quantize_moving_average_abs_max
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        state = self.helper.create_global_variable(
+            name=unique_name.generate('state'),
+            persistable=True,
+            dtype=var.dtype,
+            shape=[1])
+        self.helper.set_variable_initializer(
+            state, initializer=Constant(value=1))
+        accum = self.helper.create_global_variable(
+            name=unique_name.generate('accum'),
+            persistable=True,
+            dtype=var.dtype,
+            shape=[1])
+        self.helper.set_variable_initializer(
+            accum, initializer=Constant(value=1))
+        scale = self.helper.create_parameter(
+            attr=ParamAttr(
+                name=_quantized_scale_name(var.name),
+                initializer=Constant(0.001),
+                trainable=False),
+            shape=[1],
+            dtype=var.dtype)
+        scale.stop_gradient = True
+
+        ins = {'X': var, 'InScale': scale}
+        outs = {'Out': quant_var, 'OutScale': scale}
+        if not self.is_test:
+            ins['InState'] = state
+            ins['InAccum'] = accum
+            outs['OutState'] = state
+            outs['OutAccum'] = accum
+
+        attrs = {
+            'bit_length': quant_bits,
+            'moving_rate': self.moving_rate,
+            'is_test': self.is_test
+        }
+
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_moving_average_abs_max',
+            attrs=attrs,
+            inputs=ins,
+            outputs=outs)
+
+        return quant_var, scale
+
     def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
         """
         Insert fake_quantize_op
@@ -480,6 +541,9 @@ class QuantizeTranspiler(object):
         elif quant_type == 'range_abs_max':
             return self._insert_quant_range_abs_max_op(block, idx, var,
                                                        quant_bits)
+        elif quant_type == 'moving_average_abs_max':
+            return self._insert_quant_moving_average_abs_max_op(block, idx, var,
+                                                                quant_bits)
 
     def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
         """
diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py
index 22dbf7c8b6..4a71fab6d0 100644
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ b/python/paddle/fluid/contrib/slim/__init__.py
@@ -13,13 +13,4 @@
 # limitations under the License.
 
 from .core import *
-from .graph import *
-from .prune import *
-__all__ = [
-    'build_compressor',
-    'CompressPass',
-    'ImitationGraph',
-    'SensitivePruneStrategy',
-    'MagnitudePruner',
-    'RatioPruner',
-]
+__all__ = ['Compressor', ]
diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py
index 7826d5830a..831bd70ecc 100644
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
@@ -14,11 +14,9 @@
 
 from . import config
 from .config import *
-from . import compress_pass
-from .compress_pass import *
+from . import compressor
+from .compressor import *
 from . import strategy
 from .strategy import *
-from . import pass_builder
-from .pass_builder import *
 
-__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
+__all__ = config.__all__ + compressor.__all__ + strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py
deleted file mode 100644
index c4c348b878..0000000000
--- a/python/paddle/fluid/contrib/slim/core/compress_pass.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ....core import CPUPlace
-from ..graph import get_executor
-
-__all__ = ['Context', 'CompressPass']
-
-
-class Context(object):
-    """
-    The context in the process of compression.
-    Args:
-        exe: The executor used to execute graph.
-        graph: The graph to be compressed.
-        scope: The scope used to execute graph.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self, exe, graph, scope, program_exe=None):
-        # The total number of epoches to be trained.
-        self.epoch = 0
-        # Current epoch
-        self.epoch_id = 0
-        # Current batch
-        self.batch_id = 0
-        self.exe = exe
-        self.graph = graph
-        self.scope = scope
-        self.program_exe = program_exe
-
-
-class CompressPass(object):
-    """
-    The pass used to compress model.
-    Args:
-        place: The device used in compression.
-        data_reader: The data_reader used to run graph.
-        data_feeder: The data_feeder used to run graph.
-        scope: The scope used to run graph.
-        metrics: The metrics for evaluating model.
-        epoch: The total epoches of trainning in compression.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self,
-                 place=None,
-                 data_reader=None,
-                 data_feeder=None,
-                 scope=None,
-                 metrics=None,
-                 epoch=None,
-                 program_exe=None):
-        self.strategies = []
-        self.place = CPUPlace() if place is None else place
-        self.data_reader = data_reader
-        self.data_feeder = data_feeder
-        self.scope = scope
-        self.metrics = metrics
-        self.epoch = epoch
-        self.program_exe = program_exe
-
-    def add_strategy(self, strategy):
-        """
-        Add a strategy to current compress pass.
-        Args:
-            strategy: The strategy to be added into current compress pass.
-        """
-        self.strategies.append(strategy)
-        self.epoch = max(strategy.end_epoch, self.epoch)
-
-    def apply(self, graph):
-        """
-        Compress a model.
-        Args:
-            graph: The target graph to be compressed.
-        """
-        self.executor = get_executor(graph, self.place)
-        context = Context(
-            self.executor, graph, self.scope, program_exe=self.program_exe)
-
-        for strategy in self.strategies:
-            strategy.on_compress_begin(context)
-
-        for epoch in range(self.epoch):
-
-            for strategy in self.strategies:
-                strategy.on_epoch_begin(context)
-
-            for data in self.data_reader():
-
-                for strategy in self.strategies:
-                    strategy.on_batch_begin(context)
-                fetches = None
-                if self.metrics:
-                    fetches = self.metrics.values()
-                feed = None
-                if self.data_feeder:
-                    feed = self.data_feeder.feed(data)
-                results = self.executor.run(graph,
-                                            fetches=fetches,
-                                            scope=self.scope,
-                                            feed=feed)
-                if results:
-                    print("results: {}".format(
-                        zip(self.metrics.keys(), results)))
-                for strategy in self.strategies:
-                    strategy.on_batch_end(context)
-                context.batch_id += 1
-
-            for strategy in self.strategies:
-                strategy.on_epoch_end(context)
-            context.epoch_id += 1
-
-        for strategy in self.strategies:
-            strategy.on_compress_end(context)
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py
new file mode 100644
index 0000000000..1547b6abbe
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/compressor.py
@@ -0,0 +1,481 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core import CPUPlace
+from .... import compiler
+from .... import io
+from .... import profiler
+from .... import scope_guard
+from ....data_feeder import DataFeeder
+from ..graph import *
+from .config import ConfigFactory
+import numpy as np
+from collections import Iterable
+import time
+import os
+import logging
+import sys
+import pickle
+import functools
+
+__all__ = ['Context', 'Compressor']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+def cached_reader(reader, sampled_rate, cache_path, cached_id):
+    """
+    Sample partial data from reader and cache them into local file system.
+    Args:
+        reader: Iterative data source.
+        sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
+        cache_path(str): The path to cache the sampled data.
+        cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
+    """
+    np.random.seed(cached_id)
+    cache_path = os.path.join(cache_path, str(cached_id))
+    _logger.debug('read data from: {}'.format(cache_path))
+
+    def s_reader():
+        if os.path.isdir(cache_path):
+            for file_name in open(os.path.join(cache_path, "list")):
+                yield np.load(os.path.join(cache_path, file_name.strip()))
+        else:
+            os.makedirs(cache_path)
+            list_file = open(os.path.join(cache_path, "list"), 'w')
+            batch = 0
+            dtype = None
+            for data in reader():
+                if batch == 0 or (np.random.uniform() < sampled_rate):
+                    np.save(
+                        os.path.join(cache_path, 'batch' + str(batch)), data)
+                    list_file.write('batch' + str(batch) + '.npy\n')
+                    batch += 1
+                    yield data
+
+    return s_reader
+
+
+class Context(object):
+    """
+    The context in the process of compression.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_graph=None,
+                 train_reader=None,
+                 eval_graph=None,
+                 eval_reader=None,
+                 teacher_graphs=None,
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place: The device place where the compression job running.
+            scope: The scope used in compression job.
+            train_graph: The graph with loss as output node.
+            eval_graph: The graph used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            teacher_graphs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies.
+        """
+        # The total number of epoches to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+
+        self.k_v = {}
+
+        self.place = place
+        self.scope = scope
+        self.train_graph = train_graph
+        self.train_reader = train_reader
+        self.eval_graph = eval_graph
+        self.eval_reader = eval_reader
+        self.executor = None
+        self.teacher_graphs = teacher_graphs
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.optimize_graph = None
+        self.cache_path = './eval_cache'
+        self.eval_results = {}
+
+    def to_file(self, file_name):
+        """
+        Save the context into file.
+        """
+        data = {}
+        data['epoch_id'] = self.epoch_id
+        data['eval_results'] = self.eval_results
+        with open(file_name, 'wb') as context_file:
+            pickle.dump(data, context_file)
+
+    def from_file(self, file_name):
+        """
+        Load the context from file.
+        """
+        with open(file_name) as context_file:
+            if sys.version_info < (3, 0):
+                data = pickle.load(context_file)
+            else:
+                data = pickle.load(context_file, encoding='bytes')
+            self.epoch_id = data['epoch_id']
+            self.eval_results = data['eval_results']
+
+    def eval_converged(self, metric_name, delta=0.001):
+        """
+        Check whether the training has been converged.
+        Args:
+            metric_name(str): The metric used to check convergence.
+            delta(float): '(metric[k] - metric[k-1] / metric[k-1]) < delta'
+                          means that the training has been converged.
+        Returns:
+            bool: True means the training has been converged.
+        """
+        # TODO(wanghaoshuang@baidu.com): enhence this method.
+        if (metric_name not in self.eval_results
+            ) or len(self.eval_results[metric_name]) < 2:
+            return False
+        results = self.eval_results[metric_name][-2:]
+        _logger.info('Latest evaluations: {}'.format(results))
+        return abs(results[1] - results[0]) / results[0] < delta
+
+    def run_eval_graph(self, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current mode in context.
+        Args:
+            sampled_rate(float): The sampled rate used to sample partial data
+            for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same
+                            cached_id use the same sampled dataset. default: 0.
+        """
+        _logger.info('Running evaluation')
+        assert self.eval_graph is not None
+        assert self.eval_reader is not None
+        eval_graph = self.eval_graph.clone(for_test=True)
+
+        executor = SlimGraphExecutor(self.place)
+        results = []
+        batch_id = 0
+        s_time = time.time()
+        reader = self.eval_reader
+        if sampled_rate:
+            reader = cached_reader(reader, sampled_rate, self.cache_path,
+                                   cached_id)
+        for data in reader():
+            result = executor.run(eval_graph, self.scope, data=data)
+            result = [np.mean(r) for r in result]
+            results.append(result)
+            if batch_id % 20 == 0:
+                _logger.info("batch-{}; {}={}".format(
+                    batch_id, eval_graph.out_nodes.keys(), result))
+            batch_id += 1
+        result = np.mean(np.array(results), axis=0)
+        _logger.info("Final eval result: {}={}".format(
+            eval_graph.out_nodes.keys(), result))
+        if not isinstance(result, Iterable):
+            result = [result]
+        _logger.info('Finish evaluation')
+        return result, eval_graph.out_nodes.keys()
+
+    def put(self, key, value):
+        self.k_v[key] = value
+
+    def get(self, key):
+        return self.k_v.get(key)
+
+
+class Compressor(object):
+    """
+    The pass used to compress model.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_program,
+                 train_reader=None,
+                 train_feed_list=None,
+                 train_fetch_list=None,
+                 eval_program=None,
+                 eval_reader=None,
+                 eval_feed_list=None,
+                 eval_fetch_list=None,
+                 teacher_programs=[],
+                 checkpoint_path='./checkpoints',
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place(fluid.Place): The device place where the compression job running.
+            scope(fluid.core.Scope): The scope used to run graph.
+            train_program(Program): The main program to be compressed. It must have loss op.
+            train_reader: The data reader used for training.
+            train_feed_list(dict): A dict to indicate the input variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            train_fetch_list(dict): A dict to indicate the output variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_program(Program): The program used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            eval_feed_list(dict): A dict to indicate the input variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            teacher_programs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy,
+                                 this optimizer is used to minimize the combined loss of student-net and
+                                 teacher-net while train_optimizer is used to minimize loss of
+                                 student-net in fine-tune stage. 
+
+        """
+        assert isinstance(
+            train_feed_list, list
+        ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        assert isinstance(
+            eval_feed_list, list
+        ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        self.strategies = []
+        self.epoch = 0
+        self.place = CPUPlace() if place is None else place
+        self.scope = scope
+        self.train_graph = GraphWrapper(
+            train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list)
+        self.eval_graph = GraphWrapper(
+            eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list)
+        self.train_reader = train_reader
+        self.eval_reader = eval_reader
+        self.teacher_graphs = []
+        for teacher in teacher_programs:
+            self.teacher_graphs.append(GraphWrapper(teacher))
+
+        self.checkpoint = None
+        self.checkpoint_path = checkpoint_path
+        self.eval_epoch = 1
+
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.init_model = None
+
+    def _add_strategy(self, strategy):
+        """
+        Add a strategy to current compress pass.
+        Args:
+            strategy: The strategy to be added into current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+
+    def config(self, config_file):
+        """
+        Configure the compress pass from file with yaml format.
+        Args:
+            config_file(str): The config file in local file system.
+        """
+        factory = ConfigFactory(config_file)
+        self.epoch = factory.compressor['epoch']
+        for strategy in factory.compressor['strategies']:
+            self._add_strategy(strategy)
+        if 'checkpoint_path' in factory.compressor:
+            self.checkpoint_path = factory.compressor['checkpoint_path']
+
+        if 'init_model' in factory.compressor:
+            self.init_model = factory.compressor['init_model']
+
+    def _init_model(self, context):
+        """
+        Load model that has been compressed. 
+        """
+        if self.init_model and os.path.exists(self.init_model):
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.train_graph.load_persistables(self.init_model, exe)
+            flops = context.eval_graph.flops()
+            conv_flops = context.eval_graph.flops(only_conv=True)
+            context.eval_graph.update_param_shape(context.scope)
+            context.eval_graph.update_groups_of_conv()
+            _logger.info("conv flops: -{}".format(1 - float(
+                context.eval_graph.flops(only_conv=True)) / conv_flops))
+            _logger.info("total flops: -{}".format(1 - float(
+                context.eval_graph.flops()) / flops))
+            context.train_graph.update_param_shape(context.scope)
+            context.train_graph.update_groups_of_conv()
+            context.train_graph.infer_shape()
+            _logger.info("Init model from: {}".format(self.init_model))
+
+    def _load_checkpoint(self, context):
+        """
+        Load checkpoints from file.
+        """
+        _logger.debug('_load_checkpoint')
+        strategies = self.strategies
+        if self.checkpoint_path:
+            if not os.path.exists(self.checkpoint_path):
+                _logger.warning("Checkpints path doesn't exist: [{}]".format(
+                    self.checkpoint_path))
+                return context, strategies
+            checkpoints = [
+                dir for dir in os.listdir(self.checkpoint_path)
+                if os.path.isdir(os.path.join(self.checkpoint_path, dir))
+            ]
+            _logger.debug('self.checkpoint_path: {}'.format(
+                self.checkpoint_path))
+            _logger.info('checkpoints: {}'.format(checkpoints))
+            if len(checkpoints) > 0:
+                latest = max([int(ck) for ck in checkpoints])
+                latest_ck_path = os.path.join(self.checkpoint_path, str(latest))
+
+                model_path = os.path.join(latest_ck_path, 'model')
+                context_path = os.path.join(latest_ck_path, 'context')
+                strategy_path = os.path.join(latest_ck_path, 'strategies')
+                if os.path.exists(context_path):
+                    context.from_file(context_path)
+                    context.epoch_id += 1
+                if os.path.exists(strategy_path):
+                    with open(strategy_path, 'rb') as strategy_file:
+                        if sys.version_info < (3, 0):
+                            strategies = pickle.load(strategy_file)
+                        else:
+                            strategies = pickle.load(
+                                strategy_file, encoding='bytes')
+
+                if os.path.exists(model_path):
+                    exe = SlimGraphExecutor(context.place)
+                    with scope_guard(context.scope):
+                        context.optimize_graph.load_persistables(model_path,
+                                                                 exe)
+                    context.optimize_graph.update_param_shape(context.scope)
+                    context.optimize_graph.update_groups_of_conv()
+                    context.eval_graph.update_param_shape(context.scope)
+                    context.eval_graph.update_groups_of_conv()
+                    _logger.info("Loaded params from: {}".format(model_path))
+        return context, strategies
+
+    def _save_checkpoint(self, context):
+        """
+        Save checkpoints to file.
+        """
+        if context.epoch_id % 1 == 0 and self.checkpoint_path:
+            checkpoint_path = os.path.join(self.checkpoint_path,
+                                           str(context.epoch_id))
+            model_path = os.path.join(checkpoint_path, 'model')
+            context_path = os.path.join(checkpoint_path, 'context')
+            strategy_path = os.path.join(checkpoint_path, 'strategies')
+            if not os.path.isdir(model_path):
+                os.makedirs(model_path)
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.optimize_graph.save_persistables(model_path, exe)
+            context.to_file(context_path)
+            with open(strategy_path, 'wb') as strategy_file:
+                pickle.dump(self.strategies, strategy_file)
+            _logger.info('Saved checkpoint to: {}'.format(checkpoint_path))
+
+    def _train_one_epoch(self, context):
+        """
+        Train one epoch.
+        """
+
+        executor = SlimGraphExecutor(self.place)
+
+        if context.optimize_graph.compiled_graph is None:
+            context.optimize_graph.compiled_graph = compiler.CompiledProgram(
+                context.optimize_graph.program).with_data_parallel(
+                    loss_name=context.optimize_graph.out_nodes['loss'])
+
+        for data in context.train_reader():
+            for strategy in self.strategies:
+                strategy.on_batch_begin(context)
+            results = executor.run(context.optimize_graph,
+                                   context.scope,
+                                   data=data)
+            results = [float(np.mean(result)) for result in results]
+            if context.batch_id % 20 == 0:
+                _logger.info("epoch:{}; batch_id:{}; {} = {}".format(
+                    context.epoch_id, context.batch_id,
+                    context.optimize_graph.out_nodes.keys(
+                    ), [round(r, 3) for r in results]))
+            for strategy in self.strategies:
+                strategy.on_batch_end(context)
+            context.batch_id += 1
+        context.batch_id = 0
+
+    def _eval(self, context):
+        """
+        Runing evaluation.
+        """
+        results, names = context.run_eval_graph()
+        for name, result in zip(names, results):
+            if name not in context.eval_results:
+                context.eval_results[name] = []
+            context.eval_results[name].append(result)
+
+    def run(self):
+        """
+        Execute compressiong pass.
+        """
+        context = Context(
+            place=self.place,
+            scope=self.scope,
+            train_graph=self.train_graph,
+            train_reader=self.train_reader,
+            eval_graph=self.eval_graph,
+            eval_reader=self.eval_reader,
+            teacher_graphs=self.teacher_graphs,
+            train_optimizer=self.train_optimizer,
+            distiller_optimizer=self.distiller_optimizer)
+        self.context = context
+        if self.teacher_graphs:
+            context.put('teachers', self.teacher_graphs)
+        self._init_model(context)
+        if not context.optimize_graph:
+            if context.train_optimizer:
+                context.train_optimizer._name = 'train_opt'
+                context.optimize_graph = context.train_graph.get_optimize_graph(
+                    context.train_optimizer, context.place, context.scope)
+            else:
+                context.optimize_graph = context.train_graph
+
+        context, self.strategies = self._load_checkpoint(context)
+
+        for strategy in self.strategies:
+            strategy.on_compression_begin(context)
+        start = context.epoch_id
+        self._eval(context)
+        for epoch in range(start, self.epoch):
+            context.epoch_id = epoch
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+            self._train_one_epoch(context)
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            if self.eval_epoch and epoch % self.eval_epoch == 0:
+                self._eval(context)
+            self._save_checkpoint(context)
+        for strategy in self.strategies:
+            strategy.on_compression_end(context)
+        return context.eval_graph
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
index 811c457003..9bb395aee9 100644
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ b/python/paddle/fluid/contrib/slim/core/config.py
@@ -17,8 +17,9 @@ import funcsigs
 import yaml
 from collections import OrderedDict
 from ..prune import *
-from .compress_pass import *
+from ..quantization import *
 from .strategy import *
+from ..distillation import *
 
 __all__ = ['ConfigFactory']
 """This factory is used to create instances by loading and parsing configure file with yaml format.
@@ -29,15 +30,10 @@ class ConfigFactory(object):
     def __init__(self, config):
         """Init a factory from configure file."""
         self.instances = {}
+        self.compressor = {}
         self.version = None
         self._parse_config(config)
 
-    def get_compress_pass(self):
-        """
-        Get compress pass from factory.
-        """
-        return self.instance('compress_pass')
-
     def instance(self, name):
         """
         Get instance from factory.
@@ -59,8 +55,16 @@ class ConfigFactory(object):
             args = {}
             for key in keys:
                 value = attrs[key]
+                if isinstance(value, str) and value.lower() == 'none':
+                    value = None
                 if isinstance(value, str) and value in self.instances:
                     value = self.instances[value]
+                if isinstance(value, list):
+                    for i in range(len(value)):
+                        if isinstance(value[i],
+                                      str) and value[i] in self.instances:
+                            value[i] = self.instances[value[i]]
+
                 args[key] = value
             self.instances[name] = class_(**args)
         return self.instances.get(name)
@@ -76,16 +80,23 @@ class ConfigFactory(object):
                     assert self.version == int(key_values['version'])
 
                 # parse pruners
-                if key == 'pruners' or key == 'strategies':
+                if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies':
                     instances = key_values[key]
                     for name in instances:
                         self._new_instance(name, instances[name])
 
-                if key == 'compress_pass':
-                    compress_pass = self._new_instance(key, key_values[key])
-                    for name in key_values[key]['strategies']:
-                        strategy = self.instance(name)
-                        compress_pass.add_strategy(strategy)
+                if key == 'compressor':
+                    self.compressor['strategies'] = []
+                    self.compressor['epoch'] = key_values[key]['epoch']
+                    if 'init_model' in key_values[key]:
+                        self.compressor['init_model'] = key_values[key][
+                            'init_model']
+                    self.compressor['checkpoint_path'] = key_values[key][
+                        'checkpoint_path']
+                    if 'strategies' in key_values[key]:
+                        for name in key_values[key]['strategies']:
+                            strategy = self.instance(name)
+                            self.compressor['strategies'].append(strategy)
 
                 if key == 'include':
                     for config_file in key_values[key]:
diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py
deleted file mode 100644
index fc1ddc94e0..0000000000
--- a/python/paddle/fluid/contrib/slim/core/pass_builder.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .compress_pass import CompressPass
-from .config import ConfigFactory
-
-__all__ = ['build_compressor']
-
-
-def build_compressor(place=None,
-                     data_reader=None,
-                     data_feeder=None,
-                     scope=None,
-                     metrics=None,
-                     epoch=None,
-                     config=None):
-    if config is not None:
-        factory = ConfigFactory(config)
-        comp_pass = factory.get_compress_pass()
-    else:
-        comp_pass = CompressPass()
-    comp_pass.place = place
-    comp_pass.data_reader = data_reader
-    comp_pass.data_feeder = data_feeder
-    comp_pass.scope = scope
-    comp_pass.metrics = metrics
-    comp_pass.epoch = epoch
-    return comp_pass
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
index 74d98e98b0..28bf24f4e3 100644
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
@@ -20,7 +20,7 @@ class Strategy(object):
     Base class for all strategies.
     """
 
-    def __init__(self, start_epoch=0, end_epoch=10):
+    def __init__(self, start_epoch=0, end_epoch=0):
         """
         Args:
             start_epoch: The first epoch to apply the strategy.
@@ -29,7 +29,7 @@ class Strategy(object):
         self.start_epoch = start_epoch
         self.end_epoch = end_epoch
 
-    def on_compress_begin(self, context):
+    def on_compression_begin(self, context):
         pass
 
     def on_epoch_begin(self, context):
@@ -44,5 +44,5 @@ class Strategy(object):
     def on_batch_end(self, context):
         pass
 
-    def on_compress_end(self, context):
+    def on_compression_end(self, context):
         pass
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
deleted file mode 100644
index ea888fa2c7..0000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
deleted file mode 100644
index 21c59c0c9d..0000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle
-import os
-import sys
-from paddle.fluid.contrib.slim import CompressPass
-from paddle.fluid.contrib.slim import build_compressor
-from paddle.fluid.contrib.slim import ImitationGraph
-
-
-class LinearModel(object):
-    def __init__(slef):
-        pass
-
-    def train(self):
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        startup_program.random_seed = 10
-        with fluid.program_guard(train_program, startup_program):
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            predict = fluid.layers.fc(input=x, size=1, act=None)
-            cost = fluid.layers.square_error_cost(input=predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-            eval_program = train_program.clone()
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_cost)
-
-        train_reader = paddle.batch(
-            paddle.dataset.uci_housing.train(), batch_size=1)
-        eval_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=1)
-        place = fluid.CPUPlace()
-        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        train_metrics = {"loss": avg_cost.name}
-        eval_metrics = {"loss": avg_cost.name}
-
-        graph = ImitationGraph(train_program)
-        config = './config.yaml'
-        comp_pass = build_compressor(
-            place,
-            data_reader=train_reader,
-            data_feeder=train_feeder,
-            scope=fluid.global_scope(),
-            metrics=train_metrics,
-            epoch=1,
-            config=config)
-        comp_pass.apply(graph)
-
-
-if __name__ == "__main__":
-    model = LinearModel()
-    model.train()
diff --git a/python/paddle/fluid/contrib/slim/distillation/__init__.py b/python/paddle/fluid/contrib/slim/distillation/__init__.py
new file mode 100644
index 0000000000..455c7c5633
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import distiller
+from .distiller import *
+from . import distillation_strategy
+from .distillation_strategy import *
+
+__all__ = distiller.__all__
+__all__ += distillation_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
new file mode 100644
index 0000000000..1f11f07a51
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import Executor
+import logging
+
+__all__ = ['DistillationStrategy']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class DistillationStrategy(Strategy):
+    def __init__(self, distillers=None, start_epoch=0, end_epoch=0):
+        """
+        Args:
+            distillers(list): A list of distiller used to combine student graph and teacher graph
+                              by adding some loss.
+            start_epoch(int): The epoch when to merge student graph and teacher graph for
+                              distillation training. default: 0
+            end_epoch(int): The epoch when to finish distillation training. default: 0
+            
+        """
+        super(DistillationStrategy, self).__init__(start_epoch, end_epoch)
+        self.distillers = distillers
+
+    def on_compression_begin(self, context):
+        # load from checkpoint
+        if context.epoch_id > 0:
+            if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch:
+                _logger.info('Restore DistillationStrategy')
+                self._create_distillation_graph(context)
+                _logger.info('Restore DistillationStrategy finish.')
+
+    def on_epoch_begin(self, context):
+        if self.start_epoch == context.epoch_id:
+            _logger.info('DistillationStrategy::on_epoch_begin.')
+            self._create_distillation_graph(context)
+            _logger.info('DistillationStrategy set optimize_graph.')
+
+    def _create_distillation_graph(self, context):
+        """
+        step 1: Merge student graph and teacher graph into distillation graph.
+        step 2: Add loss into distillation graph by distillers.
+        step 3: Append backward ops and optimize ops into distillation graph for training.
+        """
+        # step 1
+        teacher = context.teacher_graphs[0]
+        for var in teacher.program.list_vars():
+            var.stop_gradient = True
+        graph = context.train_graph.clone()
+        graph.merge(teacher)
+        graph.out_nodes['student_loss'] = graph.out_nodes['loss']
+
+        # step 2
+        for distiller in self.distillers:
+            graph = distiller.distiller_loss(graph)
+
+        # step 3
+        startup_program = Program()
+        with program_guard(graph.program, startup_program):
+            context.distiller_optimizer._name = 'distillation_optimizer'
+            context.distiller_optimizer.minimize(
+                graph.var(graph.out_nodes['loss'])._var)
+        exe = Executor(context.place)
+        exe.run(startup_program, scope=context.scope)
+
+        # backup graph for fine-tune after distillation
+        context.put('distillation_backup_optimize_graph',
+                    context.optimize_graph)
+        context.optimize_graph = graph
+
+    def on_epoch_end(self, context):
+        if context.epoch_id == (self.end_epoch - 1):
+            _logger.info('DistillationStrategy::on_epoch_end.')
+            # restore optimize_graph for fine-tune or other strategy in next stage.
+            context.optimize_graph = context.get(
+                'distillation_backup_optimize_graph')
+            _logger.info(
+                'DistillationStrategy set context.optimize_graph to None.')
diff --git a/python/paddle/fluid/contrib/slim/distillation/distiller.py b/python/paddle/fluid/contrib/slim/distillation/distiller.py
new file mode 100644
index 0000000000..13bb35a8be
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/distiller.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import layers
+from .... import optimizer
+from .... import Executor
+from .... import Program
+from .... import program_guard
+from .... import regularizer
+
+__all__ = ['FSPDistiller', 'L2Distiller']
+
+
+class L2Distiller(object):
+    """
+    Combine two layers from student net and teacher net by l2-loss.
+    And add the loss into the total loss using for distillation training.
+    """
+
+    def __init__(self,
+                 student_feature_map,
+                 teacher_feature_map,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_feature_map(str): The name of feature map from student network.
+            teacher_feature_map(str): The name of feature map from teacher network.
+                                      It's shape should be the same with student network.
+            distillation_loss_weight(float): The weight of the l2-loss.
+        """
+        self.student_feature_map = student_feature_map
+        self.teacher_feature_map = teacher_feature_map
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def distiller_loss(self, graph):
+        """
+        Modify graph inplace to add l2-loss.
+        Args: 
+            graph(GraphWrapper): The graph to be modified.
+        Returns:
+            GraphWrapper: The modified graph.
+        """
+        distiller_pass = L2DistillerPass(self.student_feature_map,
+                                         self.teacher_feature_map,
+                                         self.distillation_loss_weight)
+        dis_graph = distiller_pass.apply(graph)
+        return dis_graph
+
+
+class L2DistillerPass(object):
+    """
+    The pass used to add l2-loss.
+    """
+
+    def __init__(self,
+                 student_feature_map,
+                 teacher_feature_map,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_feature_map(str): The name of feature map from student network.
+            teacher_feature_map(str): The name of feature map from teacher network.
+                                      It's shape should be the same with student network.
+            distillation_loss_weight(float): The weight of the l2-loss.
+        """
+        self.student_feature_map = student_feature_map
+        self.teacher_feature_map = teacher_feature_map
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def apply(self, graph):
+        ret_graph = graph
+        with program_guard(ret_graph.program):
+
+            student_feature_map = ret_graph.var(self.student_feature_map)._var
+            teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
+            l2loss = layers.reduce_mean(
+                layers.square(student_feature_map - teacher_feature_map))
+
+            distillation_loss = l2loss * self.distillation_loss_weight
+            student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
+            loss = distillation_loss + student_loss
+
+            ret_graph.out_nodes[
+                'l2loss_' + self.student_feature_map + "_" +
+                self.teacher_feature_map] = distillation_loss.name
+            ret_graph.out_nodes['loss'] = loss.name
+        return ret_graph
+
+
+class FSPDistiller(object):
+    """
+    Combine layers from student net and teacher net by fsp-loss.
+    """
+
+    def __init__(self, student_pairs, teacher_pairs,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+                                        a section in student network. The variables in a tuple should
+                                        have the same feature map size.
+            teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+                                        a section in teacher network. The variables in a tuple should
+                                        have the same feature map size. Varibale named teacher_pairs[i][j]
+                                        should has the save channel number with that of variable named 
+                                        student_pairs[i][j].
+
+            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
+        """
+        self.student_pairs = student_pairs
+        self.teacher_pairs = teacher_pairs
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def distiller_loss(self, graph):
+        """
+        Modify graph inplace to add fsp-loss.
+        Args: 
+            graph(GraphWrapper): The graph to be modified.
+        Returns:
+            GraphWrapper: The modified graph.
+        """
+        distiller_pass = FSPDistillerPass(self.student_pairs,
+                                          self.teacher_pairs,
+                                          self.distillation_loss_weight)
+        dis_graph = distiller_pass.apply(graph)
+        return dis_graph
+
+
+class FSPDistillerPass(object):
+    '''
+    Combine layers from student net and teacher net by fsp-loss.
+    '''
+
+    def __init__(self, s_pairs, t_pairs, distillation_loss_weight=1):
+        """
+        Args:
+            s_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+                                        a section in student network. The variables in a tuple should
+                                        have the same feature map size.
+            t_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+                                        a section in teacher network. The variables in a tuple should
+                                        have the same feature map size. Varibale named teacher_pairs[i][j]
+                                        should has the save channel number with that of variable named 
+                                        student_pairs[i][j].
+
+            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
+        """
+        self.s_pairs = s_pairs
+        self.t_pairs = t_pairs
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def apply(self, graph):
+        ret_graph = graph
+        with program_guard(ret_graph.program):
+            losses = []
+            for s_pair, t_pair in zip(self.s_pairs, self.t_pairs):
+                s_pair_start = ret_graph.var(s_pair[0])._var
+                s_pair_end = ret_graph.var(s_pair[1])._var
+                s_fsp_matrix = self._fsp_matrix(s_pair_start, s_pair_end)
+                t_pair_start = ret_graph.var(t_pair[0])._var
+                t_pair_end = ret_graph.var(t_pair[1])._var
+                t_fsp_matrix = self._fsp_matrix(t_pair_start, t_pair_end)
+                l2_loss = layers.reduce_mean(
+                    layers.square(s_fsp_matrix - t_fsp_matrix))
+                losses.append(l2_loss)
+            distillation_loss = layers.sum(
+                losses) * self.distillation_loss_weight
+            student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
+            loss = distillation_loss + student_loss
+
+            ret_graph.out_nodes[
+                'fsp_distillation_loss'] = distillation_loss.name
+            ret_graph.out_nodes['loss'] = loss.name
+        return ret_graph
+
+    def _fsp_matrix(self, fea_map_0, fea_map_1):
+        return layers.fsp_matrix(fea_map_0, fea_map_1)
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
index d65472d193..c5d1c4dbdf 100644
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
@@ -14,10 +14,7 @@
 
 from . import executor
 from .executor import *
-from . import graph
-from .graph import *
-from . import graph_pass
-from .graph_pass import *
+from . import graph_wrapper
+from .graph_wrapper import *
 __all__ = executor.__all__
-__all__ += graph.__all__
-__all__ += graph_pass.__all__
+__all__ += graph_wrapper.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
index c02c3af820..70438a90eb 100644
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -12,51 +12,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import abc
-from abc import abstractmethod
+from ....compiler import CompiledProgram
+from ....data_feeder import DataFeeder
 from .... import executor
-from .graph import IRGraph, ImitationGraph
+from .graph_wrapper import GraphWrapper
 
-__all__ = ['get_executor']
+__all__ = ['SlimGraphExecutor']
 
 
-class GraphExecutor(object):
-    __metaclass__ = abc.ABCMeta
+class SlimGraphExecutor(object):
+    """
+    Wrapper of executor used to run GraphWrapper.
+    """
 
     def __init__(self, place):
-        self.place = place
-
-    @abstractmethod
-    def run(self, graph, feches=None, feed=None):
-        pass
-
-
-class IRGraphExecutor(GraphExecutor):
-    def run(self, grah, fetches, feed=None):
-        pass
-
-
-class ImitationGraphExecutor(GraphExecutor):
-    def __init__(self, place):
-        super(ImitationGraphExecutor, self).__init__(place)
         self.exe = executor.Executor(place)
+        self.place = place
 
-    def run(self, graph, scope=None, fetches=None, feed=None):
-        assert isinstance(graph, ImitationGraph)
-        fetch_list = None
-        if fetches:
-            fetch_list = [
-                graph.program.global_block().var(name) for name in fetches
-            ]
-        results = self.exe.run(graph.program,
+    def run(self, graph, scope, data=None):
+        """
+        Runing a graph with a batch of data.
+        Args:
+            graph(GraphWrapper): The graph to be executed.
+            scope(fluid.core.Scope): The scope to be used.
+            data(list<tuple>): A batch of data. Each tuple in this list is a sample.
+                               It will feed the items of tuple to the in_nodes of graph.
+        Returns:
+            results(list): A list of result with the same order indicated by graph.out_nodes.
+        """
+        assert isinstance(graph, GraphWrapper)
+        if data is not None:
+            feeder = DataFeeder(
+                feed_list=graph.in_nodes.values(),
+                place=self.place,
+                program=graph.program)
+            feed = feeder.feed(data)
+
+        fetch_list = graph.out_nodes.values()
+        program = graph.compiled_graph if graph.compiled_graph else graph.program
+        results = self.exe.run(program,
                                scope=scope,
                                fetch_list=fetch_list,
                                feed=feed)
         return results
-
-
-def get_executor(graph, place):
-    if isinstance(graph, ImitationGraph):
-        return ImitationGraphExecutor(place)
-    if isinstance(graph, IRGraph):
-        return IRGraphExecutor(place)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
deleted file mode 100644
index f38d978341..0000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import os
-import subprocess
-from ....framework import Program
-from ....framework import Block
-from .... import core
-
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
-
-
-class Graph(object):
-    """
-    Base class for all graph.
-    """
-
-    def __init__(self):
-        pass
-
-    def all_parameters(self):
-        """
-        Return all the parameters in current graph.
-        """
-        pass
-
-
-class ImitationGraph(Graph):
-    def __init__(self, program=None):
-        super(ImitationGraph, self).__init__()
-        self.program = Program() if program is None else program
-
-    def all_parameters(self):
-        return self.program.global_block().all_parameters()
-
-
-class IRGraph(Graph):
-    pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
deleted file mode 100644
index 1db6c4f110..0000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['GraphPass', 'PruneParameterPass']
-
-
-class GraphPass(object):
-    """
-    Base class for all graph pass.
-    """
-
-    def __init__(self):
-        pass
-
-    def apply(self, graph):
-        pass
-
-
-class PruneParameterPass(GraphPass):
-    """
-    Generate a graph for pruning parameters from target graph.
-    """
-
-    def __init__(self, pruned_params, thresholds):
-        super(PruneParameterPass, self).__init__()
-        self.pruned_params = pruned_params
-        self.thresholds = thresholds
-        self.default_threshold = thresholds['*']
-
-    def apply(self, graph):
-        pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
new file mode 100644
index 0000000000..c208553fd8
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
@@ -0,0 +1,502 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from .... import io
+from .... import compiler
+from ....framework import Program
+from ....framework import program_guard
+from ....framework import Parameter
+from ....framework import Variable
+from ....executor import Executor
+import copy
+from collections import Iterable
+from ....io import save_inference_model, load_inference_model, save_persistables
+import numpy as np
+import pickle
+import os
+
+__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper']
+
+OPTIMIZER_OPS = [
+    'momentum',
+    'lars_momentum',
+    'adagrad',
+    'adam',
+    'adamax',
+    'decayed_adagrad',
+    'adadelta',
+    'rmsprop',
+]
+
+
+class VarWrapper(object):
+    def __init__(self, var, graph):
+        assert isinstance(var, Variable)
+        assert isinstance(graph, GraphWrapper)
+        self._var = var
+        self._graph = graph
+
+    def __eq__(self, v):
+        """
+        Overwrite this function for ...in... syntax in python.
+        """
+        return self._var.name == v._var.name
+
+    def name(self):
+        """
+        Get the name of the variable.
+        """
+        return self._var.name
+
+    def shape(self):
+        """
+        Get the shape of the varibale.
+        """
+        return self._var.shape
+
+    def set_shape(self, shape):
+        """
+        Set the shape of the variable.
+        """
+        self._var.desc.set_shape(shape)
+
+    def inputs(self):
+        """
+        Get all the operators that use this variable as output.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_inputs():
+                ops.append(op)
+        return ops
+
+    def outputs(self):
+        """
+        Get all the operators that use this variable as input.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_outputs():
+                ops.append(op)
+        return ops
+
+
+class OpWrapper(object):
+    def __init__(self, op, graph):
+        assert isinstance(graph, GraphWrapper)
+        self._op = op
+        self._graph = graph
+
+    def __eq__(self, op):
+        """
+        Overwrite this function for ...in... syntax in python.
+        """
+        return self.idx() == op.idx()
+
+    def all_inputs(self):
+        """
+        Get all the input variables of this operator.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.input_arg_names
+        ]
+
+    def all_outputs(self):
+        """
+        Get all the output variables of this operator.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.output_arg_names
+        ]
+
+    def idx(self):
+        """
+        Get the id of this operator.
+        """
+        return self._op.idx
+
+    def type(self):
+        """
+        Get the type of this operator.
+        """
+        return self._op.type
+
+    def is_bwd_op(self):
+        """
+        Whether this operator is backward op.
+        """
+        return self.type().endswith('_grad')
+
+    def is_opt_op(self):
+        """
+        Whether this operator is optimizer op.
+        """
+        return self.type() in OPTIMIZER_OPS
+
+    def inputs(self, name):
+        """
+        Get all the varibales by the input name.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.input(name)]
+
+    def outputs(self, name):
+        """
+        Get all the varibales by the output name.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.output(name)]
+
+    def set_attr(self, key, value):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            key(str): the attribute name.
+            value(bool|int|str|float|list): the value of the attribute.
+        """
+        self._op._set_attr(key, value)
+
+    def attr(self, name):
+        """
+        Get the attribute by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
+            can be any valid attribute type.
+        """
+        return self._op.attr(name)
+
+
+class GraphWrapper(object):
+    """
+    It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
+    for paddle slim framework.
+    """
+
+    def __init__(self, program=None, in_nodes=[], out_nodes=[]):
+        """
+        Args:
+            program(framework.Program): A program with 
+            in_nodes(dict): A dict to indicate the input nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+            out_nodes(dict): A dict to indicate the input nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+        """
+        super(GraphWrapper, self).__init__()
+        self.program = Program() if program is None else program
+        self.compiled_graph = None
+        self.in_nodes = OrderedDict(in_nodes)
+        self.out_nodes = OrderedDict(out_nodes)
+        self._attrs = OrderedDict()
+
+    def all_parameters(self):
+        """
+        Get all the parameters in this graph.
+        Returns:
+            list<VarWrapper>: A list of VarWrapper instances.
+        """
+        params = []
+        for block in self.program.blocks:
+            for param in block.all_parameters():
+                params.append(VarWrapper(param, self))
+        return params
+
+    def is_parameter(self, var):
+        """
+        Whether the given variable is parameter.
+        Args:
+            var(VarWrapper): The given varibale.
+        """
+        return isinstance(var._var, Parameter)
+
+    def is_persistable(self, var):
+        """
+        Whether the given variable is persistable.
+        Args:
+            var(VarWrapper): The given varibale.
+        """
+        return var._var.persistable
+
+    def compile(self, for_parallel=True, for_test=False):
+        """
+        Compile the program in this wrapper to framework.CompiledProgram for next running.
+        This function must be called if the program is modified.
+        Args:
+            for_parallel(bool): Whether the program to run in data parallel way. default: True.
+            for_test(bool): Whether the compiled program is used for test.
+        """
+        target = self.program
+        if for_test:
+            loss = None
+        else:
+            loss = self.out_nodes['loss']
+        if for_parallel:
+            # disable memory optimize for stable training
+            build_strategy = compiler.BuildStrategy()
+            build_strategy.enable_inplace = False
+            build_strategy.memory_optimize = False
+            self.compiled_graph = compiler.CompiledProgram(
+                target).with_data_parallel(
+                    loss_name=loss, build_strategy=build_strategy)
+        else:
+            self.compiled_graph = compiler.CompiledProgram(target)
+
+    def ops(self):
+        """
+        Return all operator nodes included in the graph as a set.
+        """
+        ops = []
+        for block in self.program.blocks:
+            for op in block.ops:
+                ops.append(OpWrapper(op, self))
+        return ops
+
+    def vars(self):
+        """
+        Get all the variables.
+        """
+        return [VarWrapper(var, self) for var in self.program.list_vars()]
+
+    def var(self, name):
+        """
+        Get the variable by variable name.
+        """
+        return VarWrapper(self.program.global_block().var(name), self)
+
+    def clone(self, for_test=False):
+        """
+        Clone a new graph from current graph.
+        Returns:
+            (GraphWrapper): The wrapper of a new graph.
+        """
+        return GraphWrapper(
+            self.program.clone(for_test),
+            copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes))
+
+    def merge(self, graph):
+        """
+        Merge a graph into current graph.
+        Args:
+            graph(GraphWrapper): The graph to be merged by current graph.
+        """
+        for var in graph.program.list_vars():
+            new_var = self.program.global_block()._clone_variable(
+                var, force_persistable=False)
+            new_var.stop_gradient = var.stop_gradient
+            # TODO: parameters should be cloned
+        for op in graph.ops():
+            op = op._op
+            inputs = {}
+            outputs = {}
+            attrs = {}
+            for input_name in op.input_names:
+                inputs[input_name] = [
+                    self.var(in_var_name)._var
+                    for in_var_name in op.input(input_name)
+                ]
+            for output_name in op.output_names:
+                outputs[output_name] = [
+                    self.var(out_var_name)._var
+                    for out_var_name in op.output(output_name)
+                ]
+            for attr_name in op.attr_names:
+                attrs[attr_name] = op.attr(attr_name)
+            self.program.global_block().append_op(
+                type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+    def program(self):
+        """
+        Get the program in current wrapper.
+        """
+        return self.program
+
+    def pre_ops(self, op):
+        """
+        Get all the previous operators of target operator.
+        Args:
+            op(OpWrapper): Target operator..
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for p in self.ops():
+            for in_var in op.all_inputs():
+                if in_var in p.all_outputs():
+                    ops.append(p)
+        return ops
+
+    def next_ops(self, op):
+        """
+        Get all the next operators of target operator.
+        Args:
+            op(OpWrapper): Target operator..
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for p in self.ops():
+            for out_var in op.all_outputs():
+                if out_var in p.all_inputs():
+                    ops.append(p)
+        return ops
+
+    def get_param_by_op(self, op):
+        """
+        Get the parameters used by target operator.
+        """
+        assert isinstance(op, OpWrapper)
+        params = []
+        for var in op.all_inputs():
+            if isinstance(var._var, Parameter):
+                params.append(var)
+        assert len(params) > 0
+        return params
+
+    def numel_params(self):
+        """
+        Get the number of elements in all parameters.
+        """
+        ret = 0
+        for param in self.all_parameters():
+            ret += np.product(param.shape())
+        return ret
+
+    def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]):
+        """
+        Get a new graph for training by appending some backward operators and optimization operators.
+        Args:
+            optimizer: The optimzier used to generate training graph.
+            place: The place to run the graph.
+            scope: The scope used to run the graph. Some new variable will be added into this scope.
+            no_grad_var_names(list<str>): Names of variables that should be ignored while computing gradients. default: [].
+        Returns:
+            (GraphWrapper): The wrapper of new graph with backward ops and optimization ops. 
+        """
+        graph = self.clone()
+        startup_program = Program()
+        with program_guard(
+                main_program=graph.program, startup_program=startup_program):
+            target_name = None
+            if 'loss' in graph.out_nodes:
+                target_name = graph.out_nodes['loss']
+            elif 'cost' in graph.out_nodes:
+                target_name = graph.out_nodes['cost']
+            target = graph.var(target_name)._var
+            optimizer.minimize(target, no_grad_set=no_grad_var_names)
+
+        exe = Executor(place)
+        exe.run(program=startup_program, scope=scope)
+        return graph
+
+    def flops(self, only_conv=False):
+        """
+        Get the flops of current graph.
+        Args:
+            only_conv: Only calculating the conv layers. default: False.
+        Returns:
+            int: The flops of current graph.
+        """
+        flops = 0
+        for op in self.ops():
+            if op.type() in ['conv2d', 'depthwise_conv2d']:
+                filter_shape = op.inputs("Filter")[0].shape()
+                input_shape = op.inputs("Input")[0].shape()
+                output_shape = op.outputs("Output")[0].shape()
+                c_out, c_in, k_h, k_w = filter_shape
+                _, _, h_out, w_out = output_shape
+                groups = op.attr("groups")
+                kernel_ops = k_h * k_w * (c_in / groups)
+                if len(op.inputs("Bias")) > 0:
+                    with_bias = 1
+                else:
+                    with_bias = 0
+                flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias)
+            elif op.type() == 'pool2d' and not only_conv:
+                input_shape = op.inputs("X")[0].shape()
+                output_shape = op.outputs("Out")[0].shape()
+                _, c_out, h_out, w_out = output_shape
+                k_size = op.attr("ksize")
+                flops += h_out * w_out * c_out * (k_size[0]**2)
+
+            elif op.type() == 'mul' and not only_conv:
+                x_shape = list(op.inputs("X")[0].shape())
+                y_shape = op.inputs("Y")[0].shape()
+                if x_shape[0] == -1:
+                    x_shape[0] = 1
+                flops += 2 * x_shape[0] * x_shape[1] * y_shape[1]
+
+            elif op.type() in ['relu', 'sigmoid', 'batch_norm'
+                               ] and not only_conv:
+                input_shape = list(op.inputs("X")[0].shape())
+                if input_shape[0] == -1:
+                    input_shape[0] = 1
+                flops += np.product(input_shape)
+
+        return flops
+
+    def save_persistables(self, path, exe):
+        """
+        Save all the persistable variables into file.
+        Args:
+            path(str): The path to save the persistables.
+            exe(framework.Executor): The executor used to save the persistables.
+        """
+        io.save_persistables(exe.exe, path, main_program=self.program)
+
+    def load_persistables(self, path, exe):
+        """
+        Load the persistable variables from file.
+        Args:
+            path(str): The path to load the persistables.
+            exe(framework.Executor): The executor used to load the persistables.
+        """
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(path, var.name))
+
+        io.load_vars(
+            exe.exe, path, main_program=self.program, predicate=if_exist)
+
+    def update_param_shape(self, scope):
+        """
+        Update the shape of parameters in the graph according to tensors in scope.
+        It is used after loading pruned parameters from file.
+        """
+        for param in self.all_parameters():
+            tensor_shape = np.array(scope.find_var(param.name()).get_tensor(
+            )).shape
+            param.set_shape(tensor_shape)
+
+    def infer_shape(self):
+        """
+        Update the groups of convolution layer according to current filters.
+        It is used after loading pruned parameters from file.
+        """
+        for op in self.ops():
+            if op.type() != 'conditional_block':
+                op._op.desc.infer_shape(op._op.block.desc)
+
+    def update_groups_of_conv(self):
+        for op in self.ops():
+            if op.type() == 'depthwise_conv2d':
+                op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
index 34c5107daa..7a25c3a61e 100644
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -13,54 +13,919 @@
 # limitations under the License.
 
 from ..core.strategy import Strategy
-from ....framework import Program, program_guard
+from ..graph import VarWrapper, OpWrapper, GraphWrapper
+from ....framework import Program, program_guard, Parameter
 from .... import layers
+import prettytable as pt
 import numpy as np
+from scipy.optimize import leastsq
+import copy
+import re
+import os
+import pickle
+import logging
+import sys
 
-__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy']
 
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class PruneStrategy(Strategy):
+    """
+    The base class of all pruning strategies.
+    """
 
-class SensitivePruneStrategy(Strategy):
     def __init__(self,
                  pruner=None,
                  start_epoch=0,
-                 end_epoch=10,
-                 delta_rate=0.20,
-                 acc_loss_threshold=0.2,
-                 sensitivities=None):
-        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model.
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned.
+        """
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
         self.pruner = pruner
-        self.delta_rate = delta_rate
-        self.acc_loss_threshold = acc_loss_threshold
-        self.sensitivities = sensitivities
+        self.target_ratio = target_ratio
+        self.metric_name = metric_name
+        self.pruned_params = pruned_params
+        self.pruned_list = []
+        self.backup = {}
+        self.param_shape_backup = {}
 
+    def _eval_graph(self, context, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current mode in context.
+        Args:
+            context(slim.core.Context): The context storing all information used to evaluate the current model.
+            sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
+        """
+        results, names = context.run_eval_graph(sampled_rate, cached_id)
+        metric = np.mean(results[list(names).index(self.metric_name)])
+        return metric
 
-class PruneStrategy(Strategy):
+    def _prune_filters_by_ratio(self,
+                                scope,
+                                params,
+                                ratio,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning filters by given ratio.
+        Args:
+            scope(fluid.core.Scope): The scope used to pruning filters.
+            params(list<VarWrapper>): A list of filter parameters.
+            ratio(float): The ratio to be pruned.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        if params[0].name() in self.pruned_list[0]:
+            return
+        param_t = scope.find_var(params[0].name()).get_tensor()
+        pruned_idx = self.pruner.cal_pruned_idx(
+            params[0].name(), np.array(param_t), ratio, axis=0)
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[0] = pruned_param.shape[0]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()), str(0), str(ori_shape), str(param.shape())))
+            self.pruned_list[0].append(param.name())
+        return pruned_idx
+
+    def _prune_parameter_by_idx(self,
+                                scope,
+                                params,
+                                pruned_idx,
+                                pruned_axis,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning parameters in given axis.
+        Args:
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            params(VarWrapper): The parameter to be pruned.
+            pruned_idx(list): The index of elements to be pruned.
+            pruned_axis(int): The pruning axis.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        if params[0].name() in self.pruned_list[pruned_axis]:
+            return
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()),
+                str(pruned_axis), str(ori_shape), str(param.shape())))
+            self.pruned_list[pruned_axis].append(param.name())
+
+    def _forward_search_related_op(self, graph, param):
+        """
+        Forward search operators that will be affected by pruning of param.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            param(VarWrapper): The current pruned parameter.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        assert isinstance(param, VarWrapper)
+        visited = {}
+        for op in graph.ops():
+            visited[op.idx()] = False
+        stack = []
+        for op in graph.ops():
+            if (not op.is_bwd_op()) and (param in op.all_inputs()):
+                stack.append(op)
+        visit_path = []
+        while len(stack) > 0:
+            top_op = stack[len(stack) - 1]
+            if visited[top_op.idx()] == False:
+                visit_path.append(top_op)
+                visited[top_op.idx()] = True
+            next_ops = None
+            if top_op.type() == "conv2d" and param not in top_op.all_inputs():
+                next_ops = None
+            elif top_op.type() == "mul":
+                next_ops = None
+            else:
+                next_ops = self._get_next_unvisited_op(graph, visited, top_op)
+            if next_ops == None:
+                stack.pop()
+            else:
+                stack += next_ops
+        return visit_path
+
+    def _get_next_unvisited_op(self, graph, visited, top_op):
+        """
+        Get next unvisited adjacent operators of given operators.
+        Args:
+            graph(GraphWrapper): The graph used to search. 
+            visited(list): The ids of operators that has been visited.
+            top_op: The given operator.
+        Returns:
+            list<OpWrapper>: A list of operators. 
+        """
+        assert isinstance(top_op, OpWrapper)
+        next_ops = []
+        for op in graph.next_ops(top_op):
+            if (visited[op.idx()] == False) and (not op.is_bwd_op()):
+                next_ops.append(op)
+        return next_ops if len(next_ops) > 0 else None
+
+    def _get_accumulator(self, graph, param):
+        """
+        Get accumulators of given parameter. The accumulator was created by optimizer.
+        Args:
+            graph(GraphWrapper): The graph used to search.
+            param(VarWrapper): The given parameter.
+        Returns:
+            list<VarWrapper>: A list of accumulators which are variables.
+        """
+        assert isinstance(param, VarWrapper)
+        params = []
+        for op in param.outputs():
+            if op.is_opt_op():
+                for out_var in op.all_outputs():
+                    if graph.is_persistable(out_var) and out_var.name(
+                    ) != param.name():
+                        params.append(out_var)
+        return params
+
+    def _forward_pruning_ralated_params(self,
+                                        graph,
+                                        scope,
+                                        param,
+                                        place,
+                                        ratio=None,
+                                        pruned_idxs=None,
+                                        lazy=False,
+                                        only_graph=False):
+        """
+        Pruning all the parameters affected by the pruning of given parameter.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            param(VarWrapper): The given parameter.
+            place(fluid.Place): The device place of filter parameters.
+            ratio(float): The target ratio to be pruned.
+            pruned_idx(list): The index of elements to be pruned.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        assert isinstance(
+            graph,
+            GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
+        assert isinstance(
+            param, VarWrapper), "param must be instance of slim.core.VarWrapper"
+
+        if param.name() in self.pruned_list[0]:
+            return
+        related_ops = self._forward_search_related_op(graph, param)
+
+        if ratio is None:
+            assert pruned_idxs is not None
+            self._prune_parameter_by_idx(
+                scope, [param] + self._get_accumulator(graph, param),
+                pruned_idxs,
+                pruned_axis=0,
+                place=place,
+                lazy=lazy,
+                only_graph=only_graph)
+
+        else:
+            pruned_idxs = self._prune_filters_by_ratio(
+                scope, [param] + self._get_accumulator(graph, param),
+                ratio,
+                place,
+                lazy=lazy,
+                only_graph=only_graph)
+        corrected_idxs = pruned_idxs[:]
+
+        for idx, op in enumerate(related_ops):
+            if op.type() == "conv2d" and (param not in op.all_inputs()):
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=1,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            if op.type() == "depthwise_conv2d":
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "elementwise_add":
+                # pruning bias
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        bias_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [bias_param] + self._get_accumulator(
+                                graph, bias_param),
+                            pruned_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "mul":  # pruning fc layer
+                fc_input = None
+                fc_param = None
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        fc_param = in_var
+                    else:
+                        fc_input = in_var
+
+                idx = []
+                feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
+                range_idx = np.array(range(feature_map_size))
+                for i in corrected_idxs:
+                    idx += list(range_idx + i * feature_map_size)
+                corrected_idxs = idx
+                self._prune_parameter_by_idx(
+                    scope, [fc_param] + self._get_accumulator(graph, fc_param),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+
+            elif op.type() == "concat":
+                concat_inputs = op.all_inputs()
+                last_op = related_ops[idx - 1]
+                for out_var in last_op.all_outputs():
+                    if out_var in concat_inputs:
+                        concat_idx = concat_inputs.index(out_var)
+                offset = 0
+                for ci in range(concat_idx):
+                    offset += concat_inputs[ci].shape()[1]
+                corrected_idxs = [x + offset for x in pruned_idxs]
+            elif op.type() == "batch_norm":
+                bn_inputs = op.all_inputs()
+                mean = bn_inputs[2]
+                variance = bn_inputs[3]
+                alpha = bn_inputs[0]
+                beta = bn_inputs[1]
+                self._prune_parameter_by_idx(
+                    scope, [mean] + self._get_accumulator(graph, mean),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [variance] + self._get_accumulator(graph, variance),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [alpha] + self._get_accumulator(graph, alpha),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [beta] + self._get_accumulator(graph, beta),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+
+    def _prune_parameters(self,
+                          graph,
+                          scope,
+                          params,
+                          ratios,
+                          place,
+                          lazy=False,
+                          only_graph=False):
+        """
+        Pruning the given parameters.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            params(list<str>): A list of parameter names to be pruned.
+            ratios(list<float>): A list of ratios to be used to pruning parameters.
+            place(fluid.Place): The device place of filter parameters.
+            pruned_idx(list): The index of elements to be pruned.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+
+        """
+        _logger.debug('\n################################')
+        _logger.debug('#       pruning parameters       #')
+        _logger.debug('################################\n')
+        _logger.debug(
+            '|----------------------------------------+----+------------------------------+------------------------------|'
+        )
+        _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis',
+                                                            'from', 'to'))
+        assert len(params) == len(ratios)
+        self.pruned_list = [[], []]
+        for param, ratio in zip(params, ratios):
+            assert isinstance(param, str) or isinstance(param, unicode)
+            param = graph.var(param)
+            self._forward_pruning_ralated_params(
+                graph,
+                scope,
+                param,
+                place,
+                ratio=ratio,
+                lazy=lazy,
+                only_graph=only_graph)
+            ops = param.outputs()
+            for op in ops:
+                if op.type() == 'conv2d':
+                    brother_ops = self._search_brother_ops(graph, op)
+                    for broher in brother_ops:
+                        for p in graph.get_param_by_op(broher):
+                            self._forward_pruning_ralated_params(
+                                graph,
+                                scope,
+                                p,
+                                place,
+                                ratio=ratio,
+                                lazy=lazy,
+                                only_graph=only_graph)
+        _logger.debug(
+            '|----------------------------------------+----+------------------------------+------------------------------|'
+        )
+
+    def _search_brother_ops(self, graph, op_node):
+        """
+        Search brother operators that was affected by pruning of given operator.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            op_node(OpWrapper): The start node for searching.
+        Returns: 
+            list<VarWrapper>: A list of operators.
+        """
+        visited = [op_node.idx()]
+        stack = []
+        brothers = []
+        for op in graph.next_ops(op_node):
+            if (op.type() != 'conv2d') and (op.type() != 'fc') and (
+                    not op._is_bwd_op()):
+                stack.append(op)
+                visited.append(op.idx())
+        while len(stack) > 0:
+            top_op = stack.pop()
+            for parent in graph.pre_ops(top_op):
+                if parent.idx() not in visited and (not parent._is_bwd_op()):
+                    if ((parent.type == 'conv2d') or (parent.type == 'fc')):
+                        brothers.append(parent)
+                    else:
+                        stack.append(parent)
+                    visited.append(parent.idx())
+
+            for child in graph.next_ops(top_op):
+                if (child.type != 'conv2d') and (child.type != 'fc') and (
+                        child.idx() not in visited) and (
+                            not child._is_bwd_op()):
+                    stack.append(child)
+                    visited.append(child.idx())
+        return brothers
+
+    def _prune_graph(self, graph, target_graph):
+        """
+        Pruning parameters of graph according to target graph.
+        Args:
+            graph(GraphWrapper): The graph to be pruned.
+            target_graph(GraphWrapper): The reference graph.
+        Return: None
+        """
+        count = 1
+        _logger.debug(
+            '|----+----------------------------------------+------------------------------+------------------------------|'
+        )
+        _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format('id', 'parammeter',
+                                                            'from', 'to'))
+        for param in target_graph.all_parameters():
+            var = graph.var(param.name())
+            ori_shape = var.shape()
+            var.set_shape(param.shape())
+            _logger.debug(
+                '|----+----------------------------------------+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format(
+                str(count),
+                str(param.name()), str(ori_shape), str(param.shape())))
+            count += 1
+        _logger.debug(
+            '|----+----------------------------------------+------------------------------+------------------------------|'
+        )
+
+
+class UniformPruneStrategy(PruneStrategy):
     """
-    The strategy that pruning weights by threshold or ratio iteratively.
+    The uniform pruning strategy. The parameters will be pruned by uniform ratio.
     """
 
     def __init__(self,
-                 pruner,
-                 mini_batch_pruning_frequency=1,
+                 pruner=None,
                  start_epoch=0,
-                 end_epoch=10):
-        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
-        self.pruner = pruner
-        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
-
-    def _triger(self, context):
-        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
-                self.start_epoch <= context.epoch_id < self.end_epoch)
-
-    def on_batch_end(self, context):
-        if self._triger(context):
-            prune_program = Program()
-            with program_guard(prune_program):
-                for param in context.graph.all_parameters():
-                    prune_program.global_block().clone_variable(param)
-                    p = prune_program.global_block().var(param.name)
-                    zeros_mask = self.pruner.prune(p)
-                    pruned_param = p * zeros_mask
-                    layers.assign(input=pruned_param, output=param)
-            context.program_exe.run(prune_program, scope=context.scope)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model.
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned.
+        """
+        super(UniformPruneStrategy, self).__init__(pruner, start_epoch,
+                                                   end_epoch, target_ratio,
+                                                   metric_name, pruned_params)
+
+    def _get_best_ratios(self, context):
+        """
+        Search a group of ratios for pruning target flops.
+        """
+        _logger.info('_get_best_ratios')
+        pruned_params = []
+        for param in context.eval_graph.all_parameters():
+            if re.match(self.pruned_params, param.name()):
+                pruned_params.append(param.name())
+
+        min_ratio = 0.
+        max_ratio = 1.
+
+        flops = context.eval_graph.flops()
+        model_size = context.eval_graph.numel_params()
+
+        while min_ratio < max_ratio:
+            ratio = (max_ratio + min_ratio) / 2
+            _logger.debug(
+                '-----------Try pruning ratio: {:.2f}-----------'.format(ratio))
+            ratios = [ratio] * len(pruned_params)
+            self._prune_parameters(
+                context.eval_graph,
+                context.scope,
+                pruned_params,
+                ratios,
+                context.place,
+                only_graph=True)
+
+            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
+                               model_size)
+            _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops))
+            _logger.debug('Pruned model size: {:.2f}'.format(pruned_size))
+            for param in self.param_shape_backup.keys():
+                context.eval_graph.var(param).set_shape(self.param_shape_backup[
+                    param])
+            self.param_shape_backup = {}
+
+            if abs(pruned_flops - self.target_ratio) < 1e-2:
+                break
+            if pruned_flops > self.target_ratio:
+                max_ratio = ratio
+            else:
+                min_ratio = ratio
+        _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
+        return pruned_params, ratios
+
+    def on_epoch_begin(self, context):
+        if context.epoch_id == self.start_epoch:
+            params, ratios = self._get_best_ratios(context)
+
+            self._prune_parameters(context.optimize_graph, context.scope,
+                                   params, ratios, context.place)
+
+            model_size = context.eval_graph.numel_params()
+            flops = context.eval_graph.flops()
+            _logger.debug('\n################################')
+            _logger.debug('#          pruning eval graph    #')
+            _logger.debug('################################\n')
+            self._prune_graph(context.eval_graph, context.optimize_graph)
+            context.optimize_graph.update_groups_of_conv()
+            context.eval_graph.update_groups_of_conv()
+
+            _logger.info(
+                '------------------finish pruning--------------------------------'
+            )
+            _logger.info('Pruned size: {:.2f}'.format(1 - (float(
+                context.eval_graph.numel_params()) / model_size)))
+            _logger.info('Pruned flops: {:.2f}'.format(1 - (float(
+                context.eval_graph.flops()) / flops)))
+            #            metric = self._eval_graph(context)
+            #            _logger.info('Metric after pruning: {:.2f}'.format(metric))
+            _logger.info(
+                '------------------UniformPruneStrategy.on_compression_begin finish--------------------------------'
+            )
+
+
+class SensitivePruneStrategy(PruneStrategy):
+    """
+    Sensitive pruning strategy. Different pruned ratio was applied on each layer.
+    """
+
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=0,
+                 delta_rate=0.20,
+                 target_ratio=0.5,
+                 metric_name='top1_acc',
+                 pruned_params='conv.*_weights',
+                 sensitivities_file='./sensitivities.data',
+                 sensitivities={},
+                 num_steps=1,
+                 eval_rate=None):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 10.
+            delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2
+            target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper. default: 'top1_acc'
+            pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'.
+            sensitivities_file(str): The sensitivities file. default: './sensitivities.data'
+            sensitivities(dict): The user-defined sensitivities. default: {}.
+            num_steps(int): The number of pruning steps. default: 1.
+            eval_rate(float): The rate of sampled data used to calculate sensitivities.
+                              None means using all the data. default: None.
+        """
+        super(SensitivePruneStrategy, self).__init__(pruner, start_epoch,
+                                                     end_epoch, target_ratio,
+                                                     metric_name, pruned_params)
+        self.delta_rate = delta_rate
+        self.pruned_list = []
+        self.sensitivities = sensitivities
+        self.sensitivities_file = sensitivities_file
+        self.backup = {}
+        self.param_shape_backup = {}
+        self.num_steps = num_steps
+        self.eval_rate = eval_rate
+        self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps)
+
+    def _save_sensitivities(self, sensitivities, sensitivities_file):
+        """
+        Save sensitivities into file.
+        """
+        with open(sensitivities_file, 'wb') as f:
+            pickle.dump(sensitivities, f)
+
+    def _load_sensitivities(self, sensitivities_file):
+        """
+        Load sensitivities from file.
+        """
+        sensitivities = {}
+        if sensitivities_file and os.path.exists(sensitivities_file):
+            with open(sensitivities_file, 'rb') as f:
+                if sys.version_info < (3, 0):
+                    sensitivities = pickle.load(f)
+                else:
+                    sensitivities = pickle.load(f, encoding='bytes')
+
+        for param in sensitivities:
+            sensitivities[param]['pruned_percent'] = [
+                round(p, 2) for p in sensitivities[param]['pruned_percent']
+            ]
+        self._format_sensitivities(sensitivities)
+        return sensitivities
+
+    def _format_sensitivities(self, sensitivities):
+        """
+        Print formated sensitivities in debug log level.
+        """
+        tb = pt.PrettyTable()
+        tb.field_names = ["parameter", "size"] + [
+            str(round(i, 2))
+            for i in np.arange(self.delta_rate, 1, self.delta_rate)
+        ]
+        for param in sensitivities:
+            if len(sensitivities[param]['loss']) == (len(tb.field_names) - 2):
+                tb.add_row([param, sensitivities[param]['size']] + [
+                    round(loss, 2) for loss in sensitivities[param]['loss']
+                ])
+        _logger.debug('\n################################')
+        _logger.debug('#      sensitivities table     #')
+        _logger.debug('################################\n')
+        _logger.debug(tb)
+
+    def _compute_sensitivities(self, context):
+        """
+        Computing the sensitivities of all parameters.
+        """
+        _logger.info("calling _compute_sensitivities.")
+        self.param_shape_backup = {}
+        self.backup = {}
+        cached_id = np.random.randint(1000)
+        if self.start_epoch == context.epoch_id:
+            sensitivities_file = self.sensitivities_file
+        else:
+            sensitivities_file = self.sensitivities_file + ".epoch" + str(
+                context.epoch_id)
+        sensitivities = self._load_sensitivities(sensitivities_file)
+
+        for param in context.eval_graph.all_parameters():
+            if not re.match(self.pruned_params, param.name()):
+                continue
+            if param.name() not in sensitivities:
+                sensitivities[param.name()] = {
+                    'pruned_percent': [],
+                    'loss': [],
+                    'size': param.shape()[0]
+                }
+
+        metric = None
+
+        for param in sensitivities.keys():
+            ratio = self.delta_rate
+            while ratio < 1:
+                ratio = round(ratio, 2)
+                if ratio in sensitivities[param]['pruned_percent']:
+                    _logger.debug('{}, {} has computed.'.format(param, ratio))
+                    ratio += self.delta_rate
+                    continue
+                if metric is None:
+                    metric = self._eval_graph(context, self.eval_rate,
+                                              cached_id)
+                # prune parameter by ratio
+                self._prune_parameters(
+                    context.eval_graph,
+                    context.scope, [param], [ratio],
+                    context.place,
+                    lazy=True)
+                self.pruned_list[0]
+                # get accuracy after pruning and update self.sensitivities
+                pruned_metric = self._eval_graph(context, self.eval_rate,
+                                                 cached_id)
+                loss = metric - pruned_metric
+                _logger.info("pruned param: {}; {}; loss={}".format(
+                    param, ratio, loss))
+                for brother in self.pruned_list[0]:
+                    if re.match(self.pruned_params, brother):
+                        if brother not in sensitivities:
+                            sensitivities[brother] = {
+                                'pruned_percent': [],
+                                'loss': []
+                            }
+                        sensitivities[brother]['pruned_percent'].append(ratio)
+                        sensitivities[brother]['loss'].append(loss)
+
+                self._save_sensitivities(sensitivities, sensitivities_file)
+
+                # restore pruned parameters
+                for param_name in self.backup.keys():
+                    param_t = context.scope.find_var(param_name).get_tensor()
+                    param_t.set(self.backup[param_name], context.place)
+
+#                pruned_metric = self._eval_graph(context)
+                self.backup = {}
+
+                ratio += self.delta_rate
+        return sensitivities
+
+    def _get_best_ratios(self, context, sensitivities, target_ratio):
+        """
+        Search a group of ratios for pruning target flops.
+        """
+        _logger.info('_get_best_ratios for pruning ratie: {}'.format(
+            target_ratio))
+        self.param_shape_backup = {}
+        self.backup = {}
+
+        def func(params, x):
+            a, b, c, d = params
+            return a * x * x * x + b * x * x + c * x + d
+
+        def error(params, x, y):
+            return func(params, x) - y
+
+        def slove_coefficient(x, y):
+            init_coefficient = [10, 10, 10, 10]
+            coefficient, loss = leastsq(error, init_coefficient, args=(x, y))
+            return coefficient
+
+        min_loss = 0.
+        max_loss = 0.
+
+        # step 1: fit curve by sensitivities
+        coefficients = {}
+        for param in sensitivities:
+            losses = np.array([0] * 5 + sensitivities[param]['loss'])
+            precents = np.array([0] * 5 + sensitivities[param][
+                'pruned_percent'])
+            coefficients[param] = slove_coefficient(precents, losses)
+            loss = np.max(losses)
+            max_loss = np.max([max_loss, loss])
+
+        # step 2: Find a group of ratios by binary searching.
+        flops = context.eval_graph.flops()
+        model_size = context.eval_graph.numel_params()
+        ratios = []
+        while min_loss < max_loss:
+            loss = (max_loss + min_loss) / 2
+            _logger.info(
+                '-----------Try pruned ratios while acc loss={:.4f}-----------'.
+                format(loss))
+            ratios = []
+            # step 2.1: Get ratios according to current loss
+            for param in sensitivities:
+                coefficient = copy.deepcopy(coefficients[param])
+                coefficient[-1] = coefficient[-1] - loss
+                roots = np.roots(coefficient)
+                for root in roots:
+                    min_root = 1
+                    if np.isreal(root) and root > 0 and root < 1:
+                        selected_root = min(root.real, min_root)
+                ratios.append(selected_root)
+            _logger.info('Pruned ratios={}'.format(
+                [round(ratio, 3) for ratio in ratios]))
+            # step 2.2: Pruning by current ratios
+            self._prune_parameters(
+                context.eval_graph,
+                context.scope,
+                sensitivities.keys(),
+                ratios,
+                context.place,
+                only_graph=True)
+
+            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
+                               model_size)
+            _logger.info('Pruned flops: {:.4f}'.format(pruned_flops))
+            _logger.info('Pruned model size: {:.4f}'.format(pruned_size))
+            for param in self.param_shape_backup.keys():
+                context.eval_graph.var(param).set_shape(self.param_shape_backup[
+                    param])
+            self.param_shape_backup = {}
+
+            # step 2.3: Check whether current ratios is enough
+            if abs(pruned_flops - target_ratio) < 0.015:
+                break
+            if pruned_flops > target_ratio:
+                max_loss = loss
+            else:
+                min_loss = loss
+        return sensitivities.keys(), ratios
+
+    def _current_pruning_target(self, context):
+        '''
+        Get the target pruning rate in current epoch.
+        '''
+        _logger.info('Left number of pruning steps: {}'.format(self.num_steps))
+        if self.num_steps <= 0:
+            return None
+        if (self.start_epoch == context.epoch_id) or context.eval_converged(
+                self.metric_name, 0.005):
+            self.num_steps -= 1
+            return self.pruning_step
+
+    def on_epoch_begin(self, context):
+        current_ratio = self._current_pruning_target(context)
+        if current_ratio is not None:
+            sensitivities = self._compute_sensitivities(context)
+            params, ratios = self._get_best_ratios(context, sensitivities,
+                                                   current_ratio)
+            self._prune_parameters(context.optimize_graph, context.scope,
+                                   params, ratios, context.place)
+
+            self.param_shape_backup = {}
+            self.backup = {}
+
+            model_size = context.eval_graph.numel_params()
+            flops = context.eval_graph.flops()
+            _logger.debug('################################')
+            _logger.debug('#          pruning eval graph    #')
+            _logger.debug('################################')
+            self._prune_graph(context.eval_graph, context.optimize_graph)
+            context.optimize_graph.update_groups_of_conv()
+            context.eval_graph.update_groups_of_conv()
+            context.optimize_graph.compile()  # to update the compiled program
+            context.eval_graph.compile(
+                for_parallel=False,
+                for_test=True)  # to update the compiled program
+            _logger.info(
+                '------------------finish pruning--------------------------------'
+            )
+            _logger.info('Pruned size: {:.3f}'.format(1 - (float(
+                context.eval_graph.numel_params()) / model_size)))
+            _logger.info('Pruned flops: {:.3f}'.format(1 - (float(
+                context.eval_graph.flops()) / flops)))
+            metric = self._eval_graph(context)
+            _logger.info('Metric after pruning: {:.2f}'.format(metric))
+            _logger.info(
+                '------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------'
+            )
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
index ca72bcb6f6..506b8fbe1d 100644
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import numpy as np
+import collections
 from .... import layers
 
-__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+__all__ = ['Pruner', 'StructurePruner']
 
 
 class Pruner(object):
@@ -30,54 +31,77 @@ class Pruner(object):
         pass
 
 
-class MagnitudePruner(Pruner):
+class StructurePruner(Pruner):
     """
-    Pruner used to pruning a parameter by threshold.
+    Pruner used to pruning parameters by groups.
     """
 
-    def __init__(self, threshold):
-        self.threshold = threshold
-
-    def prune(self, param, threshold=None):
-        if threshold is None:
-            thres = layers.fill_constant(
-                shape=[1], dtype='float32', value=self.threshold)
-        else:
-            thres = threshold
-        zeros_mask = layers.less_than(x=param, y=thres)
-        return zeros_mask
-
-
-class RatioPruner(Pruner):
-    """
-    Pruner used to pruning a parameter by ratio.
-    """
+    def __init__(self, pruning_axis, criterions):
+        """
+        Args:
+            pruning_axis(dict): The key is the name of parameter to be pruned,
+                                '*' means all the parameters.
+                                The value is the axis to be used. Given a parameter
+                                with shape [3, 4], the result of pruning 50% on aixs 1
+                                is a parameter with shape [3, 2].
+            criterions(dict): The key is the name of parameter to be pruned,
+                              '*' means all the parameters.
+                              The value is the criterion used to sort groups for pruning.
+                              It only supports 'l1_norm' currently.
+        """
+        self.pruning_axis = pruning_axis
+        self.criterions = criterions
 
-    def __init__(self, ratios=None):
+    def cal_pruned_idx(self, name, param, ratio, axis=None):
         """
+        Calculate the index to be pruned on axis by given pruning ratio.
         Args:
-            ratios: dict with pair (paramer_name, pruned_ratio). 
+            name(str): The name of parameter to be pruned.
+            param(np.array): The data of parameter to be pruned.
+            ratio(float): The ratio to be pruned.
+            axis(int): The axis to be used for pruning given parameter.
+                       If it is None, the value in self.pruning_axis will be used.
+                       default: None.
+        Returns:
+            list<int>: The indexes to be pruned on axis.
         """
-        self.ratios = ratios
+        criterion = self.criterions[
+            name] if name in self.criterions else self.criterions['*']
+        if axis is None:
+            assert self.pruning_axis is not None, "pruning_axis should set if axis is None."
+            axis = self.pruning_axis[
+                name] if name in self.pruning_axis else self.pruning_axis['*']
+        prune_num = int(round(param.shape[axis] * ratio))
+        reduce_dims = [i for i in range(len(param.shape)) if i != axis]
+        if criterion == 'l1_norm':
+            criterions = np.sum(np.abs(param), axis=tuple(reduce_dims))
+        pruned_idx = criterions.argsort()[:prune_num]
+        return pruned_idx
 
-    def prune(self, param, ratio=None):
+    def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
         """
+        Pruning a array by indexes on given axis.
         Args:
-            ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
+            tensor(numpy.array): The target array to be pruned.
+            pruned_idx(list<int>): The indexes to be pruned.
+            pruned_axis(int): The axis of given array to be pruned on. 
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means remove the pruned elements from memory.
+                        default: False.
+        Returns:
+            numpy.array: The pruned array.
         """
-        if ratio is None:
-            rat = self.ratios[
-                param.name] if param.name in self.ratios else self.ratios['*']
-        else:
-            rat = ratio
-        if rat < 1.0:
-            k = max(int(rat * np.prod(param.shape)), 1)
-            param_vec = layers.reshape(x=param, shape=[1, -1])
-            param_topk, _ = layers.topk(param_vec, k=k)
-            threshold = layers.slice(
-                param_topk, axes=[1], starts=[-1], ends=[k])
-            threshold = layers.reshape(x=threshold, shape=[1])
-            zeros_mask = layers.less_than(x=param, y=threshold)
+        mask = np.zeros(tensor.shape[pruned_axis], dtype=bool)
+        mask[pruned_idx] = True
+
+        def func(data):
+            return data[~mask]
+
+        def lazy_func(data):
+            data[mask] = 0
+            return data
+
+        if lazy:
+            return np.apply_along_axis(lazy_func, pruned_axis, tensor)
         else:
-            zeros_mask = layers.ones(param.shape)
-        return zeros_mask
+            return np.apply_along_axis(func, pruned_axis, tensor)
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
index 6c26475f48..1c51aa1537 100644
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -16,5 +16,7 @@ from __future__ import print_function
 
 from . import quantization_pass
 from .quantization_pass import *
+from . import quantization_strategy
+from .quantization_strategy import *
 
-__all__ = quantization_pass.__all__
+__all__ = quantization_pass.__all__ + quantization_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 622add4843..5dcef50671 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -22,6 +22,7 @@ from ....framework import IrGraph
 from ....framework import IrNode
 from ....framework import Program
 from ....initializer import Constant
+from ....initializer import NumpyArrayInitializer
 from .... import unique_name
 
 __all__ = [
@@ -38,7 +39,8 @@ class QuantizationTransformPass(object):
                  activation_bits=8,
                  activation_quantize_type='abs_max',
                  weight_quantize_type='abs_max',
-                 window_size=10000):
+                 window_size=10000,
+                 moving_rate=0.9):
         """
         Convert and rewrite the IrGraph according to weight and
         activation quantization type.
@@ -53,14 +55,15 @@ class QuantizationTransformPass(object):
                 the bias is not quantized.
             activation_bits (int): quantization bit number for activation.
             activation_quantize_type (str): quantization type for activation,
-                now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
-                the quantization scale will be calculated dynamically each step
-                in both training and testing period. If use 'range_abs_max',
-                a static quantization scale will be calculated during training
-                and used in inference.
+                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+                If use 'abs_max' mode, the quantization scale will be calculated
+                dynamically each step in both training and testing period. If use
+                'range_abs_max', a static quantization scale will be calculated
+                during training and used in inference.
             weight_quantize_type (str): quantization type for weights,
-                support 'abs_max'. The 'range_abs_max' usually is not used for
-                weight, since weights are fixed once the model is well trained.
+                support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
+                usually is not used for weight, since weights are fixed once the
+                model is well trained.
             window_size (int): the window size for 'range_abs_max' quantization.
 
         Examples:
@@ -83,22 +86,30 @@ class QuantizationTransformPass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
 
-        quant_type = ['abs_max', 'range_abs_max']
+        quant_type = [
+            'abs_max', 'channel_wise_abs_max', 'range_abs_max',
+            'moving_average_abs_max'
+        ]
+        assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown activation_quantize_type : '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
+                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
+                str(activation_quantize_type))
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
+                str(weight_quantize_type))
 
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
         self._window_size = window_size
+        self._moving_rate = moving_rate
 
         self._need_initialized = collections.OrderedDict()
         self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
@@ -131,10 +142,26 @@ class QuantizationTransformPass(object):
                     else self._activation_bits
                     quant_type = self._weight_quantize_type if var_node.name() \
                         in persistable_vars else self._activation_quantize_type
-                    quant_var_node, scale_var_node = self._insert_quant_op(
-                        graph, var_node, quant_bits, quant_type)
-                    dequant_var_node = self._insert_dequant_op(
-                        graph, quant_var_node, scale_var_node, quant_bits)
+                    if quant_type == 'channel_wise_abs_max':
+                        assert var_node.name(
+                        ) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights."
+                        if op.name() in self._conv_ops:
+                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
+                                graph, var_node, quant_bits)
+                            dequant_var_node = self._insert_channel_dequant_op(
+                                graph, quant_var_node, [scale_var_node],
+                                [quant_bits])
+                        else:
+                            quant_var_node, scale_var_node = self._insert_quant_op(
+                                graph, var_node, quant_bits, 'abs_max')
+                            dequant_var_node = self._insert_dequant_op(
+                                graph, quant_var_node, scale_var_node,
+                                quant_bits)
+                    else:
+                        quant_var_node, scale_var_node = self._insert_quant_op(
+                            graph, var_node, quant_bits, quant_type)
+                        dequant_var_node = self._insert_dequant_op(
+                            graph, quant_var_node, scale_var_node, quant_bits)
                     dequantized_vars[var_node.name()] = dequant_var_node
                 graph.update_input_link(var_node, dequant_var_node, op)
 
@@ -222,6 +249,9 @@ class QuantizationTransformPass(object):
         elif quant_type == 'range_abs_max':
             return self._insert_quant_range_abs_max_op(graph, var_node,
                                                        quant_bits)
+        elif quant_type == 'moving_average_abs_max':
+            return self._insert_quant_moving_average_abs_max_op(graph, var_node,
+                                                                quant_bits)
 
     def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
         """
@@ -237,7 +267,7 @@ class QuantizationTransformPass(object):
         scale_var_node = graph.create_var_node(
             name=self._quantized_scale_name(var_node.name()),
             var_type=var_node.type(),
-            shape=var_node.shape(),
+            shape=[1],
             var_dtype=var_node.dtype())
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
@@ -309,6 +339,104 @@ class QuantizationTransformPass(object):
 
         return quant_var_node, scale_out_node
 
+    def _insert_quant_moving_average_abs_max_op(self, graph, var_node,
+                                                quant_bits):
+        """Insert fake_quantize_moving_average_abs_max
+        """
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
+        scale_in_node = graph.create_persistable_node(
+            name=self._quantized_scale_name(var_node.name()),
+            var_type=core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            var_dtype=var_node.dtype())
+        self._need_initialized[scale_in_node.var()] = Constant(value=0.001)
+
+        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
+        ins = {'X': var_node, 'InScale': scale_in_node}
+        outs = {'Out': quant_var_node, 'OutScale': scale_out_node}
+        if not self._is_test:
+            state_in_node = graph.create_persistable_node(
+                name=unique_name.generate('state'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            self._need_initialized[state_in_node.var()] = Constant(value=1)
+            accum_in_node = graph.create_persistable_node(
+                name=unique_name.generate('accum'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            self._need_initialized[accum_in_node.var()] = Constant(value=1)
+            state_out_node = graph.create_var_node_from_desc(state_in_node.var(
+            ))
+            accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
+            ))
+
+            ins['InState'] = state_in_node
+            ins['InAccum'] = accum_in_node
+            outs['OutState'] = state_out_node
+            outs['OutAccum'] = accum_out_node
+
+        attrs = {
+            'bit_length': quant_bits,
+            'moving_rate': self._moving_rate,
+            'is_test': self._is_test,
+            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+        }
+
+        quant_op_node = graph.create_op_node(
+            op_type='fake_quantize_moving_average_abs_max',
+            attrs=attrs,
+            inputs=ins,
+            outputs=outs)
+
+        graph.link_to(var_node, quant_op_node)
+        graph.link_to(scale_in_node, quant_op_node)
+        graph.link_to(quant_op_node, quant_var_node)
+        graph.link_to(quant_op_node, scale_out_node)
+
+        if not self._is_test:
+            graph.link_to(state_in_node, quant_op_node)
+            graph.link_to(accum_in_node, quant_op_node)
+            graph.link_to(quant_op_node, state_out_node)
+            graph.link_to(quant_op_node, accum_out_node)
+
+        return quant_var_node, scale_out_node
+
+    def _insert_channel_quant_op(self, graph, var_node, quant_bits):
+        """
+        Insert fake_channel_wise_quantize_abs_max op in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
+        scale_var_node = graph.create_var_node(
+            name=self._quantized_scale_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=[var_node.shape()[0]],
+            var_dtype=var_node.dtype())
+        quant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_quantize_abs_max',
+            attrs={
+                'bit_length': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={'X': var_node},
+            outputs={'Out': quant_var_node,
+                     'OutScale': scale_var_node})
+        graph.link_to(var_node, quant_op_node)
+        graph.link_to(quant_op_node, quant_var_node)
+        graph.link_to(quant_op_node, scale_var_node)
+        return quant_var_node, scale_var_node
+
     def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
         """
         Insert fake_dequantize_op in the graph.
@@ -335,6 +463,33 @@ class QuantizationTransformPass(object):
         graph.link_to(dequant_op_node, dequant_var_node)
         return dequant_var_node
 
+    def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
+                                   quant_bits):
+        """
+        Insert fake_channel_wise_dequantize_max_abs in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_dequantize_max_abs',
+            attrs={
+                'quant_bits': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={'X': var_node,
+                    'Scales': scale_var_nodes},
+            outputs={'Out': dequant_var_node})
+        graph.link_to(var_node, dequant_op_node)
+        for scale_n in scale_var_nodes:
+            graph.link_to(scale_n, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        return dequant_var_node
+
     def _quantized_var_name(self, var_name):
         """
         Return quantized variable name for the input `var_name`.
@@ -367,7 +522,7 @@ class QuantizationFreezePass(object):
         place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
         weight_bits (int): quantization bit number for weights.
         activation_bits (int): quantization bit number for activation.
-        weight_quantize_type (str): quantization type for weights, support 'abs_max'.
+        weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
         The 'range_abs_max' usually is not used for weight, since weights are fixed once the
         model is well trained.
     """
@@ -388,10 +543,15 @@ class QuantizationFreezePass(object):
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
         self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._fake_quant_op_names = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
-        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
         self._op_input_rename_map = collections.OrderedDict()
         self._op_output_rename_map = collections.OrderedDict()
         self._var_scale_map = collections.OrderedDict()
@@ -413,20 +573,27 @@ class QuantizationFreezePass(object):
                     if self._weight_quantize_type == 'abs_max':
                         param = self._load_var(input_arg_name)
                         scale_v = np.max(np.abs(param))
+                    elif self._weight_quantize_type == 'channel_wise_abs_max':
+                        param = self._load_var(input_arg_name)
+                        if len(param.shape) == 4:  # conv2d or depthwise_conv2d
+                            scale_v = []
+                            for i in range(param.shape[0]):
+                                scale_v.append(np.max(np.abs(param[i])))
+                        else:
+                            scale_v = np.max(np.abs(param))
                     else:
                         scale_v = self._load_var(
                             op_node.output('OutScale')[0])[0]
                     self._var_scale_map[input_arg_name] = scale_v
-                else:
-                    scale_v = graph.var_node(op_node.output('OutScale')[0])
-                    self._var_scale_map[input_arg_name] = scale_v
-                if input_arg_name in persistable_vars:
                     self._remove_fake_quant_and_dequant_op(graph, op_node)
                     # quantize weight and restore
                     param_v = self._load_var(input_arg_name)
                     quantized_param_v = self._quant(param_v, scale_v,
                                                     self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
+                else:
+                    scale_v = graph.var_node(op_node.output('OutScale')[0])
+                    self._var_scale_map[input_arg_name] = scale_v
 
         ops = graph.all_op_nodes()
         for op_node in ops:
@@ -438,7 +605,10 @@ class QuantizationFreezePass(object):
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
-                self._insert_post_dequant_op(graph, op_node)
+                if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
+                    self._insert_post_channel_dequant_op(graph, op_node)
+                else:
+                    self._insert_post_dequant_op(graph, op_node)
 
         for op_node in ops:
             # insert dequant_op after fc/conv, need to rename inputs of the followed ops
@@ -462,9 +632,73 @@ class QuantizationFreezePass(object):
             self._op_input_rename_map[k] = self._op_input_rename_map[v]
         graph.safe_remove_nodes(op_node)
 
+    def _insert_post_channel_dequant_op(self, graph, op_node):
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
+        for var_node in op_node.inputs:
+            name = var_node.name()
+            if name in self._op_input_rename_map:
+                old_in = graph.var_node(name)
+                new_in = graph.var_node(self._op_input_rename_map[name])
+                new_in.clear_outputs()
+                graph.update_input_link(old_in, new_in, op_node)
+            original_var_name = self._original_var_name(name)
+            scale_v = self._var_scale_map[original_var_name]
+            if original_var_name in persistable_vars:
+                assert isinstance(
+                    scale_v,
+                    list), 'The scale of parameter %s is not a list.' % (
+                        original_var_name)
+                channel_scale = np.array(scale_v)
+            else:
+                assert isinstance(scale_v, IrNode)
+                scale_var_node = self._var_scale_map[original_var_name]
+
+        if len(op_node.outputs) != 1:
+            raise ValueError("Only support one output, but op %s has"
+                             " more than one output." % (op_node.name()))
+
+        output_var_node = op_node.outputs[0]
+        weight_scale_node = graph.create_persistable_node(
+            name=unique_name.generate('channel_scale'),
+            var_type=core.VarDesc.VarType.LOD_TENSOR,
+            shape=[channel_scale.shape[0]],
+            var_dtype=output_var_node.dtype())
+        init_program = Program()
+        weight_scale_var = init_program.global_block().create_var(
+            name=weight_scale_node.name(),
+            shape=weight_scale_node.shape(),
+            dtype=weight_scale_node.dtype(),
+            type=weight_scale_node.type(),
+            lod_level=weight_scale_node.var().lod_level(),
+            persistable=weight_scale_node.persistable())
+        initializer = NumpyArrayInitializer(value=channel_scale)
+        initializer(weight_scale_var, init_program.global_block())
+        exe = Executor(self._place)
+        exe.run(program=init_program, scope=self._scope)
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(output_var_node.name()),
+            var_type=output_var_node.type(),
+            shape=output_var_node.shape(),
+            var_dtype=output_var_node.dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_dequantize_max_abs',
+            attrs={
+                'quant_bits': [self._weight_bits, self._activation_bits],
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={
+                'X': output_var_node,
+                'Scales': [weight_scale_node, scale_var_node]
+            },
+            outputs={'Out': dequant_var_node})
+        graph.link_to(output_var_node, dequant_op_node)
+        graph.link_to(scale_var_node, dequant_op_node)
+        graph.link_to(weight_scale_node, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        self._op_output_rename_map[output_var_node.name()] = dequant_var_node
+        return dequant_var_node
+
     def _insert_post_dequant_op(self, graph, op_node):
-        max_range = None
-        scale_var_node = None
         persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
         for var_node in op_node.inputs:
             name = var_node.name()
@@ -561,7 +795,12 @@ class QuantizationFreezePass(object):
             or isinstance(v, np.float64)
 
     def _quant(self, x, scale, num_bits):
-        return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+        if isinstance(scale, list):
+            for i, s in enumerate(scale):
+                x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+            return x
+        else:
+            return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
 
 
 class ConvertToInt8Pass(object):
@@ -655,9 +894,13 @@ class TransformForMobilePass(object):
 
     def __init__(self):
         self._fake_quant_op_names = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
-        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
 
     def apply(self, graph):
         """
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
new file mode 100644
index 0000000000..6812b4c633
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import sys
+import numpy as np
+from .... import Executor
+from .... import io
+from .... import core
+from ....compiler import CompiledProgram
+from ....compiler import BuildStrategy
+from ....framework import IrGraph
+from ..core.strategy import Strategy
+from .quantization_pass import *
+
+__all__ = ['QuantizationStrategy']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class QuantizationStrategy(Strategy):
+    """
+    The strategy for Quantization.
+    """
+
+    def __init__(self,
+                 start_epoch=0,
+                 end_epoch=0,
+                 float_model_save_path=None,
+                 mobile_model_save_path=None,
+                 int8_model_save_path=None,
+                 activation_bits=8,
+                 weight_bits=8,
+                 activation_quantize_type='abs_max',
+                 save_in_nodes=None,
+                 save_out_nodes=None):
+        """
+        Args:
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            float_model_save_path(str): The path to save model with float weights. 
+                            None means it doesn't save float model. defalut: None.
+            mobile_model_save_path(str): The path to save model for paddle-mobile execution.
+                            None means it doesn't save mobile model. defalut: None.
+            int8_model_save_path(str): The path to save model with int8_t weight.
+                            None means it doesn't save int8 model. defalut: None.
+            activation_bits(int): quantization bit number for activation. default: 8.
+            weight_bits(int): quantization bit number for weights. The bias is not quantized.
+                              default: 8.
+            activation_quantize_type(str): quantization type for activation,
+                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+                If use 'abs_max' mode, the quantization scale will be calculated
+                dynamically each step in both training and testing period. If use
+                'range_abs_max', a static quantization scale will be calculated
+                during training and used in inference.
+            save_in_nodes(list<str>): A list of variable names used to prune graph 
+                                      for saving inference model.
+            save_out_nodes(list<str>): A list of variable names used to prune graph 
+                                      for saving inference model.
+
+        """
+        super(QuantizationStrategy, self).__init__(start_epoch, end_epoch)
+        self.start_epoch = start_epoch
+        self.end_epoch = end_epoch
+        self.float_model_save_path = float_model_save_path
+        self.mobile_model_save_path = mobile_model_save_path
+        self.int8_model_save_path = int8_model_save_path
+        self.activation_bits = activation_bits
+        self.weight_bits = weight_bits
+        self.activation_quantize_type = activation_quantize_type
+        self.save_out_nodes = save_out_nodes
+        self.save_in_nodes = save_in_nodes
+
+    def on_epoch_begin(self, context):
+        """
+        Insert fake_quantize_op and fake_dequantize_op before trainging and testing.
+        """
+        super(QuantizationStrategy, self).on_compression_begin(context)
+        if self.start_epoch == context.epoch_id:
+            _logger.info('QuantizationStrategy::on_epoch_begin')
+            train_ir_graph = IrGraph(
+                core.Graph(context.optimize_graph.program.desc), for_test=False)
+            test_ir_graph = IrGraph(
+                core.Graph(context.eval_graph.program.desc), for_test=True)
+            transform_pass = QuantizationTransformPass(
+                scope=context.scope,
+                place=context.place,
+                weight_bits=self.weight_bits,
+                activation_bits=self.activation_bits,
+                activation_quantize_type=self.activation_quantize_type)
+            transform_pass.apply(train_ir_graph)
+            transform_pass.apply(test_ir_graph)
+
+            build_strategy = BuildStrategy()
+            build_strategy.enable_inplace = False
+            build_strategy.memory_optimize = False
+            # for quantization training
+            context.optimize_graph.compiled_graph = CompiledProgram(
+                train_ir_graph.graph).with_data_parallel(
+                    loss_name=context.optimize_graph.out_nodes['loss'],
+                    build_strategy=build_strategy)
+            # for evaluation. And program compiled from ir graph must be with data parallel.
+            context.eval_graph.compiled_graph = CompiledProgram(
+                test_ir_graph.graph).with_data_parallel(
+                    build_strategy=build_strategy)
+            # for saving inference model after training
+            context.put('quantization_test_ir_graph_backup', test_ir_graph)
+            _logger.info('Finish QuantizationStrategy::on_epoch_begin')
+
+    def on_epoch_end(self, context):
+        """
+        Free and save inference model.
+        """
+        super(QuantizationStrategy, self).on_compression_end(context)
+
+        if context.epoch_id == self.end_epoch:
+            _logger.info('QuantizationStrategy::on_epoch_end')
+            test_ir_graph = context.get('quantization_test_ir_graph_backup')
+            # freeze the graph after training
+            freeze_pass = QuantizationFreezePass(
+                scope=context.scope,
+                place=context.place,
+                weight_bits=self.weight_bits,
+                activation_bits=self.activation_bits)
+            freeze_pass.apply(test_ir_graph)
+
+            # for other strategies
+            context.eval_graph.program = test_ir_graph.to_program()
+
+            if self.save_out_nodes == None:
+                out_vars = [
+                    context.eval_graph.var(var_name)._var
+                    for var_name in context.eval_graph.out_nodes.values()
+                ]
+            else:
+                out_vars = [
+                    context.eval_graph.var(var_name)._var
+                    for var_name in self.save_out_nodes
+                ]
+
+            if self.save_in_nodes == None:
+                in_vars = list(context.eval_graph.out_nodes.values())
+            else:
+                in_vars = self.save_in_nodes
+
+            # save float model
+            if self.float_model_save_path:
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.float_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+
+            # save int8 model
+            if self.int8_model_save_path:
+                convert_int8_pass = ConvertToInt8Pass(
+                    scope=context.scope, place=context.place)
+                convert_int8_pass.apply(test_ir_graph)
+
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.int8_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+
+            # save mobile model
+            if self.mobile_model_save_path:
+                if not self.int8_model_save_path:
+                    # convert the weights as int8_t type
+                    convert_int8_pass = ConvertToInt8Pass(
+                        scope=context.scope, place=context.place)
+                    convert_int8_pass.apply(test_ir_graph)
+                # make some changes on the graph for the mobile inference
+                mobile_pass = TransformForMobilePass()
+                mobile_pass.apply(test_ir_graph)
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.mobile_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+            _logger.info('Finish QuantizationStrategy::on_epoch_end')
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
deleted file mode 100644
index d9b49029d3..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-version: 1.0
-include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_2'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
new file mode 100644
index 0000000000..570c60026d
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 0
+        delta_rate: 0.1
+        target_ratio: 0.3
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: '.*_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 120
+    checkpoint_path: './checkpoints/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
deleted file mode 100644
index 235092c595..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_2:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
deleted file mode 100644
index cd2ef9eb56..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_3:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
new file mode 100644
index 0000000000..ef89dfb780
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
@@ -0,0 +1,46 @@
+#start_epoch(int): The epoch when to merge student graph and teacher graph for
+#                  distillation training. default: 0
+#
+#end_epoch(int): The epoch when to finish distillation training. default: 0
+#
+#student_feature_map(str): The name of feature map from student network.
+#
+#teacher_feature_map(str): The name of feature map from teacher network.
+#                          It's shape should be the same with student network.
+#
+#student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+#                            a section in student network. The variables in a tuple should
+#                            have the same feature map size.
+#
+#teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+#                            a section in teacher network. The variables in a tuple should
+#                            have the same feature map size. Varibale named teacher_pairs[i][j]
+#                            should has the save channel number with that of variable named 
+#                            student_pairs[i][j].
+#
+#distillation_loss_weight(float): The weight of the loss.
+version: 1.0
+distillers:
+    fsp_distiller:
+        class: 'FSPDistiller'
+#        teacher_pairs: [['teacher_depthwise_conv2d_1.tmp_0', 'teacher_conv2d_3.tmp_0']]
+#        student_pairs: [['student_depthwise_conv2d_1.tmp_0', 'student_conv2d_3.tmp_0']]
+        teacher_pairs: [['teacher_conv2_1_dw.tmp_0', 'teacher_conv1.tmp_0']]
+        student_pairs: [['student_conv2_1_dw.tmp_0', 'student_conv1.tmp_0']]
+        distillation_loss_weight: 1
+    l2_distiller:
+        class: 'L2Distiller'
+        teacher_feature_map: 'teacher.tmp_2'
+        student_feature_map: 'student.tmp_2'
+        distillation_loss_weight: 1
+strategies:
+    distillation_strategy:
+        class: 'DistillationStrategy'
+        distillers: ['fsp_distiller', 'l2_distiller']
+        start_epoch: 0
+        end_epoch: 1
+compressor:
+    epoch: 1
+    checkpoint_path: './distillation_checkpoints/'
+    strategies:
+        - distillation_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
new file mode 100644
index 0000000000..5f747a049e
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 1
+        delta_rate: 0.2
+        target_ratio: 0.08
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: 'conv6_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 2
+    checkpoint_path: './checkpoints_pruning/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/mobilenet.py b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
new file mode 100644
index 0000000000..f5dbef17e8
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
@@ -0,0 +1,215 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = ['MobileNet']
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class MobileNet():
+    def __init__(self, name=""):
+        self.params = train_parameters
+        self.name = name
+
+    def net(self, input, class_dim=1000, scale=1.0):
+        # conv1: 112x112
+        input = self.conv_bn_layer(
+            input,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1,
+            name=self.name + "_conv1")
+
+        # 56x56
+        input = self.depthwise_separable(
+            input,
+            num_filters1=32,
+            num_filters2=64,
+            num_groups=32,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv2_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=64,
+            num_filters2=128,
+            num_groups=64,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv2_2")
+
+        # 28x28
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=128,
+            num_groups=128,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv3_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=256,
+            num_groups=128,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv3_2")
+
+        # 14x14
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=256,
+            num_groups=256,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv4_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=512,
+            num_groups=256,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv4_2")
+
+        # 14x14
+        for i in range(5):
+            input = self.depthwise_separable(
+                input,
+                num_filters1=512,
+                num_filters2=512,
+                num_groups=512,
+                stride=1,
+                scale=scale,
+                name=self.name + "_conv5" + "_" + str(i + 1))
+        # 7x7
+        input = self.depthwise_separable(
+            input,
+            num_filters1=512,
+            num_filters2=1024,
+            num_groups=512,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv5_6")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=1024,
+            num_filters2=1024,
+            num_groups=1024,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv6")
+
+        input = fluid.layers.pool2d(
+            input=input,
+            pool_size=0,
+            pool_stride=1,
+            pool_type='avg',
+            global_pooling=True)
+
+        output = fluid.layers.fc(
+            input=input,
+            size=class_dim,
+            act='softmax',
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.name + "_fc7_weights"),
+            bias_attr=ParamAttr(name=self.name + "_fc7_offset"),
+            name=self.name)
+        return output
+
+    def conv_bn_layer(self,
+                      input,
+                      filter_size,
+                      num_filters,
+                      stride,
+                      padding,
+                      channels=None,
+                      num_groups=1,
+                      act='relu',
+                      use_cudnn=True,
+                      name=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=name + "_weights"),
+            name=name,
+            bias_attr=False)
+        bn_name = name + "_bn"
+        return fluid.layers.batch_norm(
+            input=conv,
+            act=act,
+            name=name,
+            param_attr=ParamAttr(name=bn_name + "_scale"),
+            bias_attr=ParamAttr(name=bn_name + "_offset"),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def depthwise_separable(self,
+                            input,
+                            num_filters1,
+                            num_filters2,
+                            num_groups,
+                            stride,
+                            scale,
+                            name=None):
+        depthwise_conv = self.conv_bn_layer(
+            input=input,
+            filter_size=3,
+            num_filters=int(num_filters1 * scale),
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False,
+            name=name + "_dw")
+
+        pointwise_conv = self.conv_bn_layer(
+            input=depthwise_conv,
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0,
+            name=name + "_sep")
+        return pointwise_conv
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
new file mode 100644
index 0000000000..f29eb53f88
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
@@ -0,0 +1,48 @@
+#start_epoch(int): The epoch to insert quantization operators. default: 0
+#
+#end_epoch(int): The epoch to save inferecne model. default: 0
+#
+#float_model_save_path(str): The path to save model with float weights.
+#                None means it doesn't save float model. defalut: None.
+#
+#mobile_model_save_path(str): The path to save model for paddle-mobile execution.
+#                None means it doesn't save mobile model. defalut: None.
+#
+#int8_model_save_path(str): The path to save model with int8_t weight.
+#                None means it doesn't save int8 model. defalut: None.
+#
+#activation_bits(int): quantization bit number for activation. default: 8.
+#
+#weight_bits(int): quantization bit number for weights. The bias is not quantized.
+#                  default: 8.
+#
+#activation_quantize_type(str): quantization type for activation,
+#    now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+#    If use 'abs_max' mode, the quantization scale will be calculated
+#    dynamically each step in both training and testing period. If use
+#    'range_abs_max', a static quantization scale will be calculated
+#    during training and used in inference.
+#
+#save_in_nodes(list<str>): A list of variable names used to prune graph
+#                          for saving inference model.
+#
+#save_out_nodes(list<str>): A list of variable names used to prune graph
+#                                      for saving inference model.
+version: 1.0
+strategies:
+    quantization_strategy:
+        class: 'QuantizationStrategy'
+        start_epoch: 0
+        end_epoch: 0
+        float_model_save_path: './output/float'
+        weight_bits: 8
+        activation_bits: 8
+        weight_quantize_type: 'abs_max'
+        activation_quantize_type: 'abs_max'
+        save_in_nodes: ['image']
+        save_out_nodes: ['quan.tmp_2']
+compressor:
+    epoch: 1
+    checkpoint_path: './checkpoints_quan/'
+    strategies:
+        - quantization_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
new file mode 100644
index 0000000000..9b967c0ac7
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
@@ -0,0 +1,94 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestDistillationStrategy(unittest.TestCase):
+    """
+    Test API of distillation strategy.
+    """
+
+    def test_compression(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet(name="student").net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        # define teacher program
+        teacher_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(teacher_program, startup_program):
+            img = teacher_program.global_block()._clone_variable(
+                image, force_persistable=False)
+            predict = MobileNet(name="teacher").net(input=img,
+                                                    class_dim=class_dim)
+
+        exe.run(startup_program)
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            teacher_programs=[teacher_program.clone(for_test=True)],
+            train_optimizer=optimizer,
+            distiller_optimizer=optimizer)
+        com_pass.config('./distillation/compress.yaml')
+        eval_graph = com_pass.run()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
index 2fc72b6475..90eb8bd4b3 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
@@ -12,29 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.contrib.slim import ConfigFactory
+from paddle.fluid.contrib.slim.core import ConfigFactory
 import unittest
 
 
 class TestFactory(unittest.TestCase):
-    def test_parse(self):
-        factory = ConfigFactory('./configs/config.yaml')
+    def test_parse_pruning(self):
+        factory = ConfigFactory('./configs/filter_pruning.yaml')
 
-        pruner = factory.instance('pruner_1')
-        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+        pruner_1 = factory.instance('pruner_1')
+        self.assertEquals(pruner_1.pruning_axis['*'], 0)
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        pruner = factory.instance('pruner_2')
-        self.assertEquals(pruner.ratios['*'], 0.7)
+        strategy = factory.instance('sensitive_pruning_strategy')
+        pruner_1 = strategy.pruner
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        strategy = factory.instance('strategy_1')
-        pruner = strategy.pruner
-        self.assertEquals(pruner.ratios['*'], 0.7)
-
-        compress_pass = factory.get_compress_pass()
-        self.assertEquals(compress_pass.epoch, 100)
-
-        strategy = compress_pass.strategies[0]
-        self.assertEquals(strategy.delta_rate, 0.2)
+        self.assertEquals(strategy.start_epoch, 0)
+        self.assertEquals(strategy.sensitivities_file,
+                          'mobilenet_acc_top1_sensitive.data')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
new file mode 100644
index 0000000000..e1763039b3
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
@@ -0,0 +1,89 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestFilterPruning(unittest.TestCase):
+    def test_compression(self):
+        """
+        Model: mobilenet_v1
+        data: mnist
+        step1: Training one epoch
+        step2: pruning flops
+        step3: fine-tune one epoch
+        step4: check top1_acc.
+        """
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet().net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            train_optimizer=optimizer)
+        com_pass.config('./filter_pruning/compress.yaml')
+        eval_graph = com_pass.run()
+        self.assertTrue(
+            abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969)
+            < 0.02)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
index 2d2f1384de..3629fed160 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -13,58 +13,92 @@
 # limitations under the license.
 
 from __future__ import print_function
+import os
+import six
 import unittest
+import paddle
 import paddle.fluid as fluid
-import six
 from paddle.fluid.framework import IrGraph
 from paddle.fluid import core
 
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CPU_NUM"] = "1"
 
-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
 
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+def conv_block():
+    img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10)
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return [img, label], avg_loss
 
 
 class TestGraph(unittest.TestCase):
-    def test_graph_functions(self):
+    def graph_apis(self, use_cuda=False, for_ci=True):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
-            loss = residual_block(2)
+            feeds, loss = conv_block()
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
+        backup_graph = graph.clone()
+        self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.memory_optimize = False
+        build_strategy.enable_inplace = False
+        origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+        backup_binary = fluid.CompiledProgram(
+            backup_graph.graph).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        iters = 5
+        batch_size = 8
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
+
+        def train(binary):
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(binary,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss.name])
+                print('{}: {}'.format('loss', loss_v))
+
+        train(origin_binary)
+        train(backup_binary)
+
         marked_nodes = set()
         for op in graph.all_op_nodes():
             if op.name().find('conv2d') > -1:
                 marked_nodes.add(op)
-        graph.draw('.', 'residual', marked_nodes)
+        if not for_ci:
+            graph.draw('.', 'residual', marked_nodes)
+            backup_marked_nodes = set()
+            for op in backup_graph.all_op_nodes():
+                if op.name().find('conv2d') > -1:
+                    backup_marked_nodes.add(op)
+            backup_graph.draw('.', 'backup', backup_marked_nodes)
         self.assertFalse(graph.has_circle())
         self.assertEqual(graph.graph_num(), 1)
         nodes = graph.topology_sort()
@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase):
         graph.safe_remove_nodes(marked_nodes)
         self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
 
+    def test_graph_apis_cpu(self):
+        self.graph_apis(use_cuda=False, for_ci=True)
+
+    def test_graph_apis_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.graph_apis(use_cuda=True, for_ci=True)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
new file mode 100644
index 0000000000..ad82aa9411
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
@@ -0,0 +1,140 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+import unittest
+import paddle.fluid as fluid
+import six
+import numpy as np
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+from paddle.fluid import core
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    data.stop_gradinet = False
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return data, label, loss
+
+
+class TestGraphWrapper(unittest.TestCase):
+    def build_program(self):
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            image, label, self.loss = residual_block(2)
+            eval_program = main.clone()
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(self.loss)
+        self.scope = core.Scope()
+        exe = fluid.Executor(place)
+        exe.run(startup, scope=self.scope)
+        self.eval_graph = GraphWrapper(
+            program=eval_program,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+        self.train_graph = GraphWrapper(
+            program=main,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+
+    def test_all_parameters(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.all_parameters()), 24)
+
+    def test_all_vars(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.vars()), 90)
+
+    def test_numel_params(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.numel_params(), 13258)
+
+    def test_compile(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        self.train_graph.compile()
+        exe.run(self.train_graph.compiled_graph,
+                scope=self.scope,
+                feed={
+                    'image':
+                    np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'),
+                    'label': np.random.randint(0, 10, [16, 1]).astype('int64')
+                })
+
+    def test_pre_and_next_ops(self):
+        self.build_program()
+        for op in self.train_graph.ops():
+            for next_op in self.train_graph.next_ops(op):
+                self.assertTrue(op in self.train_graph.pre_ops(next_op))
+
+    def test_get_optimize_graph(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        opt = fluid.optimizer.SGD(learning_rate=0.001)
+        train_graph = self.eval_graph.get_optimize_graph(
+            opt, place, self.scope, no_grad_var_names=['image'])
+        self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops()))
+        exe = fluid.Executor(place)
+        train_graph.compile()
+        image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32')
+        label = np.random.randint(0, 10, [16, 1]).astype('int64')
+        exe.run(train_graph.compiled_graph,
+                scope=self.scope,
+                feed={'image': image,
+                      'label': label})
+
+    def test_flops(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.flops(), 354624)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 3b82380f94..c7feca0b82 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -12,6 +12,7 @@
 # see the license for the specific language governing permissions and
 # limitations under the license.
 
+import os
 import unittest
 import random
 import numpy as np
@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
 from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
 from paddle.fluid import core
 
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CPU_NUM"] = "1"
+
 
 def linear_fc(num):
     data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
@@ -123,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
                             arg_name.endswith('.quantized.dequantized'))
                         self.assertTrue(arg_name in quantized_ops)
 
-    def linear_fc_quant(self, quant_type, for_ci=False):
+    def linear_fc_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -136,14 +140,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             place=place,
-            activation_quantize_type=quant_type)
+            activation_quantize_type=activation_quant_type)
         transform_pass.apply(graph)
         if not for_ci:
             marked_nodes = set()
             for op in graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
+            graph.draw('.', 'quantize_fc_' + activation_quant_type,
+                       marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
@@ -152,7 +157,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
             for op in val_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
+            val_graph.draw('.', 'val_fc_' + activation_quant_type,
+                           val_marked_nodes)
 
     def test_linear_fc_quant_abs_max(self):
         self.linear_fc_quant('abs_max', for_ci=True)
@@ -160,7 +166,10 @@ class TestQuantizationTransformPass(unittest.TestCase):
     def test_linear_fc_quant_range_abs_max(self):
         self.linear_fc_quant('range_abs_max', for_ci=True)
 
-    def residual_block_quant(self, quant_type, for_ci=False):
+    def test_linear_fc_quant_moving_average_abs_max(self):
+        self.linear_fc_quant('moving_average_abs_max', for_ci=True)
+
+    def residual_block_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -173,14 +182,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             place=place,
-            activation_quantize_type=quant_type)
+            activation_quantize_type=activation_quant_type)
         transform_pass.apply(graph)
         if not for_ci:
             marked_nodes = set()
             for op in graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
+            graph.draw('.', 'quantize_residual_' + activation_quant_type,
+                       marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
@@ -189,7 +199,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
             for op in val_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
+            val_graph.draw('.', 'val_residual_' + activation_quant_type,
+                           val_marked_nodes)
 
     def test_residual_block_abs_max(self):
         self.residual_block_quant('abs_max', for_ci=True)
@@ -197,9 +208,17 @@ class TestQuantizationTransformPass(unittest.TestCase):
     def test_residual_block_range_abs_max(self):
         self.residual_block_quant('range_abs_max', for_ci=True)
 
+    def test_residual_block_moving_average_abs_max(self):
+        self.residual_block_quant('moving_average_abs_max', for_ci=True)
+
 
 class TestQuantizationFreezePass(unittest.TestCase):
-    def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False):
+    def freeze_graph(self,
+                     use_cuda,
+                     seed,
+                     activation_quant_type,
+                     weight_quant_type='abs_max',
+                     for_ci=False):
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
@@ -233,7 +252,12 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             exe.run(startup)
         transform_pass = QuantizationTransformPass(
-            scope=scope, place=place, activation_quantize_type=quant_type)
+            scope=scope,
+            place=place,
+            activation_quantize_type=activation_quant_type,
+            weight_quantize_type=weight_quant_type)
+        #transform_pass = QuantizationTransformPass(
+        #    scope=scope, place=place, activation_quantize_type=activation_quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
@@ -242,14 +266,20 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in main_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
+                            + weight_quant_type, marked_nodes)
             marked_nodes = set()
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
-
-        quantized_main_program = main_graph.to_program()
+            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
+                            + weight_quant_type, marked_nodes)
+
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.memory_optimize = False
+        build_strategy.enable_inplace = False
+        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
         quantized_test_program = test_graph.to_program()
         iters = 5
         batch_size = 8
@@ -264,12 +294,13 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(program=quantized_main_program,
+                loss_v = exe.run(binary,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
                 if not for_ci:
-                    print('{}: {}'.format('loss' + dev_name + quant_type,
-                                          loss_v))
+                    print('{}: {}'.format('loss' + dev_name +
+                                          activation_quant_type + '_' +
+                                          weight_quant_type, loss_v))
 
         test_data = next(test_reader())
         with fluid.program_guard(quantized_test_program):
@@ -282,14 +313,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                           fetch_list=[loss, w_var])
 
         # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass = QuantizationFreezePass(
+            scope=scope, place=place, weight_quantize_type=weight_quant_type)
+        #freeze_pass = QuantizationFreezePass(scope=scope, place=place)
         freeze_pass.apply(test_graph)
         if not for_ci:
             marked_nodes = set()
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+            test_graph.draw('.', 'test_freeze' + dev_name +
+                            activation_quant_type + '_' + weight_quant_type,
                             marked_nodes)
 
         server_program = test_graph.to_program()
@@ -299,18 +333,20 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                   fetch_list=[loss])
         self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
         if not for_ci:
-            print('{}: {}'.format('test_loss1' + dev_name + quant_type,
-                                  test_loss1))
-            print('{}: {}'.format('test_loss2' + dev_name + quant_type,
-                                  test_loss2))
+            print(
+                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
+                                + '_' + weight_quant_type, test_loss1))
+            print(
+                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
+                                + '_' + weight_quant_type, test_loss2))
         w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
         # Maybe failed, this is due to the calculation precision
         # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
         if not for_ci:
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
-            print('{}: {}'.format('w_quant' + dev_name + quant_type,
-                                  np.sum(w_quant)))
+            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
+                                  + '_' + weight_quant_type, np.sum(w_freeze)))
+            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
+                                  '_' + weight_quant_type, np.sum(w_quant)))
 
         # Convert parameter to 8-bit.
         convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
@@ -320,26 +356,28 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_int8' + dev_name + quant_type,
-                            marked_nodes)
+            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
+                            + '_' + weight_quant_type, marked_nodes)
         server_program_int8 = test_graph.to_program()
         # Save the 8-bit parameter and model file.
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
-                                          ['image', 'label'], [loss], exe,
-                                          server_program_int8)
+            fluid.io.save_inference_model(
+                'server_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, ['image', 'label'], [loss], exe,
+                server_program_int8)
             # Test whether the 8-bit parameter and model file can be loaded successfully.
             [infer, feed, fetch] = fluid.io.load_inference_model(
-                'server_int8' + dev_name + quant_type, exe)
+                'server_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, exe)
         # Check the loaded 8-bit weight.
         w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
         self.assertEqual(w_8bit.dtype, np.int8)
         self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
         if not for_ci:
-            print('{}: {}'.format('w_8bit' + dev_name + quant_type,
-                                  np.sum(w_8bit)))
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
+            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
+                                  '_' + weight_quant_type, np.sum(w_8bit)))
+            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
+                                  + '_' + weight_quant_type, np.sum(w_freeze)))
 
         mobile_pass = TransformForMobilePass()
         mobile_pass.apply(test_graph)
@@ -348,35 +386,103 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
+            test_graph.draw('.', 'test_mobile' + dev_name +
+                            activation_quant_type + '_' + weight_quant_type,
                             marked_nodes)
 
         mobile_program = test_graph.to_program()
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
-                                          ['image', 'label'], [loss], exe,
-                                          mobile_program)
+            fluid.io.save_inference_model(
+                'mobile_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, ['image', 'label'], [loss], exe,
+                mobile_program)
 
     def test_freeze_graph_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='abs_max', for_ci=True)
+                    True,
+                    seed=1,
+                    activation_quant_type='abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
+            with fluid.unique_name.guard():
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='abs_max',
+                    weight_quant_type='channel_wise_abs_max',
+                    for_ci=True)
 
     def test_freeze_graph_cpu_dynamic(self):
         with fluid.unique_name.guard():
-            self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
 
     def test_freeze_graph_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='range_abs_max', for_ci=True)
+                    True,
+                    seed=1,
+                    activation_quant_type='range_abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='moving_average_abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='range_abs_max',
+                    weight_quant_type='channel_wise_abs_max',
+                    for_ci=True)
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='moving_average_abs_max',
+                    weight_quant_type='channel_wise_abs_max',
+                    for_ci=True)
 
     def test_freeze_graph_cpu_static(self):
         with fluid.unique_name.guard():
             self.freeze_graph(
-                False, seed=2, quant_type='range_abs_max', for_ci=True)
+                False,
+                seed=2,
+                activation_quant_type='range_abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='moving_average_abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='range_abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='moving_average_abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
new file mode 100644
index 0000000000..92afd892af
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
@@ -0,0 +1,82 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestQuantizationStrategy(unittest.TestCase):
+    """
+    Test API of quantization strategy.
+    """
+
+    def test_compression(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet(name='quan').net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            train_optimizer=optimizer)
+        com_pass.config('./quantization/compress.yaml')
+        eval_graph = com_pass.run()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index b9f938bebe..00885eb5d6 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -136,7 +136,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
                                                         "full_data", False)
         else:
             data_urls.append(
-                'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
+                'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
             )
             data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
             self.data_cache_folder = self.download_data(data_urls, data_md5s,
@@ -189,7 +189,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
     def download_model(self):
         # resnet50 fp32 data
         data_urls = [
-            'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
+            'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
         data_md5s = ['4a5194524823d9b76da6e738e1367881']
         self.model_cache_folder = self.download_data(data_urls, data_md5s,
@@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
             self.model, self.infer_iterations)
         (int8_throughput, int8_latency,
          int8_acc1) = self.run_program("calibration_out")
-        delta_value = np.abs(fp32_acc1 - int8_acc1)
+        delta_value = fp32_acc1 - int8_acc1
         self.assertLess(delta_value, 0.01)
         print(
             "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
@@ -307,7 +307,7 @@ class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
     def download_model(self):
         # mobilenetv1 fp32 data
         data_urls = [
-            'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
         data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
         self.model_cache_folder = self.download_data(data_urls, data_md5s,
diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
index 20e6328d81..a127f5b11b 100644
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
@@ -18,6 +18,7 @@ import os
 import time
 import logging
 
+import paddle
 from paddle.fluid import core
 from paddle.fluid import io
 from paddle.fluid import Program
@@ -84,8 +85,9 @@ def convert_dist_to_sparse_program(program):
     when we train model with distributed lookup table but want to do the local inference, we can use
     this function to convert the train program with distributed lookup table to sparse lookup table.
 
-    :param program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
-    :return:
+    Args:
+        program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
+    Returns:
         program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
     """
     if not program._distributed_lookup_table:
@@ -128,68 +130,92 @@ def convert_dist_to_sparse_program(program):
     return program
 
 
-def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
-    def _is_checkpoint_var(exclude_fluid_vars=None):
-        """
-        the checkpoint will not save or load all the variables.
-        var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-        : param var(Variable)
-        """
-
-        if exclude_fluid_vars is None:
-            exclude_fluid_vars = []
-
-        def is_valid(var):
-            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                    var.desc.type() == core.VarDesc.VarType.RAW:
-                return False
-            # @GRAD are named for gradient variables, checkpoint will not save it.
-            if "@GRAD" in var.name:
-                return False
-            # .trainer_ are named for distribute train variables, checkpoint will not save it.
-            if ".trainer_" in var.name:
-                return False
-
-            # .block is named for distribute train variables, checkpoint will not save it.
-            if ".block" in var.name:
-                return False
-
-            if "tmp_" in var.name:
-                return False
-
-            if var.name in exclude_fluid_vars:
-                return False
-
-            return var.persistable
-
-        return is_valid
-
-    io.load_vars(
-        executor,
-        dirname=dirname,
-        main_program=program,
-        predicate=_is_checkpoint_var(lookup_table_vars),
-        filename=None)
-
-
 def load_persistables_for_increment(dirname, executor, program,
                                     lookup_table_var, lookup_table_var_path):
     """
     WARNING: this function will only be used for distributed training with distributed lookup table.
     for increment trainning, the pserver will not only load dense variables,
-    but also load the suitable lookup table var. Because of slice lookup table
-    var with HASH, we must load the correct slice var.
+    but also load the suitable lookup table var. Because of sliced lookup table
+    var with HASH, we must load the correct sliced var.
+
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        program(Program): The parameter server program, which will run on Pserver.
+        lookup_table_var: the distributed lookup tables var name.
+        lookup_table_var_path: the the distributed lookup tables var location.
+
+    Returns:
+        None
+    """
 
+    def _load_persistable_vars(executor, dirname, need_load_vars):
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        need_delete_vars = []
+
+        for param in need_load_vars:
+            origin_var = param.origin
+            slice_var = param.slice
+            is_slice = param.is_slice
+            offset = param.offset
+
+            if is_slice:
+                origin = load_block.create_var(
+                    name="{}.load".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+                slice = load_block.create_var(
+                    name=slice_var.name,
+                    type=slice_var.type,
+                    shape=slice_var.shape,
+                    dtype=slice_var.dtype,
+                    persistable=True)
+
+                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+                start = int(offset / dim1_flatten)
+                end = int(offset / dim1_flatten + slice.shape[0])
+
+                load_block.append_op(
+                    type="slice",
+                    inputs={'Input': origin},
+                    outputs={'Out': slice},
+                    attrs={'axes': [0],
+                           'starts': [start],
+                           'ends': [end]})
+
+                need_delete_vars.append(origin)
+            else:
+                origin = load_block.create_var(
+                    name="{}".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
 
-    :param dirname(str): The directory path
-    :param executor(Executor): The executor to run for loading inference model.
-    :param program(Program): The parameter server program, which will run on Pserver.
-    :param lookup_table_var: the distributed lookup tables var name.
-    :param lookup_table_var_path: the the distributed lookup tables var location.
-    :return: None
-    """
+        load_block.append_op(
+            type='delete_var',
+            inputs={'X': need_delete_vars}, )
+
+        executor.run(load_prog)
 
     def __load_lookup_table_vars(executor, main_program, lookup_table_var,
                                  lookup_table_var_path):
@@ -217,7 +243,9 @@ def load_persistables_for_increment(dirname, executor, program,
                  "Distributed Lookup Table Vars from {}, time = {}".format(
                      dirname, time.ctime()))
 
-    _load_persistable_vars(executor, dirname, program, [lookup_table_var])
+    need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep(
+        program._ps_endpoint)
+    _load_persistable_vars(executor, dirname, need_load_vars)
     __load_lookup_table_vars(executor, program, lookup_table_var,
                              lookup_table_var_path)
 
@@ -233,15 +261,62 @@ def load_persistables_for_inference(dirname, executor, program,
     Inference with distributed lookup table is a little funky, this function will load distributed
     lookup table vars into sparse var, can be used in local inference mode.
 
-    :param dirname(str): The directory path
-    :param executor(Executor): The executor to run for loading inference model.
-    :param program(Program): The parameter server program, which will run on Pserver.
-    :param lookup_table_var_name: the distributed lookup tables var name.
-    :return: None
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        program(Program): The parameter server program, which will run on Pserver.
+        lookup_table_var_name: the distributed lookup tables var name.
+    Returns:
+        None
     """
 
-    def __load_lookup_table_vars(executor, dirname, main_program,
-                                 lookup_table_vars):
+    def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
+        def _is_checkpoint_var(exclude_fluid_vars=None):
+            """
+            the checkpoint will not save or load all the variables.
+            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+            : param var(Variable)
+            """
+
+            if exclude_fluid_vars is None:
+                exclude_fluid_vars = []
+
+            def is_valid(var):
+                if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                        var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                        var.desc.type() == core.VarDesc.VarType.RAW:
+                    return False
+                # @GRAD are named for gradient variables, checkpoint will not save it.
+                if "@GRAD" in var.name:
+                    return False
+                # .trainer_ are named for distribute train variables, checkpoint will not save it.
+                if ".trainer_" in var.name:
+                    return False
+
+                # .block is named for distribute train variables, checkpoint will not save it.
+                if ".block" in var.name:
+                    return False
+
+                if "tmp_" in var.name:
+                    return False
+
+                if var.name in exclude_fluid_vars:
+                    return False
+
+                return var.persistable
+
+            return is_valid
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var(lookup_table_vars),
+            filename=None)
+
+    def _load_lookup_table_vars(executor, dirname, main_program,
+                                lookup_table_vars):
         if not os.path.isdir(dirname):
             raise ValueError("There is no directory named '%s'", dirname)
 
@@ -313,11 +388,96 @@ def load_persistables_for_inference(dirname, executor, program,
                      dirname, time.ctime()))
 
     _load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
-    __load_lookup_table_vars(executor, dirname, program,
-                             [lookup_table_var_name])
+    _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name])
 
     _logger.info("Finish Load Sparse Program With "
                  "Distributed Lookup Table Vars from {}, time = {}".format(
                      dirname, time.ctime()))
 
     return program
+
+
+def get_inference_model(main_program, feeded_var_names, target_vars):
+    """
+    Prune the given `main_program` to build a new program especially for inference with distributed lookup table ,
+    and then add `feeded_vars` and `target_vars` in this program.
+
+    Args:
+        main_program(Program|None): The original program, which will be pruned to
+                                    build the inference model. If is setted None,
+                                    the default main program will be used.
+                                    Default: None.
+        feeded_var_names(list[str]): Names of variables that need to be feeded data
+                                     during inference.
+        target_vars(list[Variable]): Variables from which we can get inference
+                                     results.
+    Returns:
+        program(Program)
+
+    Raises:
+        ValueError: If `feed_var_names` is not a list of basestring.
+        ValueError: If `target_vars` is not a list of Variable.
+
+    """
+
+    def prepend_feed_ops(inference_program,
+                         feed_target_names,
+                         feed_holder_name='feed'):
+        if len(feed_target_names) == 0:
+            return
+
+        global_block = inference_program.global_block()
+
+        feed_var = global_block.create_var(
+            name=feed_holder_name,
+            type=core.VarDesc.VarType.FEED_MINIBATCH,
+            persistable=True)
+
+        for i, name in enumerate(feed_target_names):
+            out = global_block.var(name)
+            global_block._prepend_op(
+                type='feed',
+                inputs={'X': [feed_var]},
+                outputs={'Out': [out]},
+                attrs={'col': i})
+
+    def append_fetch_ops(inference_program,
+                         fetch_target_names,
+                         fetch_holder_name='fetch'):
+        global_block = inference_program.global_block()
+        fetch_var = global_block.create_var(
+            name=fetch_holder_name,
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=True)
+
+        for i, name in enumerate(fetch_target_names):
+            global_block.append_op(
+                type='fetch',
+                inputs={'X': [name]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i})
+
+    origin_program = main_program.clone()
+    main_program = main_program.clone()
+    global_block = main_program.global_block()
+
+    need_to_remove_op_index = []
+    for i, op in enumerate(global_block.ops):
+        op.desc.set_is_target(False)
+        if op.type == "feed" or op.type == "fetch":
+            need_to_remove_op_index.append(i)
+
+    for index in need_to_remove_op_index[::-1]:
+        global_block._remove_op(index)
+
+    main_program.desc.flush()
+
+    main_program = main_program._prune(targets=target_vars)
+    main_program = main_program._inference_optimize(prune_read_op=True)
+
+    fetch_var_names = [v.name for v in target_vars]
+
+    prepend_feed_ops(main_program, feeded_var_names)
+    append_fetch_ops(main_program, fetch_var_names)
+
+    return main_program
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index a24e1d1300..00c4e5691a 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -26,6 +26,24 @@ from .framework import Variable, default_main_program
 __all__ = ['DataFeeder']
 
 
+def convert_dtype(dtype):
+    if dtype == core.VarDesc.VarType.FP32:
+        return 'float32'
+    elif dtype == core.VarDesc.VarType.INT64:
+        return 'int64'
+    elif dtype == core.VarDesc.VarType.FP64:
+        return 'float64'
+    elif dtype == core.VarDesc.VarType.FP16:
+        return 'float16'
+    elif dtype == core.VarDesc.VarType.INT32:
+        return 'int32'
+    elif dtype == core.VarDesc.VarType.UINT8:
+        return 'uint8'
+    else:
+        raise ValueError("dtype must be any of [int32, float32, int64, "
+                         "float64, uint8]")
+
+
 class DataToLoDTensorConverter(object):
     def __init__(self, place, lod_level, shape, dtype):
         self.place = place
@@ -38,27 +56,12 @@ class DataToLoDTensorConverter(object):
             if negtive_count > 1:
                 self.shape = None
                 break
-        if dtype == core.VarDesc.VarType.FP32:
-            self.dtype = 'float32'
-        elif dtype == core.VarDesc.VarType.INT64:
-            self.dtype = 'int64'
-        elif dtype == core.VarDesc.VarType.FP64:
-            self.dtype = 'float64'
-        elif dtype == core.VarDesc.VarType.FP16:
-            self.dtype = 'float16'
-        elif dtype == core.VarDesc.VarType.INT32:
-            self.dtype = 'int32'
-        elif dtype == core.VarDesc.VarType.UINT8:
-            self.dtype = 'uint8'
-        else:
-            raise ValueError("dtype must be any of [int32, float32, int64, "
-                             "float64, uint8]")
+        self.dtype = convert_dtype(dtype)
+        self._reset()
 
+    def _reset(self):
         self.data = []
-        self.lod = []
-
-        for i in six.moves.range(lod_level):
-            self.lod.append([])
+        self.lod = [[] for _ in six.moves.range(self.lod_level)]
 
     def feed(self, data):
         self._feed_impl_(data, self.lod, self.lod_level)
@@ -88,15 +91,52 @@ class DataToLoDTensorConverter(object):
                     raise ValueError(
                         "Reshape error. What is defined in data layer is {}, but receive {}"
                         .format(self.shape, arr.shape))
-            #else:
-            #    self._check_shape(arr.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
             t.set_recursive_sequence_lengths(self.lod)
+        self._reset()
         return t
 
 
+class BatchedTensorProvider(object):
+    def __init__(self, feed_list, place, batch_size, generator, drop_last):
+        self.place = place
+        self.batch_size = batch_size
+        self.generator = generator
+        self.converters = []
+        self.drop_last = drop_last
+
+        for var in feed_list:
+            assert var.lod_level == 0, "lod_level must be 0"
+            self.converters.append(
+                DataToLoDTensorConverter(
+                    place=self.place,
+                    lod_level=0,
+                    shape=var.shape,
+                    dtype=var.dtype))
+
+    def _done(self):
+        return [c.done() for c in self.converters]
+
+    def __call__(self):
+        idx = 0
+        for each_sample in self.generator():
+            for each_slot, each_converter in six.moves.zip(each_sample,
+                                                           self.converters):
+                each_converter.data.append(each_slot)
+
+            idx += 1
+            if idx == self.batch_size:
+                idx = 0
+                yield self._done()
+
+        if not self.drop_last and idx > 0:
+            yield self._done()
+        else:
+            [c._reset() for c in self.converters]
+
+
 class DataFeeder(object):
     """
     DataFeeder converts the data that returned by a reader into a data
@@ -268,8 +308,8 @@ class DataFeeder(object):
         Args:
             reader(function): the reader is the function which can generate data.
             multi_devices(bool): whether to use multiple devices or not.
-            num_places(int): if the multi_devices is True, you can specify the number
-                of GPU to use, if 'num_places' is None, the function will use all the
+            num_places(int): if multi_devices is True, you can specify the number
+                of GPU to use, if multi_devices is None, the function will use all the
                 GPU of the current machine. Default None.
             drop_last(bool): whether to drop the last batch if the
                 size of the last batch is less than batch_size. Default True.
@@ -278,7 +318,7 @@ class DataFeeder(object):
             dict: the result of conversion.
 
         Raises:
-            ValueError: If drop_last is False and the data batch which cannot fit for devices.
+            ValueError: If drop_last is False and the data batch cannot fit for devices.
         """
 
         def __reader_creator__():
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index dfa50e721c..018e38cbb3 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -470,13 +470,21 @@ class Executor(object):
             program(Program|CompiledProgram): the program that need to run,
                 if not provided, then default_main_program (not compiled) will be used.
             feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
-            fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
-            feed_var_name(str): the name for the input variable of feed Operator.
-            fetch_var_name(str): the name for the output variable of fetch Operator.
-            scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
+            fetch_list(list): a list of variable or variable names that user 
+                wants to get, this method will return them according to this list.
+            feed_var_name(str): the name for the input variable of 
+                feed Operator.
+            fetch_var_name(str): the name for the output variable of 
+                fetch Operator.
+            scope(Scope): the scope used to run this program, you can switch 
+                it to different scope. default is global_scope
             return_numpy(bool): if convert the fetched tensor to numpy
-            use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.
-
+            use_program_cache(bool): whether to use the cached program 
+                settings across batches. Setting it be true would be faster 
+                only when (1) the program is not compiled with data parallel, 
+                and (2) program, feed variable names and fetch_list variable 
+                names do not changed compared to the last step. 
+                
         Returns:
 
             list(numpy.array): fetch result according to fetch_list.
@@ -556,6 +564,10 @@ class Executor(object):
 
         if feed is None:
             feed = {}
+        elif isinstance(feed, (list, tuple)):
+            assert len(feed) == 1, "Not compiled with data parallel"
+            feed = feed[0]
+
         if not isinstance(feed, dict):
             raise TypeError(
                 "feed requires dict as its Parameter. But you passed in %s" %
@@ -590,7 +602,7 @@ class Executor(object):
                 fetch_var_name=fetch_var_name)
 
         self._feed_data(program, feed, feed_var_name, scope)
-        exe.run(program.desc, scope, 0, True, True)
+        exe.run(program.desc, scope, 0, True, True, fetch_var_name)
         outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
             outs = as_numpy(outs)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 7dc9178807..b25d9441e0 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -26,6 +26,7 @@ import six
 
 import numpy as np
 import subprocess
+import multiprocessing
 
 from .. import compat as cpt
 from .proto import framework_pb2
@@ -63,6 +64,9 @@ __all__ = [
     'default_main_program',
     'program_guard',
     'name_scope',
+    'cuda_places',
+    'cpu_places',
+    'cuda_pinned_places',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -87,6 +91,87 @@ def _current_expected_place():
     return _imperative_current_expected_place_
 
 
+def _cpu_num():
+    return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+
+def cuda_places(device_ids=None):
+    '''
+    Create a list of :code:`fluid.CUDAPlace` objects.
+
+    If :code:`device_ids` is None, environment variable of
+    :code:`FLAGS_selected_gpus` would be checked first. If
+    :code:`FLAGS_selected_gpus=0,1,2`, the returned list would
+    be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    If :code:`FLAGS_selected_gpus` is not set, all visible
+    gpu places would be returned.  
+
+    If :code:`device_ids` is not None, it should be the device
+    ids of gpus. For example, if :code:`device_ids=[0,1,2]`, 
+    the returned list would be 
+    [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    
+    Args: 
+        device_ids (None|list(int)|tuple(int)): gpu device id list.
+
+    Returns:
+        out (list(fluid.CUDAPlace)): gpu place list.
+    '''
+    assert core.is_compiled_with_cuda(), \
+        "Not compiled with CUDA"
+    if device_ids is None:
+        gpus_env = os.getenv("FLAGS_selected_gpus")
+        if gpus_env:
+            device_ids = [int(s) for s in gpus_env.split(",")]
+        else:
+            device_ids = six.moves.range(core.get_cuda_device_count())
+    elif not isinstance(device_ids, (list, tuple)):
+        device_ids = [device_ids]
+    return [core.CUDAPlace(dev_id) for dev_id in device_ids]
+
+
+def cpu_places(device_count=None):
+    '''
+    Create a list of :code:`fluid.CPUPlace` objects.
+    
+    If :code:`device_count` is None, the device count would
+    be determined by environment variable :code:`CPU_NUM`. 
+    If :code:`CPU_NUM` is not set, the device count would
+    be determined by :code:`multiprocessing.cpu_count()`. 
+
+    Args:
+        device_count (None|int): device number.
+
+    Returns:
+        out (list(fluid.CPUPlace)): cpu place list.
+    '''
+    if device_count is None:
+        device_count = _cpu_num()
+    return [core.CPUPlace()] * device_count
+
+
+def cuda_pinned_places(device_count=None):
+    '''
+    Create a list of :code:`fluid.CUDAPinnedPlace` objects.
+
+    If :code:`device_count` is None, the device count would
+    be determined by environment variable :code:`CPU_NUM`. 
+    If :code:`CPU_NUM` is not set, the device count would
+    be determined by :code:`multiprocessing.cpu_count()`. 
+
+    Args:
+        device_count (None|int): device number.
+
+    Returns:
+        out (list(fluid.CUDAPinnedPlace)): cuda pinned place list.
+    '''
+    assert core.is_compiled_with_cuda(), \
+        "Not compiled with CUDA"
+    if device_count is None:
+        device_count = _cpu_num()
+    return [core.cuda_pinned_places()] * device_count
+
+
 class NameScope(object):
     def __init__(self, name="", parent=None):
         self._children = dict()
@@ -304,96 +389,101 @@ class Variable(object):
                  is_data=False,
                  **kwargs):
         self.block = block
-        self.error_clip = error_clip
-
         if name is None:
             name = unique_name.generate('_generated_var')
-        is_new_var = False
-        name = cpt.to_text(name)
-        self.desc = self.block.desc.find_var(cpt.to_bytes(name))
-
-        if self.desc is None:
-            self.desc = self.block.desc.var(cpt.to_bytes(name))
-            is_new_var = True
 
-        if is_new_var:
-            self.desc.set_type(type)
-        elif self.desc.type() != type:
-            raise ValueError("Variable {0} has been created before. The "
-                             "previous type is {1}; the new type is {2}. They"
-                             " are not matched".format(self.name,
-                                                       self.desc.type(), type))
-
-        if shape is not None:
-            if is_new_var:
-                self.desc.set_shape(shape)
-            else:
-                old_shape = self.shape
-                shape = tuple(shape)
-                if shape != old_shape:
-                    raise ValueError(
-                        "Variable {0} has been created before. the previous "
-                        "shape is {1}; the new shape is {2}. They are not "
-                        "matched.".format(self.name, old_shape, shape))
         if dtype is not None:
             if not isinstance(dtype, core.VarDesc.VarType):
                 dtype = convert_np_dtype_to_dtype_(dtype)
-            if is_new_var:
-                self.desc.set_dtype(dtype)
-            else:
-                old_dtype = self.dtype
-                if dtype != old_dtype:
-                    raise ValueError("Variable {0} has been created before. "
-                                     "The previous data type is {1}; the new "
-                                     "data type is {2}. They are not "
-                                     "matched.".format(self.name, old_dtype,
-                                                       dtype))
-
-        if lod_level is not None:
-            if is_new_var:
-                self.desc.set_lod_level(lod_level)
-            else:
-                if lod_level != self.lod_level:
-                    raise ValueError("Variable {0} has been created before. "
-                                     "The previous lod_level is {1}; the new "
-                                     "lod_level is {2}. They are not "
-                                     "matched".format(self.name, self.lod_level,
-                                                      lod_level))
-        if persistable is not None:
-            if is_new_var:
-                self.desc.set_persistable(persistable)
-            else:
-                if persistable != self.persistable:
-                    raise ValueError(
-                        "Variable {0} has been created before."
-                        "The previous persistable is {1}; the new "
-                        "persistable is {2}. They are not matched".format(
-                            self.name, self.persistable, persistable))
-
-        if capacity is not None:
-            if is_new_var:
-                self.desc.set_capacity(capacity)
-            else:
-                # TODO(abhinavarora) : Compare with set capacity once,
-                # get_capacity is implemented
-                pass
 
         if _in_imperative_mode():
             # record vars in tracer rather than blocks
             self._ivar = kwargs.get("ivar", None)
             if not self._ivar:
-                self._ivar = core.VarBase(stop_gradient)
-            self._ivar.desc = self.desc
-            self._ivar.block = block.desc
-            self._ivar.name = name
-            self._ivar.persistable = persistable
+                self._ivar = core.VarBase(
+                    name, dtype if dtype else core.VarDesc.VarType.FP32,
+                    list(shape) if shape else [],
+                    _current_expected_place(), stop_gradient, True
+                    if persistable else False)
             if persistable:
-                self.block.vars[name] = self
+                _imperative_tracer().trace_var(name, self)
         else:
+            self.error_clip = error_clip
+
+            is_new_var = False
+            name = cpt.to_text(name)
+            self.desc = self.block.desc.find_var(cpt.to_bytes(name))
+
+            if self.desc is None:
+                self.desc = self.block.desc.var(cpt.to_bytes(name))
+                is_new_var = True
+
+            if is_new_var:
+                self.desc.set_type(type)
+            elif self.desc.type() != type:
+                raise ValueError(
+                    "Variable {0} has been created before. The "
+                    "previous type is {1}; the new type is {2}. They"
+                    " are not matched".format(self.name, self.desc.type(),
+                                              type))
+
+            if shape is not None:
+                if is_new_var:
+                    self.desc.set_shape(shape)
+                else:
+                    old_shape = self.shape
+                    shape = tuple(shape)
+                    if shape != old_shape:
+                        raise ValueError(
+                            "Variable {0} has been created before. the previous "
+                            "shape is {1}; the new shape is {2}. They are not "
+                            "matched.".format(self.name, old_shape, shape))
+            if dtype is not None:
+                if is_new_var:
+                    self.desc.set_dtype(dtype)
+                else:
+                    old_dtype = self.dtype
+                    if dtype != old_dtype:
+                        raise ValueError(
+                            "Variable {0} has been created before. "
+                            "The previous data type is {1}; the new "
+                            "data type is {2}. They are not "
+                            "matched.".format(self.name, old_dtype, dtype))
+
+            if lod_level is not None:
+                if is_new_var:
+                    self.desc.set_lod_level(lod_level)
+                else:
+                    if lod_level != self.lod_level:
+                        raise ValueError(
+                            "Variable {0} has been created before. "
+                            "The previous lod_level is {1}; the new "
+                            "lod_level is {2}. They are not "
+                            "matched".format(self.name, self.lod_level,
+                                             lod_level))
+            if persistable is not None:
+                if is_new_var:
+                    self.desc.set_persistable(persistable)
+                else:
+                    if persistable != self.persistable:
+                        raise ValueError(
+                            "Variable {0} has been created before."
+                            "The previous persistable is {1}; the new "
+                            "persistable is {2}. They are not matched".format(
+                                self.name, self.persistable, persistable))
+
+            if capacity is not None:
+                if is_new_var:
+                    self.desc.set_capacity(capacity)
+                else:
+                    # TODO(abhinavarora) : Compare with set capacity once,
+                    # get_capacity is implemented
+                    pass
+
             self.block.vars[name] = self
-        self.op = None
-        self.stop_gradient = stop_gradient
-        self.is_data = is_data
+            self.op = None
+            self.stop_gradient = stop_gradient
+            self.is_data = is_data
 
     def _numpy(self):
         new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
@@ -425,6 +515,11 @@ class Variable(object):
         Returns:
             str: The debug string.
         """
+        if _in_imperative_mode():
+            # TODO(panyx0718): add more imperative debug info.
+            return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
+                                                     self.shape)
+
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
         protostr = self.desc.serialize_to_string()
@@ -462,40 +557,63 @@ class Variable(object):
     def _stop_gradient(self, s):
         if _in_imperative_mode():
             self._ivar.stop_gradient = s
-        self.stop_gradient = s
+        else:
+            self.stop_gradient = s
 
     @property
     def persistable(self):
-        return self.desc.persistable()
+        if _in_imperative_mode():
+            return self._ivar.persistable
+        else:
+            return self.desc.persistable()
 
     @persistable.setter
     def persistable(self, p):
-        self.desc.set_persistable(p)
+        if _in_imperative_mode():
+            return self._ivar.persistable
+        else:
+            self.desc.set_persistable(p)
 
     @property
     def name(self):
-        return cpt.to_text(self.desc.name())
+        if _in_imperative_mode():
+            return self._ivar.name
+        else:
+            return cpt.to_text(self.desc.name())
 
     @name.setter
     def name(self, new_name):
-        self.desc.set_name(new_name)
+        if _in_imperative_mode():
+            self._ivar.name = new_name
+        else:
+            self.desc.set_name(new_name)
 
     @property
     def shape(self):
         # convert to tuple, make it as same as numpy API.
-        return tuple(self.desc.shape())
+        if _in_imperative_mode():
+            return self._ivar.shape
+        else:
+            return tuple(self.desc.shape())
 
     @property
     def dtype(self):
-        return self.desc.dtype()
+        if _in_imperative_mode():
+            return self._ivar.dtype
+        else:
+            return self.desc.dtype()
 
     @property
     def lod_level(self):
+        # TODO(minqiyang): Support lod_level in imperative mode
         return self.desc.lod_level()
 
     @property
     def type(self):
-        return self.desc.type()
+        if _in_imperative_mode():
+            return self._ivar.dtype
+        else:
+            return self.desc.type()
 
     def _set_error_clip(self, error_clip):
         """
@@ -611,10 +729,9 @@ class Operator(object):
                                 outputs={"Out": [var1]})
     """
     OP_WITHOUT_KERNEL_SET = {
-        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
-        'checkpoint_notify', 'gen_nccl_id'
+        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
+        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
+        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
     }
 
     def __init__(self,
@@ -624,121 +741,14 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
-        self.block = block
-        self.desc = desc
-        # note: not add self.attrs here:
-        # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
-        op_attrs = attrs
-        if op_attrs is None:
-            op_attrs = dict()
-        del attrs
-
-        op_maker = core.op_proto_and_checker_maker
-
-        if op_maker.kOpRoleAttrName() not in op_attrs:
-            op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
-
-        role_var_name = op_maker.kOpRoleVarAttrName()
-        if len(self.block.program.
-               op_role_var) != 0 and role_var_name not in op_attrs:
-            op_attrs[role_var_name] = self.block.program.op_role_var
-
-        if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
-            del op_attrs[role_var_name]
-
-        if len(self.desc.type()) != 0:
-            return
-        if type is None:
-            raise ValueError(
-                "`type` to initilized an Operator can not be None.")
-        else:
-            callstack_var_name = op_maker.kOpCreationCallstackAttrName()
-            op_attrs[callstack_var_name] = list(
-                reversed(traceback.format_stack()))[1:]
-
-        self.desc.set_type(type)
-        proto = OpProtoHolder.instance().get_op_proto(type)
-
-        namescope_var_name = op_maker.kOpNameScopeAttrName()
-        op_attrs[namescope_var_name] = _full_name_scope()
-
-        def find_name(var_list, name):
-            for var_name in var_list:
-                if var_list[var_name] is not None and var_name == name:
-                    return True
-            return False
-
-        if inputs is not None:
-            for in_proto in proto.inputs:
-                found = find_name(inputs, in_proto.name)
-                assert found or in_proto.dispensable, "Input {} not found".format(
-                    in_proto.name)
-
-                if found:
-                    in_args = inputs[in_proto.name]
-                    if not isinstance(in_args, list):
-                        in_args = [in_args]
-                    if not in_proto.duplicable and len(in_args) > 1:
-                        raise ValueError(
-                            "Input %s expects only one input, but %d are given."
-                            % (in_proto.name, len(in_args)))
-                    in_arg_names = []
-                    for arg in in_args:
-                        if isinstance(arg, six.string_types):
-                            in_arg_names.append(arg)
-                        elif isinstance(arg, six.binary_type):
-                            in_arg_names.append(arg.decode())
-                        else:
-                            in_arg_names.append(cpt.to_text(arg.name))
-                    self.desc.set_input(in_proto.name, in_arg_names)
-                else:
-                    self.desc.set_input(in_proto.name, [])
-
-        if outputs is not None:
-            for m in proto.outputs:
-                if (m.name not in outputs) and m.dispensable:
-                    continue
-                if not ((m.name in outputs) or m.dispensable):
-                    raise ValueError(
-                        ("Incorrect setting for output(s) of "
-                         "operator \"%s\", should set: [%s].") % (type, m.name))
-            for out_proto in proto.outputs:
-                if out_proto.name not in outputs:
-                    continue
-                out_args = outputs[out_proto.name]
-                if not isinstance(out_args, list):
-                    out_args = [out_args]
-                if not out_proto.duplicable and len(out_args) > 1:
-                    raise ValueError(
-                        "Output %s expects only one output, but %d are given." %
-                        (out_proto.name, len(out_args)))
-                out_arg_names = []
-                for arg in out_args:
-                    out_arg_names.append(cpt.to_text(arg.name))
-                    # TODO(minqiyang): could we remove variable's op in static mode?
-                    if not _in_imperative_mode():
-                        arg.op = self
-                self.desc.set_output(out_proto.name, out_arg_names)
-
-        if op_attrs is not None:
-            if not isinstance(op_attrs, dict):
-                raise TypeError("'attrs' should be a dict.")
-            for attr in proto.attrs:
-                attr_name = attr.name
-                if (attr_name not in op_attrs) or (op_attrs[attr_name] is None):
-                    continue
-                attr_val = op_attrs[attr_name]
-                self._update_desc_attr(attr_name, attr_val)
-
-        self.desc.check_attrs()
-        if self._has_kernel(type):
-            self.desc.infer_var_type(self.block.desc)
-            self.desc.infer_shape(self.block.desc)
-
         if _in_imperative_mode():
-            self.iop = core.OpBase()
-            self.iop.desc = self.desc
+            if type is None:
+                raise ValueError(
+                    "`type` to initilized an Operator can not be None.")
+            self.iop = core.OpBase(type)
 
+            # TODO(minqiyang): remove these lines after we take apart all
+            # backward grads and forward variables
             self.inputs = defaultdict(list)
             if inputs is not None:
                 for k, v in six.iteritems(inputs):
@@ -755,6 +765,121 @@ class Operator(object):
                     elif isinstance(v, list) or isinstance(v, tuple):
                         self.outputs[k].extend([var._ivar for var in v])
 
+            self.attrs = attrs if attrs else {}
+        else:
+            self.block = block
+            self.desc = desc
+            # note: not add self.attrs here:
+            # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
+            op_attrs = attrs
+            if op_attrs is None:
+                op_attrs = dict()
+            del attrs
+
+            op_maker = core.op_proto_and_checker_maker
+
+            if op_maker.kOpRoleAttrName() not in op_attrs:
+                op_attrs[op_maker.kOpRoleAttrName(
+                )] = self.block.program.op_role
+
+            role_var_name = op_maker.kOpRoleVarAttrName()
+            if len(self.block.program.
+                   op_role_var) != 0 and role_var_name not in op_attrs:
+                op_attrs[role_var_name] = self.block.program.op_role_var
+
+            if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
+                del op_attrs[role_var_name]
+
+            if len(self.desc.type()) != 0:
+                return
+            if type is None:
+                raise ValueError(
+                    "`type` to initilized an Operator can not be None.")
+            else:
+                callstack_var_name = op_maker.kOpCreationCallstackAttrName()
+                op_attrs[callstack_var_name] = list(
+                    reversed(traceback.format_stack()))[1:]
+
+            self.desc.set_type(type)
+            proto = OpProtoHolder.instance().get_op_proto(type)
+
+            namescope_var_name = op_maker.kOpNameScopeAttrName()
+            op_attrs[namescope_var_name] = _full_name_scope()
+
+            def find_name(var_list, name):
+                for var_name in var_list:
+                    if var_list[var_name] is not None and var_name == name:
+                        return True
+                return False
+
+            if inputs is not None:
+                for in_proto in proto.inputs:
+                    found = find_name(inputs, in_proto.name)
+                    assert found or in_proto.dispensable, "Input {} not found".format(
+                        in_proto.name)
+
+                    if found:
+                        in_args = inputs[in_proto.name]
+                        if not isinstance(in_args, list):
+                            in_args = [in_args]
+                        if not in_proto.duplicable and len(in_args) > 1:
+                            raise ValueError(
+                                "Input %s expects only one input, but %d are given."
+                                % (in_proto.name, len(in_args)))
+                        in_arg_names = []
+                        for arg in in_args:
+                            if isinstance(arg, six.string_types):
+                                in_arg_names.append(arg)
+                            elif isinstance(arg, six.binary_type):
+                                in_arg_names.append(arg.decode())
+                            else:
+                                in_arg_names.append(cpt.to_text(arg.name))
+                        self.desc.set_input(in_proto.name, in_arg_names)
+                    else:
+                        self.desc.set_input(in_proto.name, [])
+
+            if outputs is not None:
+                for m in proto.outputs:
+                    if (m.name not in outputs) and m.dispensable:
+                        continue
+                    if not ((m.name in outputs) or m.dispensable):
+                        raise ValueError(("Incorrect setting for output(s) of "
+                                          "operator \"%s\", should set: [%s].")
+                                         % (type, m.name))
+                for out_proto in proto.outputs:
+                    if out_proto.name not in outputs:
+                        continue
+                    out_args = outputs[out_proto.name]
+                    if not isinstance(out_args, list):
+                        out_args = [out_args]
+                    if not out_proto.duplicable and len(out_args) > 1:
+                        raise ValueError(
+                            "Output %s expects only one output, but %d are given."
+                            % (out_proto.name, len(out_args)))
+                    out_arg_names = []
+                    for arg in out_args:
+                        out_arg_names.append(cpt.to_text(arg.name))
+                        # TODO(minqiyang): could we remove variable's op in static mode?
+                        if not _in_imperative_mode():
+                            arg.op = self
+                    self.desc.set_output(out_proto.name, out_arg_names)
+
+            if op_attrs is not None:
+                if not isinstance(op_attrs, dict):
+                    raise TypeError("'attrs' should be a dict.")
+                for attr in proto.attrs:
+                    attr_name = attr.name
+                    if (attr_name not in op_attrs) or (
+                            op_attrs[attr_name] is None):
+                        continue
+                    attr_val = op_attrs[attr_name]
+                    self._update_desc_attr(attr_name, attr_val)
+
+            self.desc.check_attrs()
+            if self._has_kernel(type):
+                self.desc.infer_var_type(self.block.desc)
+                self.desc.infer_shape(self.block.desc)
+
     def _has_kernel(self, op_type):
         return op_type not in self.OP_WITHOUT_KERNEL_SET
 
@@ -1318,16 +1443,15 @@ class Block(object):
         Returns:
             Operator: the append Operator.
         """
-        op_desc = self.desc.append_op()
-        op = Operator(
-            block=self,
-            desc=op_desc,
-            type=kwargs.get("type", None),
-            inputs=kwargs.get("inputs", None),
-            outputs=kwargs.get("outputs", None),
-            attrs=kwargs.get("attrs", None))
-
         if _in_imperative_mode():
+            op = Operator(
+                block=self,
+                desc=None,
+                type=kwargs.get("type", None),
+                inputs=kwargs.get("inputs", None),
+                outputs=kwargs.get("outputs", None),
+                attrs=kwargs.get("attrs", None))
+
             # record ops in tracer rather than blocks
             #
             # TODO(minqiyang): add op stop_gradient support in static mode too.
@@ -1335,6 +1459,15 @@ class Block(object):
             _imperative_tracer().trace_op(op,
                                           kwargs.get("stop_gradient", False))
         else:
+            op_desc = self.desc.append_op()
+            op = Operator(
+                block=self,
+                desc=op_desc,
+                type=kwargs.get("type", None),
+                inputs=kwargs.get("inputs", None),
+                outputs=kwargs.get("outputs", None),
+                attrs=kwargs.get("attrs", None))
+
             self.ops.append(op)
 
         return op
@@ -1383,19 +1516,27 @@ class Block(object):
         return self.ops[start:end]
 
     def _prepend_op(self, *args, **kwargs):
-        op_desc = self.desc._prepend_op()
-        op = Operator(
-            self,
-            op_desc,
-            type=kwargs.get("type", None),
-            inputs=kwargs.get("inputs", None),
-            outputs=kwargs.get("outputs", None),
-            attrs=kwargs.get("attrs", None))
         if _in_imperative_mode():
+            op = Operator(
+                self,
+                None,
+                type=kwargs.get("type", None),
+                inputs=kwargs.get("inputs", None),
+                outputs=kwargs.get("outputs", None),
+                attrs=kwargs.get("attrs", None))
             _imperative_tracer().trace_op(op,
                                           kwargs.get("stop_gradient", False))
         else:
+            op_desc = self.desc._prepend_op()
+            op = Operator(
+                self,
+                op_desc,
+                type=kwargs.get("type", None),
+                inputs=kwargs.get("inputs", None),
+                outputs=kwargs.get("outputs", None),
+                attrs=kwargs.get("attrs", None))
             self.ops.insert(0, op)
+
         return op
 
     def _sync_with_cpp(self):
@@ -1503,12 +1644,15 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
-    def _clone_variable(self, var):
+    def _clone_variable(self, var, force_persistable=True):
         """
         Clone a variable into current block.
 
         Args:
             var: the variable to be cloned.
+            force_persistable(bool): True means setting the result variable to being persistable.
+                                     False means setting the persistable the same with that of input var.
+                                     default: True.
 
         Returns:
             Variable: the new  variable cloned from 'var' in current block.
@@ -1528,7 +1672,7 @@ class Block(object):
                 shape=var.shape,
                 dtype=var.dtype,
                 type=var.type,
-                persistable=True,
+                persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data)
         else:
             ret_var = self.create_var(
@@ -1537,7 +1681,7 @@ class Block(object):
                 dtype=var.dtype,
                 type=var.type,
                 lod_level=var.lod_level,
-                persistable=True,
+                persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data)
         return ret_var
 
@@ -1950,6 +2094,19 @@ class IrGraph(object):
         self.graph = graph
         self._for_test = for_test
 
+    def clone(self):
+        """
+        Create a new and duplicated IrGraph.
+
+        Warns:
+            The method only clones the graph structure, not its attributes.
+
+        Returns:
+            IrGraph: A new and duplicated graph.
+        """
+        g = self.graph.clone()
+        return IrGraph(g, self._for_test)
+
     def is_test(self):
         """
         If the graph is used for testing, the function returns true. Otherwise, returns false.
@@ -2180,10 +2337,10 @@ class IrGraph(object):
         Notes: the `graph` cannot contain a circle.
 
         Returns:
-            set(IrNode): nodes in topology order.
+            list(IrNode): nodes in topology order.
         """
         ordered_nodes = core.topology_sort(self.graph)
-        return {IrNode(n) for n in ordered_nodes}
+        return [IrNode(n) for n in ordered_nodes]
 
     def build_adjacency_list(self):
         """
@@ -2251,7 +2408,7 @@ class IrGraph(object):
         """
         Convert the graph into a Program.
 
-        Notes: When the graph includes backward operator nodes, the
+        WARN: When the graph includes backward operator nodes, the
         conversion process may be failed. Usually, this function is
         only used to convert a test graph.
 
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
index 034a11e0a6..7281b3ea4b 100644
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -26,8 +26,16 @@ from .nn import *
 from . import tracer
 from .tracer import *
 
+from . import profiler
+from .profiler import *
+
+from . import checkpoint
+from .checkpoint import *
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
 __all__ += nn.__all__
 __all__ += tracer.__all__
+__all__ += profiler.__all__
+__all__ += checkpoint.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 174f138bfa..d619c09b1b 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -44,7 +44,7 @@ def guard(place=None):
                     yield
 
 
-def to_variable(value, block=None):
+def to_variable(value, block=None, name=None):
     if isinstance(value, np.ndarray):
         assert enabled(), "to_variable could only be called in imperative mode"
 
@@ -53,7 +53,7 @@ def to_variable(value, block=None):
         py_var = framework.Variable(
             block,
             type=core.VarDesc.VarType.LOD_TENSOR,
-            name=None,
+            name=name,
             shape=value.shape,
             dtype=value.dtype)
         var = py_var._ivar.value()
diff --git a/python/paddle/fluid/imperative/checkpoint.py b/python/paddle/fluid/imperative/checkpoint.py
new file mode 100644
index 0000000000..37c43f29d2
--- /dev/null
+++ b/python/paddle/fluid/imperative/checkpoint.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import collections
+from .. import core
+from ..framework import Variable, default_main_program
+
+__all__ = ['save_persistables', 'load_persistables']
+
+
+def save_persistables(vardict, dirname, filename=None):
+    """
+    This function filters out all variables in layer.parameters from the
+    give `layer` and then trys to load these variables from the folder
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
+    the file name.
+
+    Args:
+        vardict(dict of Parameters): The parameters will
+                                    be saved. If it is None, nothing
+                                    will be deal.
+        dirname(str): The directory path.
+        filename(str|None): The file which saved all variables. If variables were
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+
+    Examples:
+        .. code-block:: python
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            x_data = np.arange(12).reshape(4, 3).astype('int64')
+            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+            x_data = x_data.reshape((-1, num_steps, 1))
+            y_data = y_data.reshape((-1, 1))
+            init_hidden_data = np.zeros(
+                (num_layers, batch_size, hidden_size), dtype='float32')
+            init_cell_data = np.zeros(
+                (num_layers, batch_size, hidden_size), dtype='float32')
+            x = to_variable(x_data)
+            y = to_variable(y_data)
+            init_hidden = to_variable(init_hidden_data)
+            init_cell = to_variable(init_cell_data)
+            dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                        init_cell)
+            param_path = "./my_paddle_model"
+            fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path,
+                                       layer=ptb_model)
+    """
+    if isinstance(vardict, collections.OrderedDict):
+        _save_var_to_file(vardict, dirname, filename)
+
+
+def load_persistables(vardict, dirname, filename=None):
+    """
+    This function trys to load persistable variables from the folder
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
+    the file name.
+
+    Args:
+        vardict(dict of Parameters): The parameters will be loaded.
+        dirname(str): The directory path.
+        filename(str|None): The file which saved all variables, this file path should be end with '.npz'. If variables were
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        dict: The parameter-dict resumed from file
+
+    Examples:
+        .. code-block:: python
+            my_layer = layer(fluid.imperative.Layer)
+            param_path = "./my_paddle_model"
+
+            param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path)
+            param_1 = param_dict['PtbModel_0.w_1']
+
+            or:
+            my_layer = layer(fluid.imperative.Layer)
+            param_path = "./my_paddle_model"
+            filename = "model.file"
+            param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path,
+                                                                       filename=filename)
+            param_1 = param_dict['PtbModel_0.w_1']
+
+        """
+    if isinstance(vardict, collections.OrderedDict):
+        return _load_var_from_file(vardict, dirname, filename)
+
+    return {}
+
+
+def _save_var_to_file(stat_dict, file_dir, file_name):
+    save_block = default_main_program().global_block()
+    save_var_map = {}
+    for each_var in stat_dict.items():
+        save_var_map[each_var.name] = each_var
+        if file_name is None:
+            save_block.append_op(
+                type='save',
+                inputs={'X': [each_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+
+    if file_name is not None:
+        save_var_list = []
+        for name in sorted(save_var_map.keys()):
+            save_var_list.append(save_var_map[name])
+
+        save_block.append_op(
+            type='save_combine',
+            inputs={'X': save_var_list},
+            outputs={},
+            attrs={'file_path': os.path.join(file_dir, file_name)})
+
+
+def _load_var_from_file(stat_dict, file_dir, file_name):
+    load_block = default_main_program().global_block()
+    load_var_map = {}
+
+    for each_var in stat_dict.items():
+        assert isinstance(each_var, Variable)
+        if each_var.type == core.VarDesc.VarType.RAW:
+            continue
+        new_var = _clone_var_in_block_(load_block, each_var)
+        if file_name is None:
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [new_var]},
+                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+
+        load_var_map[new_var.name] = new_var
+
+    if file_name is not None:
+        load_var_list = []
+        for name in sorted(load_var_map.keys()):
+            load_var_list.append(load_var_map[name])
+
+        load_block.append_op(
+            type='load_combine',
+            inputs={},
+            outputs={"Out": load_var_list},
+            attrs={'file_path': os.path.join(file_dir, file_name)})
+        for res_var in load_var_list:
+            load_var_map[res_var.name] = res_var
+
+    return load_var_map
+
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.dtype,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/imperative/layer_object_helper.py
index 6afffe3636..0dac99a491 100644
--- a/python/paddle/fluid/imperative/layer_object_helper.py
+++ b/python/paddle/fluid/imperative/layer_object_helper.py
@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase):
 
         Returns dtype of the input
         """
+        inputs_in = inputs_in if (inputs_in is not None) else []
         inputs = self._multiple_input(inputs_in)
         dtype = None
         for each in inputs:
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 0c96d4dc59..e64667f7f4 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -17,10 +17,12 @@ import contextlib
 import sys
 import numpy as np
 import collections
+import six
 from .. import unique_name
 from paddle.fluid import core
 from .layer_object_helper import LayerObjectHelper
 from paddle.fluid import framework
+from ..param_attr import ParamAttr
 
 __all__ = ['Layer', 'PyLayer']
 
@@ -72,6 +74,10 @@ class Layer(core.Layer):
 
         Returns created parameter Variable.
         """
+        if isinstance(attr, ParamAttr) and (attr.name is not None):
+            attr.name = ".".join([self._full_name, attr.name])
+        elif isinstance(attr, six.string_types):
+            attr = ".".join([self._full_name, attr])
         return self._helper.create_parameter(attr, shape, dtype, is_bias,
                                              default_initializer)
 
@@ -164,6 +170,7 @@ class Layer(core.Layer):
             the sublayer passed in.
         """
         assert isinstance(sublayer, core.Layer)
+
         self._sub_layers[name] = sublayer
         return sublayer
 
@@ -212,6 +219,34 @@ class Layer(core.Layer):
         else:
             object.__delattr__(self, name)
 
+    def state_dict(self, destination=None, prefix='', include_sublayers=True):
+        if destination is None:
+            destination = collections.OrderedDict()
+        for name, data in self._parameters.items():
+            if data is not None:
+                destination[prefix + name] = data
+
+        if include_sublayers:
+            for layer_name, layer_item in self._sub_layers.items():
+                if layer_item is not None:
+                    destination_temp = destination.copy()
+                    destination_temp.update(
+                        layer_item.state_dict(destination_temp, prefix +
+                                              layer_name + ".",
+                                              include_sublayers))
+                    destination = destination_temp
+        return destination
+
+    def load_dict(self, stat_dict, include_sublayers=True):
+        for name, item in self.__dict__.get('_parameters', None).items():
+            if item.name in stat_dict:
+                self.__setattr__(name, stat_dict[item.name])
+
+        if include_sublayers:
+            for layer_name, layer_item in self._sub_layers.items():
+                if layer_item is not None:
+                    layer_item.load_dict(stat_dict)
+
 
 class PyLayer(core.PyLayer):
     """Layers composed of user-defined python codes."""
@@ -258,7 +293,7 @@ class PyLayer(core.PyLayer):
             cls.backward_id = core.PyLayer.num_funcs() + 1
             PyLayer.register_func(cls.backward_id, cls._do_backward)
 
-        iop = core.OpBase()
+        iop = core.OpBase(cls.__class__.__name__ + str(cls.forward_id))
         iop.forward_id = cls.forward_id
         iop.backward_id = cls.backward_id
         block.ops.append(iop)
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 5aff3ea2d1..9cbfc3e3eb 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -20,9 +20,12 @@ from .. import core
 from ..layers import utils
 from . import layers
 from ..framework import Variable, OpProtoHolder
+from ..layers import layer_function_generator
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant
-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
+__all__ = [
+    'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm'
+]
 
 
 class Conv2D(layers.Layer):
@@ -112,7 +115,7 @@ class Conv2D(layers.Layer):
                 'strides': self._stride,
                 'paddings': self._padding,
                 'dilations': self._dilation,
-                'groups': self._groups,
+                'groups': self._groups if self._groups else 1,
                 'use_cudnn': self._use_cudnn,
                 'use_mkldnn': False,
             })
@@ -437,7 +440,6 @@ class Embedding(layers.Layer):
         self._size = size
         self._is_sparse = is_sparse
         self._is_distributed = is_distributed
-
         self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
             size[0] + padding_idx)
 
@@ -468,3 +470,262 @@ class Embedding(layers.Layer):
             })
 
         return out
+
+
+class LayerNorm(layers.Layer):
+    def __init__(self,
+                 name_scope,
+                 scale=True,
+                 shift=True,
+                 begin_norm_axis=1,
+                 epsilon=1e-05,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None):
+        """
+        ${comment}
+
+        The formula is as follows:
+
+        ..  math::
+
+            \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+
+            \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+
+            h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+
+        * :math:`a`: the vector representation of the summed inputs to the neurons
+        in that layer.
+
+        * :math:`H`: the number of hidden units in a layers
+
+        * :math:`g`: the trainable scale parameter.
+
+        * :math:`b`: the trainable bias parameter.
+
+        Args:
+            input(Variable): The input tensor variable.
+            scale(bool): Whether to learn the adaptive gain :math:`g` after
+                normalization. Default True.
+            shift(bool): Whether to learn the adaptive bias :math:`b` after
+                normalization. Default True.
+            begin_norm_axis(int): The normalization will be performed along
+                dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+                Default 1.
+            epsilon(float): The small value added to the variance to prevent
+                division by zero. Default 1e-05.
+            param_attr(ParamAttr|None): The parameter attribute for the learnable
+                gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
+                omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
+                a default :code:`ParamAttr` would be added as scale. The
+                :attr:`param_attr` is initialized as 1 if it is added. Default None.
+            bias_attr(ParamAttr|None): The parameter attribute for the learnable
+                bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
+                omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
+                a default :code:`ParamAttr` would be added as bias. The
+                :attr:`bias_attr` is initialized as 0 if it is added. Default None.
+            act(str): Activation to be applied to the output of layer normalizaiton.
+                      Default None.
+        Returns:
+            ${y_comment}
+
+        Examples:
+
+            >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+            >>>                          dtype='float32')
+            >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+        """
+
+        super(LayerNorm, self).__init__(name_scope)
+        self._scale = scale
+        self._shift = shift
+        self._begin_norm_axis = begin_norm_axis
+        self._epsilon = epsilon
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._act = act
+
+    def _build_once(self, input):
+        self._dtype = self._helper.input_dtype(input)
+        input_shape = input.shape
+        param_shape = [
+            reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
+        ]
+        if self._scale:
+            self._scale_w = self.create_parameter(
+                attr=self._param_attr,
+                shape=param_shape,
+                dtype=self._dtype,
+                default_initializer=Constant(1.0))
+        if self._shift:
+            assert self._bias_attr is not False
+            self._bias_w = self.create_parameter(
+                attr=self._bias_attr,
+                shape=param_shape,
+                dtype=self._dtype,
+                is_bias=True)
+
+    def forward(self, input):
+        inputs = dict()
+        inputs['X'] = input
+        if self._scale:
+            inputs['Scale'] = self._scale_w
+        if self._shift:
+            inputs['Bias'] = self._bias_w
+        # create output
+        mean_out = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        variance_out = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        layer_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        self._helper.append_op(
+            type="layer_norm",
+            inputs=inputs,
+            outputs={
+                "Y": layer_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "begin_norm_axis": self._begin_norm_axis
+            })
+
+        return self._helper.append_activation(layer_norm_out)
+
+
+class GRUUnit(layers.Layer):
+    """
+    **GRU unit layer**
+
+    if origin_mode is True, then the equation of a gru step is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
+
+
+    The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
+    of the equation above, the :math:`z_t` is split into 3 parts -
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
+    implement a full GRU unit operator for an input, a fully
+    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
+    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
+    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        input (Variable): The fc transformed input value of current step.
+        name_scope (str): See base class.
+        hidden (Variable): The hidden value of gru unit from previous step.
+        size (integer): The input dimension value.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, gru_unit will create ParamAttr as
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
+        activation (string): The activation type for cell (actNode).
+                             Default: 'tanh'
+        gate_activation (string): The activation type for gates (actGate).
+                                  Default: 'sigmoid'
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 activation='tanh',
+                 gate_activation='sigmoid',
+                 origin_mode=False,
+                 dtype='float32'):
+        super(GRUUnit, self).__init__(name_scope)
+
+        activation_dict = dict(
+            identity=0,
+            sigmoid=1,
+            tanh=2,
+            relu=3, )
+        activation = activation_dict[activation]
+        gate_activation = activation_dict[gate_activation]
+
+        self._dtype = dtype
+        size = size // 3
+        # create weight
+        self._weight = self.create_parameter(
+            attr=param_attr, shape=[size, 3 * size], dtype=dtype)
+
+        # create bias
+        bias_size = [1, 3 * size]
+        self._bias = self.create_parameter(
+            attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    def forward(self, input, hidden):
+        inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
+        if self._bias:
+            inputs['Bias'] = self._bias
+
+        gate = self._helper.create_variable_for_type_inference(self._dtype)
+        reset_hidden_pre = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        updated_hidden = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type='gru_unit',
+            inputs=inputs,
+            outputs={
+                'Gate': gate,
+                'ResetHiddenPrev': reset_hidden_pre,
+                'Hidden': updated_hidden,
+            },
+            attrs={
+                'activation': 2,  # tanh
+                'gate_activation': 1,  # sigmoid
+            })
+
+        return updated_hidden, reset_hidden_pre, gate
diff --git a/python/paddle/fluid/imperative/profiler.py b/python/paddle/fluid/imperative/profiler.py
new file mode 100644
index 0000000000..04c865500b
--- /dev/null
+++ b/python/paddle/fluid/imperative/profiler.py
@@ -0,0 +1,30 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from .. import core
+
+__all__ = [
+    'start_gperf_profiler',
+    'stop_gperf_profiler',
+]
+
+
+def start_gperf_profiler():
+    core.start_imperative_gperf_profiler()
+
+
+def stop_gperf_profiler():
+    core.stop_imperative_gperf_profiler()
diff --git a/python/paddle/fluid/imperative/tracer.py b/python/paddle/fluid/imperative/tracer.py
index 1064ad63e7..bd77de7424 100644
--- a/python/paddle/fluid/imperative/tracer.py
+++ b/python/paddle/fluid/imperative/tracer.py
@@ -36,14 +36,21 @@ class Tracer(core.Tracer):
         super(Tracer, self).__init__(block)
 
         self._ops = defaultdict()
+        self._vars = defaultdict()
         self._trace_id = 0
 
+    def trace_var(self, name, var):
+        self._vars[name] = var
+
+    def all_parameters(self):
+        return list((item for name, item in six.iteritems(self._vars)
+                     if isinstance(item, framework.Parameter)))
+
     def trace_op(self, op, stop_gradient=False):
         # record op's trace id
         op.iop._trace_id = self._trace_id
 
-        # trace op and save it
-        backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.block.desc,
+        backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs,
                                    framework._current_expected_place(),
                                    stop_gradient)
 
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 1775159798..4d55236272 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -26,12 +26,14 @@ from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
+from . import reader
+from .reader import *
 from . import core
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
     'load_persistables', 'save_inference_model', 'load_inference_model'
-]
+] + reader.__all__
 
 
 def is_parameter(var):
@@ -896,7 +898,7 @@ def save_inference_model(dirname,
                                      True is supported.
 
     Returns:
-        None
+        target_var_name_list(list): The fetch variables' name list
 
     Raises:
         ValueError: If `feed_var_names` is not a list of basestring.
@@ -949,11 +951,13 @@ def save_inference_model(dirname,
     # TODO(Superjomn) add an IR pass to remove 1-scale op.
     with program_guard(main_program):
         uniq_target_vars = []
-        for var in target_vars:
+        for i, var in enumerate(target_vars):
             if isinstance(var, Variable):
-                var1 = layers.scale(var, 1.)
-            uniq_target_vars.append(var1)
+                var = layers.scale(
+                    var, 1., name="save_infer_model/scale_{}".format(i))
+            uniq_target_vars.append(var)
         target_vars = uniq_target_vars
+    target_var_name_list = [var.name for var in target_vars]
 
     # when a pserver and a trainer running on the same machine, mkdir may conflict
     try:
@@ -1010,6 +1014,7 @@ def save_inference_model(dirname,
         params_filename = os.path.basename(params_filename)
 
     save_persistables(executor, dirname, main_program, params_filename)
+    return target_var_name_list
 
 
 def load_inference_model(dirname,
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 3504cb7935..a68160d797 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -268,11 +268,9 @@ class LayerHelperBase(object):
         """
         # Deepcopy the attr so that parameters can be shared in program
         attr = copy.deepcopy(attr)
-        if attr is None:
-            attr = ParamAttr._to_attr(attr)
+        attr = ParamAttr._to_attr(attr)
         if not attr:
             return None
-
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
         if attr.name is None:
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index a2a808777d..31effea378 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -33,6 +33,7 @@ from .detection import *
 from . import metric_op
 from .metric_op import *
 from .learning_rate_scheduler import *
+from .collective import *
 
 __all__ = []
 __all__ += nn.__all__
diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py
new file mode 100644
index 0000000000..a9bce77b9d
--- /dev/null
+++ b/python/paddle/fluid/layers/collective.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from ..layer_helper import LayerHelper, unique_name
+
+
+def _allreduce(x, out=None, reduce_type="sum"):
+    helper = LayerHelper("allreduce", **locals())
+    # Convert string reduce type to op int type
+    red_typ_int = 0
+    if reduce_type == "sum":
+        red_typ_int = 0
+    elif reduce_type == "prod":
+        red_typ_int = 1
+    elif reduce_type == "max":
+        red_typ_int = 2
+    elif reduce_type == "min":
+        red_typ_int = 3
+    else:
+        raise TypeError("reduce type can only be [sum|prod|max|min]")
+
+    if out is None:
+        out = helper.create_variable(
+            name=unique_name.generate(".".join([x.name, 'tmp'])),
+            shape=x.shape,
+            dtype=x.dtype,
+            type=x.type,
+            persistable=x.persistable,
+            stop_gradient=True)
+    helper.append_op(
+        type='allreduce',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={"reduce_type": red_typ_int})
+    return out
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index e7f704515d..3277766171 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -28,21 +28,9 @@ import six
 from functools import reduce
 
 __all__ = [
-    'While',
-    'Switch',
-    'increment',
-    'array_write',
-    'create_array',
-    'less_than',
-    'equal',
-    'array_read',
-    'array_length',
-    'IfElse',
-    'DynamicRNN',
-    'StaticRNN',
-    'reorder_lod_tensor_by_rank',
-    'Print',
-    'is_empty',
+    'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than',
+    'equal', 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN',
+    'reorder_lod_tensor_by_rank', 'Print', 'is_empty'
 ]
 
 
@@ -1448,12 +1436,13 @@ class DynamicRNN(object):
         self.input_array = []
         self.mem_link = []
 
-    def step_input(self, x):
+    def step_input(self, x, level=0):
         """
         Mark a sequence as a dynamic RNN input.
 
         Args:
             x(Variable): The input sequence.
+            level(int): The level of lod used to split steps. Default: 0.
 
         Returns:
             The current timestep in the input sequence.
@@ -1471,7 +1460,8 @@ class DynamicRNN(object):
             parent_block.append_op(
                 type='lod_rank_table',
                 inputs={"X": x},
-                outputs={"Out": self.lod_rank_table})
+                outputs={"Out": self.lod_rank_table},
+                attrs={"level": level})
             self.max_seq_len = parent_block.create_var(
                 name=unique_name.generate('dynamic_rnn_max_seq_len'),
                 dtype='int64')
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index cbedd70f85..0a1ddbc1db 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -49,6 +49,7 @@ __all__ = [
     'box_coder',
     'polygon_box_transform',
     'yolov3_loss',
+    'yolo_box',
     'box_clip',
     'multiclass_nms',
     'distribute_fpn_proposals',
@@ -515,6 +516,8 @@ def yolov3_loss(x,
                 class_num,
                 ignore_thresh,
                 downsample_ratio,
+                gtscore=None,
+                use_label_smooth=True,
                 name=None):
     """
     ${comment}
@@ -533,28 +536,35 @@ def yolov3_loss(x,
         class_num (int): ${class_num_comment}
         ignore_thresh (float): ${ignore_thresh_comment}
         downsample_ratio (int): ${downsample_ratio_comment}
-        name (string): the name of yolov3 loss
+        name (string): the name of yolov3 loss. Default None.
+        gtscore (Variable): mixup score of ground truth boxes, shoud be in shape
+                            of [N, B]. Default None.
+        use_label_smooth (bool): ${use_label_smooth_comment}
 
     Returns:
-        Variable: A 1-D tensor with shape [1], the value of yolov3 loss
+        Variable: A 1-D tensor with shape [N], the value of yolov3 loss
 
     Raises:
         TypeError: Input x of yolov3_loss must be Variable
-        TypeError: Input gtbox of yolov3_loss must be Variable"
-        TypeError: Input gtlabel of yolov3_loss must be Variable"
+        TypeError: Input gtbox of yolov3_loss must be Variable
+        TypeError: Input gtlabel of yolov3_loss must be Variable
+        TypeError: Input gtscore of yolov3_loss must be None or Variable
         TypeError: Attr anchors of yolov3_loss must be list or tuple
         TypeError: Attr class_num of yolov3_loss must be an integer
         TypeError: Attr ignore_thresh of yolov3_loss must be a float number
+        TypeError: Attr use_label_smooth of yolov3_loss must be a bool value
 
     Examples:
       .. code-block:: python
 
           x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-          gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
-          gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+          gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
+          gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
+          gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')
           anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
           anchor_mask = [0, 1, 2]
-          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, 
+          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel,
+                                          gtscore=gtscore, anchors=anchors, 
                                           anchor_mask=anchor_mask, class_num=80,
                                           ignore_thresh=0.7, downsample_ratio=32)
     """
@@ -566,6 +576,8 @@ def yolov3_loss(x,
         raise TypeError("Input gtbox of yolov3_loss must be Variable")
     if not isinstance(gtlabel, Variable):
         raise TypeError("Input gtlabel of yolov3_loss must be Variable")
+    if gtscore is not None and not isinstance(gtscore, Variable):
+        raise TypeError("Input gtscore of yolov3_loss must be Variable")
     if not isinstance(anchors, list) and not isinstance(anchors, tuple):
         raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
     if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
@@ -575,6 +587,9 @@ def yolov3_loss(x,
     if not isinstance(ignore_thresh, float):
         raise TypeError(
             "Attr ignore_thresh of yolov3_loss must be a float number")
+    if not isinstance(use_label_smooth, bool):
+        raise TypeError(
+            "Attr use_label_smooth of yolov3_loss must be a bool value")
 
     if name is None:
         loss = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -585,21 +600,26 @@ def yolov3_loss(x,
     objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
     gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
 
+    inputs = {
+        "X": x,
+        "GTBox": gtbox,
+        "GTLabel": gtlabel,
+    }
+    if gtscore:
+        inputs["GTScore"] = gtscore
+
     attrs = {
         "anchors": anchors,
         "anchor_mask": anchor_mask,
         "class_num": class_num,
         "ignore_thresh": ignore_thresh,
         "downsample_ratio": downsample_ratio,
+        "use_label_smooth": use_label_smooth,
     }
 
     helper.append_op(
         type='yolov3_loss',
-        inputs={
-            "X": x,
-            "GTBox": gtbox,
-            "GTLabel": gtlabel,
-        },
+        inputs=inputs,
         outputs={
             'Loss': loss,
             'ObjectnessMask': objectness_mask,
@@ -609,6 +629,83 @@ def yolov3_loss(x,
     return loss
 
 
+@templatedoc(op_type="yolo_box")
+def yolo_box(x,
+             img_size,
+             anchors,
+             class_num,
+             conf_thresh,
+             downsample_ratio,
+             name=None):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        img_size (Variable): ${img_size_comment}
+        anchors (list|tuple): ${anchors_comment}
+        class_num (int): ${class_num_comment}
+        conf_thresh (float): ${conf_thresh_comment}
+        downsample_ratio (int): ${downsample_ratio_comment}
+        name (string): the name of yolo box layer. Default None.
+
+    Returns:
+        Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
+        and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification 
+        scores of boxes.
+
+    Raises:
+        TypeError: Input x of yolov_box must be Variable
+        TypeError: Attr anchors of yolo box must be list or tuple
+        TypeError: Attr class_num of yolo box must be an integer
+        TypeError: Attr conf_thresh of yolo box must be a float number
+
+    Examples:
+
+    .. code-block:: python
+
+        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+        anchors = [10, 13, 16, 30, 33, 23]
+        loss = fluid.layers.yolo_box(x=x, class_num=80, anchors=anchors, 
+                                        conf_thresh=0.01, downsample_ratio=32)
+    """
+    helper = LayerHelper('yolo_box', **locals())
+
+    if not isinstance(x, Variable):
+        raise TypeError("Input x of yolo_box must be Variable")
+    if not isinstance(img_size, Variable):
+        raise TypeError("Input img_size of yolo_box must be Variable")
+    if not isinstance(anchors, list) and not isinstance(anchors, tuple):
+        raise TypeError("Attr anchors of yolo_box must be list or tuple")
+    if not isinstance(class_num, int):
+        raise TypeError("Attr class_num of yolo_box must be an integer")
+    if not isinstance(conf_thresh, float):
+        raise TypeError("Attr ignore_thresh of yolo_box must be a float number")
+
+    boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
+    scores = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    attrs = {
+        "anchors": anchors,
+        "class_num": class_num,
+        "conf_thresh": conf_thresh,
+        "downsample_ratio": downsample_ratio,
+    }
+
+    helper.append_op(
+        type='yolo_box',
+        inputs={
+            "X": x,
+            "ImgSize": img_size,
+        },
+        outputs={
+            'Boxes': boxes,
+            'Scores': scores,
+        },
+        attrs=attrs)
+    return boxes, scores
+
+
 @templatedoc()
 def detection_map(detect_res,
                   label,
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index a9b391fd53..94fd9f3ea5 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -563,22 +563,26 @@ def _py_reader(capacity,
 
     def start_provide_thread(func):
         def __provider_thread__():
-            for tensors in func():
-                array = core.LoDTensorArray()
-                for item in tensors:
-                    if not isinstance(item, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(item, core.CPUPlace())
-                        item = tmp
-
-                    array.append(item)
-
-                if reader.exited:
-                    break
-                feed_queue.push(array)
-                if reader.exited:
-                    break
-            feed_queue.close()
+            try:
+                for tensors in func():
+                    array = core.LoDTensorArray()
+                    for item in tensors:
+                        if not isinstance(item, core.LoDTensor):
+                            tmp = core.LoDTensor()
+                            tmp.set(item, core.CPUPlace())
+                            item = tmp
+
+                        array.append(item)
+
+                    if reader.exited:
+                        break
+                    feed_queue.push(array)
+                    if reader.exited:
+                        break
+                feed_queue.close()
+            except Exception as ex:
+                feed_queue.close()
+                raise ex
 
         reader.thread = threading.Thread(target=__provider_thread__)
         reader.thread.daemon = True
@@ -628,6 +632,9 @@ def _py_reader(capacity,
     reader.reset = __reset__
     reader.decorate_tensor_provider = __set_tensor_provider__
     reader.decorate_paddle_reader = __set_paddle_reader__
+
+    reader.decorate_batch_generator = __set_tensor_provider__
+    reader.decorate_sample_list_generator = __set_paddle_reader__
     reader.start = __start__
 
     return reader
@@ -692,6 +699,11 @@ def py_reader(capacity,
         >>>             exe.run(fetch_list=[loss.name])
         >>>     except fluid.core.EOFException:
         >>>         reader.reset()
+        >>>
+        >>> ...
+        >>>
+        >>> fluid.io.save_inference_model(dirname='./model', feeded_var_names=[img, label],
+        >>>     target_vars=[loss], executor=fluid.Executor(fluid.CUDAPlace(0)))  
 
         2. When training and testing are both performed, two different
         :code:`py_reader` should be created with different names, e.g.:
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index a458cebfb1..734383655c 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -174,6 +174,8 @@ def monkey_patch_variable():
         ("__rtruediv__", "elementwise_div", True),
         ("__pow__", "elementwise_pow", False),
         ("__rpow__", "elementwise_pow", True),
+        ("__floordiv__", "elementwise_floordiv", False),
+        ("__mod__", "elementwise_mod", False),
             # for logical compare
         ("__eq__", "equal", False),
         ("__ne__", "not_equal", False),
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9d1d5fe093..713aedb81e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -23,7 +23,8 @@ import os
 import inspect
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder
+from ..framework import Variable, OpProtoHolder, _in_imperative_mode
+from ..imperative import base
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
 from .tensor import concat, assign
@@ -188,6 +189,7 @@ __all__ = [
     'huber_loss',
     'tree_conv',
     'npair_loss',
+    'fsp_matrix',
 ]
 
 kIgnoreIndex = -100
@@ -205,16 +207,23 @@ def fc(input,
     **Fully Connected Layer**
 
     This function creates a fully connected layer in the network. It can take
-    multiple tensors as its inputs. It creates a variable called weights for
-    each input tensor, which represents a fully connected weight matrix from
-    each input unit to each output unit. The fully connected layer multiplies
-    each input tensor with its coresponding weight to produce an output Tensor.
-    If multiple input tensors are given, the results of multiple multiplications
-    will be sumed up. If bias_attr is not None, a bias variable will be created
-    and added to the output. Finally, if activation is not None, it will be applied
-    to the output as well.
+    one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [M, `size`],
+    where M is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+    is not None, a bias variable will be created and added to the output.
+    Finally, if activation is not None, it will be applied to the output as well.
+
+    When the input is single tensor:
 
-    This process can be formulated as follows:
+    .. math::
+
+        Out = Act({XW + b})
+
+    When the input are multiple tensors:
 
     .. math::
 
@@ -222,13 +231,31 @@ def fc(input,
 
     In the above equation:
 
-    * :math:`N`: Number of the input.
-    * :math:`X_i`: The input tensor.
-    * :math:`W`: The weights created by this layer.
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
     * :math:`b`: The bias parameter created by this layer (if needed).
     * :math:`Act`: The activation function.
     * :math:`Out`: The output tensor.
 
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            data_1.data = [[[0.1, 0.2],
+                           [0.3, 0.4]]]
+            data_1.shape = (1, 2, 2) # 1 is batch_size
+
+            data_2 = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3)
+
+            out = fluid.layers.fc(input=[data_1, data_2], size=2)
+
+        Then:
+            out.data = [[0.18669507, 0.1893476]]
+            out.shape = (1, 2)
+
     Args:
         input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
             the input tensor(s) is at least 2.
@@ -260,8 +287,14 @@ def fc(input,
     Examples:
         .. code-block:: python
 
+          # when input is single tensor
           data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+
+          # when input are multiple tensors
+          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
+          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
+          fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
     """
 
     helper = LayerHelper("fc", **locals())
@@ -1316,7 +1349,7 @@ def dropout(x,
                                         1. downgrade_in_infer(default), downgrade the outcome at inference
 
                                            - train: out = input * mask
-                                           - inference: out = input * dropout_prob
+                                           - inference: out = input * (1.0 - dropout_prob)
 
                                            (mask is a tensor same shape with input, value is 0 or 1
                                            ratio of 0 is dropout_prob)
@@ -1432,6 +1465,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
           predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
           cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
+    if not soft_label:
+        return cross_entropy2(input, label, ignore_index)
     helper = LayerHelper('cross_entropy', **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
@@ -1444,6 +1479,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     return out
 
 
+def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
+    helper = LayerHelper('cross_entropy2', **locals())
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
+    match_x = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='cross_entropy2',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out],
+                 'MatchX': [match_x],
+                 'XShape': [xshape]},
+        attrs={'ignore_index': ignore_index})
+    return out
+
+
 def bpr_loss(input, label, name=None):
     """
     Bayesian Personalized Ranking Loss Operator.
@@ -2904,11 +2955,17 @@ def batch_norm(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta
 
     Args:
-        input(variable): The input variable which is a LoDTensor.
+        input(variable): The rank of input variable can be 2, 3, 4, 5.
         act(string, Default None): Activation type, linear|relu|prelu|...
-        is_test(bool, Default False): Used for training or training.
-        momentum(float, Default 0.9):
-        epsilon(float, Default 1e-05):
+        is_test (bool, Default False): A flag indicating whether it is in
+            test phrase or not.
+        momentum(float, Default 0.9): The value used for the moving_mean and
+            moving_var computation. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, Default 1e-05): A value added to the denominator for
+            numerical stability. Default is 1e-5.
         param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
              of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
              will create ParamAttr as param_attr. If the Initializer of the param_attr
@@ -2966,15 +3023,8 @@ def batch_norm(input,
         shape=param_shape,
         dtype=dtype,
         default_initializer=Constant(1.0))
-    # setting stop_gradient=True to reduce computation
-    if use_global_stats and helper.param_attr.learning_rate == 0.:
-        scale.stop_gradient = True
-
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-    # setting stop_gradient=True to reduce computation
-    if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        bias.stop_gradient = True
 
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -4847,10 +4897,14 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         if transpose_y:
             y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
         if x_shape[-1] != y_shape[-2]:
-            raise ValueError("Invalid inputs for matmul.")
+            raise ValueError("Invalid inputs for matmul. x: %s, y: %s\n" %
+                             (x_shape, y_shape))
 
         if len(y_shape) > 2 and len(x_shape) > 2:
             for i, dim_x in enumerate(x_shape[:-2]):
+                # don't check neg shape
+                if dim_x < 0 or y_shape[i] < 0:
+                    continue
                 if dim_x != y_shape[i]:
                     raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
                                      (x.shape, y.shape))
@@ -6152,7 +6206,8 @@ def one_hot(input, depth):
         type="one_hot",
         inputs={'X': input},
         attrs={'depth': depth},
-        outputs={'Out': one_hot_out})
+        outputs={'Out': one_hot_out},
+        stop_gradient=True)
     return one_hot_out
 
 
@@ -6350,6 +6405,8 @@ def squeeze(input, axes, name=None):
             x = layers.data(name='x', shape=[5, 1, 10])
             y = layers.sequeeze(input=x, axes=[1])
     """
+    assert not _in_imperative_mode(), (
+        "squeeze layer is not supported in imperative mode yet.")
     helper = LayerHelper("squeeze", **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -9087,6 +9144,10 @@ def _elementwise_op(helper):
     op_type = helper.layer_type
     x = helper.kwargs.get('x', None)
     y = helper.kwargs.get('y', None)
+    if _in_imperative_mode():
+        x = base.to_variable(x)
+        y = base.to_variable(y)
+
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     assert y is not None, 'y cannot be None in {}'.format(op_type)
     axis = helper.kwargs.get('axis', -1)
@@ -9172,9 +9233,24 @@ def elementwise_pow(x, y, axis=-1, act=None, name=None):
     return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
 
 
+def elementwise_mod(x, y, axis=-1, act=None, name=None):
+    return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
+
+
+def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
+    return _elementwise_op(LayerHelper('elementwise_floordiv', **locals()))
+
+
 for func in [
-        elementwise_add, elementwise_div, elementwise_sub, elementwise_mul,
-        elementwise_max, elementwise_min, elementwise_pow
+        elementwise_add,
+        elementwise_div,
+        elementwise_sub,
+        elementwise_mul,
+        elementwise_max,
+        elementwise_min,
+        elementwise_pow,
+        elementwise_mod,
+        elementwise_floordiv,
 ]:
     op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
     func.__doc__ = _generate_doc_string_(
@@ -9650,7 +9726,12 @@ def sequence_reverse(x, name=None):
     return out
 
 
-def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
+def affine_channel(x,
+                   scale=None,
+                   bias=None,
+                   data_layout='NCHW',
+                   name=None,
+                   act=None):
     """
     Applies a separate affine transformation to each channel of the input.
     Useful for replacing spatial batch norm with its equivalent fixed
@@ -9669,6 +9750,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
         data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
             tensor, you can ignore data_layout.
         name (str, default None): The name of this layer.
+        act (str, default None): Activation to be applied to the output of this layer.
 
     Returns:
         out (Variable): A tensor of the same shape and data layout with x.
@@ -9688,7 +9770,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
                 'Bias': bias},
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
-    return out
+    return helper.append_activation(out)
 
 
 def similarity_focus(input, axis, indexes, name=None):
@@ -10704,8 +10786,52 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
 
     similarity_matrix = matmul(
         anchor, positive, transpose_x=False, transpose_y=True)
-    softmax_value = softmax(similarity_matrix)
-    cross_entropy = -1 * reduce_sum(labels * log(softmax_value), 0)
+    softmax_ce = softmax_with_cross_entropy(
+        logits=similarity_matrix, label=labels, soft_label=True)
+    cross_entropy = reduce_sum(labels * softmax_ce, 0)
     celoss = reduce_mean(cross_entropy)
 
     return l2loss + celoss
+
+
+def fsp_matrix(x, y):
+    """
+
+    **FSP matrix op**
+
+    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
+    Given feature map x with shape [x_channel, h, w] and feature map y with shape
+    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
+
+    1. reshape x into matrix with shape [x_channel, h * w] and reshape and
+       transpose y into matrix with shape [h * w, y_channel].
+    2. multiply x and y to get fsp matrix with shape [x_channel, y_channel].
+
+    The output is a batch of fsp matrices.
+
+    Args:
+
+        x (Variable): A feature map with shape [batch_size, x_channel, height, width].
+        y (Variable): A feature map with shape [batch_size, y_channel, height, width].
+                      The y_channel can be different with the x_channel of Input(X)
+                      while the other dimensions must be the same with Input(X)'s.
+
+    Returns:
+
+        fsp matrix (Variable): The output of FSP op with shape [batch_size, x_channel, y_channel].
+        The x_channel is the channel of x and the y_channel is the channel of y.
+
+    Examples:
+
+        .. code-block:: python
+
+            feature_map_0 = fluid.layers.conv2d(x)
+            feature_map_1 = fluid.layers.conv2d(feature_map_0)
+            loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
+
+    """
+    helper = LayerHelper('fsp_matrix', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype(
+        input_param_name='x'))
+    helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 4381727a09..f018bb8af8 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -23,6 +23,7 @@ __activations_noattr__ = [
     'logsigmoid',
     'exp',
     'tanh',
+    'atan',
     'tanh_shrink',
     'softshrink',
     'sqrt',
@@ -30,6 +31,8 @@ __activations_noattr__ = [
     'ceil',
     'floor',
     'cos',
+    'acos',
+    'asin',
     'sin',
     'round',
     'reciprocal',
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cb97398698..a18e5b6a9c 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
-    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
-    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
-    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
+    'create_tensor',
+    'create_parameter',
+    'create_global_var',
+    'cast',
+    'tensor_array_to_tensor',
+    'concat',
+    'sums',
+    'assign',
+    'fill_constant_batch_size_like',
+    'fill_constant',
+    'argmin',
+    'argmax',
+    'argsort',
+    'ones',
+    'zeros',
+    'reverse',
+    'has_inf',
+    'has_nan',
+    'isfinite',
+    'range',
 ]
 
 
@@ -764,3 +780,50 @@ def isfinite(x):
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
     return out
+
+
+def range(start, end, step, dtype):
+    """
+    Return evenly spaced values within a given interval.
+
+    Values are generated within the half-open interval [start, stop) (in other words,
+    the interval including start but excluding stop).
+
+    args:
+        start(int|float|Variable): Start of interval. The interval includes this value.
+        end(int|float|Variable): End of interval. The interval does not include this
+                                 value, except in some cases where step is not an integer
+                                 and floating point round-off affects the length of out. 
+        step(int|float|Variable): Spacing between values. For any output out, this is the
+                                  distance between two adjacent values, out[i+1] - out[i].
+                                  The default step size is 1.
+        dtype(string): 'float32'|'int32'|..., the data type of the output tensor.
+
+    returns:
+        Evenly spaced values within a given interval.
+
+    examples:
+
+        .. code-block:: python
+
+             data = fluid.layers.range(0, 10, 2, 'int32')
+
+    """
+    helper = LayerHelper("range", **locals())
+
+    if not isinstance(start, Variable):
+        start = fill_constant([1], dtype, start)
+    if not isinstance(end, Variable):
+        end = fill_constant([1], dtype, end)
+    if not isinstance(step, Variable):
+        step = fill_constant([1], dtype, step)
+
+    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+
+    helper.append_op(
+        type='range',
+        inputs={'Start': start,
+                'End': end,
+                'Step': step},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index f01924317d..eb2f523539 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -70,6 +70,10 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._opti_name_list = []
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
 
     def _create_global_learning_rate(self):
         if imperative_base.enabled():
@@ -185,8 +189,13 @@ class Optimizer(object):
         if shape == None:
             shape = param.shape
         assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_" + name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
         var = self.helper.create_global_variable(
-            name=unique_name.generate(name),
+            name=var_name,
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
@@ -396,28 +405,30 @@ class Optimizer(object):
             and list of (param, grad) Variables pair for optimization.
         """
         self._dtype = loss.dtype
-        program = loss.block.program
         optimize_ops = []
         if framework._in_imperative_mode():
             if parameter_list is not None:
                 parameters = parameter_list
             else:
-                parameters = program.global_block().all_parameters()
+                parameters = framework._imperative_tracer().all_parameters()
 
             params_grads = []
             for param in parameters:
-                if param.stop_gradient or not param.trainable:
+                if not param.trainable:
                     continue
-                # create gradient variable
-                grad_var = Variable(
-                    block=loss.block,
-                    name=param._ivar._grad_name(),
-                    stop_gradient=True,
-                    ivar=param._ivar._grad_ivar())
-                params_grads.append((param, grad_var))
-            with program_guard(program, startup_program):
+                if param._ivar._grad_ivar() is not None:
+                    # create gradient variable
+                    grad_var = Variable(
+                        block=loss.block,
+                        name=param._ivar._grad_name(),
+                        stop_gradient=True,
+                        ivar=param._ivar._grad_ivar())
+                    params_grads.append((param, grad_var))
+            with program_guard(framework.default_main_program(),
+                               framework.default_startup_program()):
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
+            program = loss.block.program
             with program_guard(program, startup_program):
                 params_grads = self.backward(loss, startup_program,
                                              parameter_list, no_grad_set)
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 517418da1c..6702fc808b 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -99,7 +99,8 @@ class ParallelExecutor(object):
         build_strategy.num_trainers = num_trainers
         build_strategy.trainer_id = trainer_id
 
-        self._places = compiler.get_available_places(use_cuda)
+        self._places = framework.cuda_places(
+        ) if use_cuda else framework.cpu_places()
         self._scope = scope if scope is not None else executor.global_scope()
 
         main_program = main_program if main_program is not None \
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
new file mode 100644
index 0000000000..74ee2828de
--- /dev/null
+++ b/python/paddle/fluid/reader.py
@@ -0,0 +1,373 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import core
+import six
+import threading
+from .framework import Program, Variable, program_guard, default_main_program, default_startup_program
+from .executor import global_scope
+from .data_feeder import DataFeeder, BatchedTensorProvider
+from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
+from .unique_name import UniqueNameGenerator
+
+__all__ = ['PyReader']
+
+
+def _convert_places(places):
+    if not isinstance(places, (list, tuple)):
+        places = [places]
+
+    ret = []
+    for p in places:
+        if not isinstance(p, core.Place):
+            tmp = core.Place()
+            tmp.set_place(p)
+            p = tmp
+
+        ret.append(p)
+    return ret
+
+
+class PyReader(object):
+    """
+    Create a reader object for data feeding in Python. 
+    Data would be prefetched using Python thread and be pushed
+    into a queue asynchronously. Data in the queue would be extracted 
+    automatically when `Executor.run(...)` is called.
+
+    Args:  
+        feed_list (list(Variable)|tuple(Variable)): feed variable list.
+            The variables should be created by :code:`fluid.layers.data()`. 
+        capacity (int): capacity of the queue maintained in PyReader object. 
+        use_double_buffer (bool): whether to use double_buffer_reader to 
+            speed up data feeding. 
+        iterable (bool): whether the created reader object is iterable.   
+
+    Returns:
+        reader (Reader): the created reader object.
+
+    Examples:
+        1. If iterable = False, the created PyReader object is almost the
+           same as :code:`fluid.layers.py_reader()`. Operators would be 
+           inserted into the program. User should call :code:`start()` 
+           before each epoch and catch :code:`fluid.core.EOFException`
+           thrown by :code:`Executor.run()` when epoch ends. Once the 
+           exception is caught, user should call :code:`reset()` to reset 
+           the reader manually.
+
+        .. code-block:: python
+            
+            image = fluid.layers.data(
+                        name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+            
+            reader = fluid.io.PyReader(feed_list=[image, label], 
+                        capacity=4, iterable=False)
+            reader.decorate_sample_list_generator(user_defined_reader)
+            ... # definition of network is omitted
+            executor.run(fluid.default_main_program())
+            for _ in range(EPOCH_NUM):
+                reader.start()
+                while True:
+                    try:
+                        executor.run(feed=None, ...)
+                    except fluid.core.EOFException:
+                        reader.reset()
+                        break
+                    
+        2. If iterable=True, the created PyReader object is decoupled with
+           the program. No operator would be inserted into the program. 
+           In this case, the created reader is a Python generator, which 
+           is iterable. User should feed the data yielded from PyReader 
+           object into :code:`Executor.run(feed=...)`.  
+
+        .. code-block:: python
+
+            image = fluid.layers.data(
+                        name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+
+            reader = fluid.io.PyReader(feed_list=[image, label], 
+                        capacity=4, iterable=True)
+            reader.decorate_sample_list_generator(user_defined_reader, 
+                        places=fluid.cuda_places())
+            ... # definition of network is omitted
+            executor.run(fluid.default_main_program())
+            for _ in range(EPOCH_NUM):
+                for data in reader():
+                    executor.run(feed=data, ...)
+    """
+
+    unique_name_generator = UniqueNameGenerator()
+
+    def __init__(self,
+                 feed_list,
+                 capacity,
+                 use_double_buffer=True,
+                 iterable=False):
+        self._tensor_reader = None
+        self._thread = None
+        self._iterable = iterable
+        self._use_double_buffer = use_double_buffer
+        self._capacity = capacity
+        self._feed_list = feed_list
+        if not self._iterable:
+            self._init_non_iterable()
+
+    def _init_iterable(self, places):
+        self._var_names = [v.name for v in self._feed_list]
+        self._places = _convert_places(places)
+        self._queue = core.init_lod_tensor_blocking_queue(core.Variable(),
+                                                          self._capacity)
+        self._reader = core.create_py_reader(
+            self.queue, self._var_names, self._places, self._use_double_buffer)
+
+    def _init_non_iterable(self):
+        lod_levels = []
+        dtypes = []
+        shape_concat = []
+        ranks = []
+        shapes = []
+
+        for feed_data in self._feed_list:
+            dtypes.append(feed_data.dtype)
+            shape_concat.extend(feed_data.shape)
+            ranks.append(len(feed_data.shape))
+            shapes.append(feed_data.shape)
+            lod_levels.append(feed_data.lod_level)
+
+        queue_name = PyReader.unique_name_generator('lod_tensor_blocking_queue')
+        reader_name = PyReader.unique_name_generator('create_py_reader')
+        double_buffer_name = PyReader.unique_name_generator('double_buffer')
+
+        var = global_scope().var(queue_name)
+        self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity)
+
+        startup_blk = default_startup_program().current_block()
+        startup_var = startup_blk.create_var(name=reader_name)
+
+        startup_blk.append_op(
+            type='create_py_reader',
+            inputs={'blocking_queue': [queue_name]},
+            outputs={'Out': [startup_var]},
+            attrs={
+                'shape_concat': shape_concat,
+                'lod_levels': lod_levels,
+                'ranks': ranks
+            })
+
+        startup_var.desc.set_dtypes(dtypes)
+        startup_var.persistable = True
+
+        main_prog_var = _copy_reader_var_(
+            default_main_program().current_block(), startup_var)
+
+        main_prog_var.stop_gradient = True
+        main_prog_var.persistable = True
+
+        reader = monkey_patch_reader_methods(main_prog_var)
+        if self._use_double_buffer:
+            double_buffer_reader = double_buffer(
+                reader, name=double_buffer_name)
+            # we return a double buffer reader. However, the reset method comes from
+            # py_reader.
+            double_buffer_reader.reset = reader.reset
+            reader = double_buffer_reader
+
+        self._reader = reader
+
+        default_main_program().current_block().append_op(
+            type='read',
+            inputs={'Reader': [self._reader]},
+            outputs={'Out': self._feed_list})
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @property
+    def iterable(self):
+        return self._iterable
+
+    def __call__(self):
+        assert self.iterable, "PyReader is not iterable"
+        assert self._tensor_reader is not None, \
+            "Data source of PyReader has not set yet"
+
+        class Iterator(object):
+            def __init__(self, reader):
+                self._reader = reader._reader
+                self._reset = reader._reset
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                return self.next()
+
+            def next(self):
+                ret = self._reader.read_next()
+                if ret:
+                    return ret
+                else:
+                    self._reset()
+                    raise StopIteration
+
+        self._start()
+        return Iterator(self)
+
+    def _reset(self):
+        self._reader.reset()
+        self._thread.join()
+
+    def start(self):
+        '''
+        Start the data feeding thread. 
+        Can only call when the reader object is not iterable.  
+        '''
+        assert not self._iterable, "start() cannot be called when PyReader is iterable"
+        self._start()
+
+    def reset(self):
+        '''
+        Reset the reader object when :code:`fluid.core.EOFException` raises. 
+        Can only call when the reader object is not iterable.
+        '''
+        assert not self._iterable, "reset() cannot be called when PyReader is iterable"
+        self._reset()
+
+    def _start(self):
+        def __thread_main__():
+            try:
+                for tensors in self._tensor_reader():
+                    array = core.LoDTensorArray()
+                    for item in tensors:
+                        if not isinstance(item, core.LoDTensor):
+                            tmp = core.LoDTensor()
+                            tmp.set(item, core.CPUPlace())
+                            item = tmp
+
+                        array.append(item)
+
+                    if not self._queue.push(array):
+                        break
+
+                self._queue.close()
+            except Exception as ex:
+                self._queue.close()
+                raise ex
+
+        self._thread = threading.Thread(target=__thread_main__)
+        self._thread.daemon = True
+        self._thread.start()
+
+    def decorate_sample_generator(self,
+                                  sample_generator,
+                                  batch_size,
+                                  drop_last=True,
+                                  places=None):
+        '''
+        Set the data source of the PyReader object.
+        
+        The provided :code:`sample_generator` should be a Python generator,
+        which yields numpy.ndarray typed data of each sample.
+
+        :code:`places` must be set when the PyReader object is iterable.
+
+        If all inputs have no lods, this method is faster than 
+        :code:`decorate_sample_list_generator(paddle.batch(sample_generator, ...))` .
+
+        Args:
+            sample_generator (generator): Python generator that yields
+                numpy.ndarray-typed sample data.
+            batch_size (int): batch size. Must be larger than 0.
+            drop_last (bool): Whether to drop the last batch when sample number
+                is less than batch_size. 
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert batch_size > 0, "batch_size must be larger than 0"
+        has_lod = False
+        for f in self._feed_list:
+            if f.lod_level != 0:
+                has_lod = True
+                break
+
+        if has_lod:
+            self.decorate_sample_list_generator(
+                paddle.batch(
+                    sample_generator,
+                    batch_size=batch_size,
+                    drop_last=drop_last),
+                places=places)
+        else:
+            reader = BatchedTensorProvider(
+                feed_list=self._feed_list,
+                place=core.CPUPlace(),
+                batch_size=batch_size,
+                generator=sample_generator,
+                drop_last=drop_last)
+            self.decorate_batch_generator(reader, places=places)
+
+    def decorate_sample_list_generator(self, reader, places=None):
+        '''
+        Set the data source of the PyReader object. 
+
+        The provided :code:`reader` should be a Python generator,
+        which yields list(numpy.ndarray) typed batched data. 
+        
+        :code:`places` must be set when the PyReader object is iterable.
+
+        Args:
+            reader (generator): Python generator that yields 
+                list(numpy.ndarray)-typed batched data. 
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert self._tensor_reader is None, \
+            "Cannot reset the data source of PyReader"
+        with program_guard(Program(), Program()):
+            feeder = DataFeeder(
+                feed_list=self._feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(reader, multi_devices=False)
+
+        def __tensor_reader_impl__():
+            for slots in paddle_reader():
+                yield [slots[var.name] for var in self._feed_list]
+
+        self.decorate_batch_generator(__tensor_reader_impl__, places)
+
+    def decorate_batch_generator(self, reader, places=None):
+        '''
+        Set the data source of the PyReader object.
+
+        The provided :code:`reader` should be a Python generator,
+        which yields numpy.ndarray-typed or LoDTensor-typed batched data.
+
+        :code:`places` must be set when the PyReader object is iterable.
+
+        Args:
+            reader (generator): Python generator that yields LoDTensor-typed
+                batched data.
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert self._tensor_reader is None, \
+            "Cannot reset the data source of PyReader"
+        self._tensor_reader = reader
+        if self._iterable:
+            assert places is not None, "Places cannot be None when py_reader is iterable"
+            self._init_iterable(places)
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 6218db7345..7d1b869cf5 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -476,11 +476,29 @@ class TestYoloDetection(unittest.TestCase):
             x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
             gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
             gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
-            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13],
-                                      [0, 1], 10, 0.7, 32)
+            gtscore = layers.data(name='gtscore', shape=[10], dtype='float32')
+            loss = layers.yolov3_loss(
+                x,
+                gtbox,
+                gtlabel, [10, 13, 30, 13], [0, 1],
+                10,
+                0.7,
+                32,
+                gtscore=gtscore,
+                use_label_smooth=False)
 
             self.assertIsNotNone(loss)
 
+    def test_yolo_box(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
+            img_size = layers.data(name='img_size', shape=[2], dtype='int32')
+            boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10,
+                                            0.01, 32)
+            self.assertIsNotNone(boxes)
+            self.assertIsNotNone(scores)
+
 
 class TestBoxClip(unittest.TestCase):
     def test_box_clip(self):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 562866cf60..1bc647126d 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
@@ -95,14 +96,16 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
-	py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-	set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+        set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
+        set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
         # FIXME(typhoonzero): add these tests back
-	# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-	# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+        # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+        # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
-    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
+    # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
@@ -115,8 +118,8 @@ if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # change the timeout from 600 to 1200, because in debug mode, this test need more time.
-    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
+    # change the timeout from 600 to 2200, because in debug mode, this test need more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
 endif()
 
 if (WITH_NGRAPH)
diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
new file mode 100644
index 0000000000..88a3cd14c4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu",
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu",
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+    return predict
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2, single_device=False):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+
+        # Reader
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+
+        # Optimization
+        # TODO(typhoonzero): fix distributed adam optimizer
+        # opt = fluid.optimizer.AdamOptimizer(
+        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
+        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
+        if single_device:
+            opt.minimize(avg_cost)
+        else:
+            # multi device or distributed multi device
+            params_grads = opt.backward(avg_cost)
+            data_parallel_param_grads = []
+            for p, g in params_grads:
+                # NOTE: scale will be done on loss scale in multi_devices_graph_pass using nranks.
+                grad_reduce = fluid.layers.collective._allreduce(g)
+                data_parallel_param_grads.append([p, grad_reduce])
+            opt.apply_gradients(data_parallel_param_grads)
+
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
index 95e39d891f..48a4768782 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
@@ -20,7 +20,7 @@ logging.basicConfig()
 logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
 
-DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
+DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
 DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
 """
 avazu_ctr_data/train.txt
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
new file mode 100644
index 0000000000..0b6556746c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
@@ -0,0 +1,124 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+
+
+class TestConcatOp(OpTest):
+    def setUp(self):
+        self.op_type = "concat"
+        self.use_mkldnn = True
+        self._cpu_only = True
+        self.init_axis()
+        self.init_shape()
+        self.init_test_data()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': True}
+
+        self.output = np.concatenate(
+            (self.x0, self.x1, self.x2), axis=self.axis).astype('int')
+
+        self.outputs = {'Out': self.output}
+
+    def test_check_output(self):
+        self.check_output()
+
+#--------------------test concat s8 in with axis 0--------------------
+
+    def init_test_data(self):
+        self.x0 = (np.random.randint(0, 100, self.x0_shape) - 50).astype('int8')
+        self.x1 = (np.random.randint(0, 80, self.x1_shape) - 30).astype('int8')
+        self.x2 = (np.random.randint(0, 110, self.x2_shape) - 80).astype('int8')
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_shape(self):
+        self.x0_shape = [2, 2, 1, 2]
+        self.x1_shape = [1, 2, 1, 2]
+        self.x2_shape = [3, 2, 1, 2]
+
+
+#--------------------test concat u8 in with axis 0--------------------
+
+
+class TestConcatOp2(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8')
+        self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8')
+        self.x2 = (np.random.randint(0, 80, self.x2_shape)).astype('uint8')
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_shape(self):
+        self.x0_shape = [2, 1, 5, 5]
+        self.x1_shape = [1, 1, 5, 5]
+        self.x2_shape = [3, 1, 5, 5]
+
+
+def create_test_int8_class(parent):
+
+    #--------------------test concat s8/u8 in with axis 1--------------------
+
+    class TestAxis1Case(parent):
+        def init_axis(self):
+            self.axis = 1
+
+        def init_shape(self):
+            self.x0_shape = [1, 1, 5, 5]
+            self.x1_shape = [1, 2, 5, 5]
+            self.x2_shape = [1, 3, 5, 5]
+
+#--------------------test concat s8/u8 in with axis 2--------------------
+
+    class TestAxis2Case(parent):
+        def init_axis(self):
+            self.axis = 2
+
+        def init_shape(self):
+            self.x0_shape = [2, 3, 4, 5]
+            self.x1_shape = [2, 3, 5, 5]
+            self.x2_shape = [2, 3, 6, 5]
+
+#--------------------test concat s8/u8 in with axis 3--------------------
+
+    class TestAxis3Case(parent):
+        def init_axis(self):
+            self.axis = 3
+
+        def init_shape(self):
+            self.x0_shape = [2, 3, 5, 5]
+            self.x1_shape = [2, 3, 5, 6]
+            self.x2_shape = [2, 3, 5, 7]
+
+    cls_name_1 = "{0}_axis_{1}".format(parent.__name__, "1")
+    cls_name_2 = "{0}_axis_{1}".format(parent.__name__, "2")
+    cls_name_3 = "{0}_axis_{1}".format(parent.__name__, "3")
+    TestAxis1Case.__name__ = cls_name_1
+    TestAxis2Case.__name__ = cls_name_2
+    TestAxis3Case.__name__ = cls_name_3
+    globals()[cls_name_1] = TestAxis1Case
+    globals()[cls_name_2] = TestAxis2Case
+    globals()[cls_name_3] = TestAxis3Case
+
+create_test_int8_class(TestConcatOp)
+create_test_int8_class(TestConcatOp2)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
new file mode 100644
index 0000000000..a8127bcc78
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+from mkldnn_op_test import format_reorder
+
+
+class TestTransposeOp(OpTest):
+    def setUp(self):
+        self.init_op_type()
+        self.initTestCase()
+        self.initInputData()
+        self.use_mkldnn = True
+        self.axis = (0, 2, 3, 1)
+
+        self.inputs = {
+            'X': format_reorder(self.input_data, self.shape)
+        }  #transform data format to 'NHWC' for INT8 transpose specially.
+
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+        }
+
+        self.outputs = {
+            'XShape': np.random.random(self.shape).astype('int8'),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }
+
+    def init_op_type(self):
+        self.op_type = "transpose2"
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+
+    def initInputData(self):
+        self.input_data = (
+            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+
+
+class TestINT8Case(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 6, 8)
+
+    def initInputData(self):
+        self.input_data = (
+            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+
+
+class TestUINT8Case(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (1, 3, 5, 7)
+
+    def initDataType(self):
+        self.input_data = (np.random.randint(0, 100,
+                                             self.shape)).astype('uint8')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
new file mode 100644
index 0000000000..86961b8c36
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
@@ -0,0 +1,20 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from paddle.fluid.tests.unittests.test_softmax_with_cross_entropy_op import TestSoftmaxWithCrossEntropyOp, TestSoftmaxWithCrossEntropyOp2, TestSoftmaxWithCrossEntropyOp3
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 8234457243..b84ce2b3ae 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -22,6 +22,7 @@ import six
 import time
 import itertools
 import collections
+from collections import defaultdict
 
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase):
         outs, _ = self._calc_output(place)
         return outs
 
-    def _calc_output(self, place, parallel=False, no_check_set=None):
+    def _create_var_from_numpy(self, value):
+        if isinstance(value, tuple):
+            data = value[0]
+            lod = value[1]
+            v = fluid.imperative.base.to_variable(value=data)
+            v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
+            return v
+        else:
+            return fluid.imperative.base.to_variable(value)
+
+    def _calc_imperative_output(self, place, parallel=False, no_check_set=None):
+        with fluid.imperative.base.guard(place=place):
+            block = fluid.default_main_program().global_block()
+
+            # prepare input variable
+            inputs = defaultdict(list)
+            for name, np_value in six.iteritems(self.inputs):
+                if not isinstance(np_value, list):
+                    np_value = [np_value]
+
+                for i in range(len(np_value)):
+                    inputs[name].append(
+                        self._create_var_from_numpy(np_value[i]))
+
+            # prepare output variable
+            outputs = defaultdict(list)
+            for name, np_value in six.iteritems(self.outputs):
+                if not isinstance(np_value, list):
+                    np_value = [np_value]
+
+                for i in range(len(np_value)):
+                    value = np_value[i]
+                    if isinstance(value, tuple):
+                        v = block.create_var(
+                            name="%s_out%d" % (name, i),
+                            dtype=value[0].dtype,
+                            type=core.VarDesc.VarType.LOD_TENSOR,
+                            persistable=False,
+                            stop_gradient=False)
+                        v._ivar.value().get_tensor(
+                        ).set_recursive_sequence_lengths(value[1])
+                    else:
+                        v = block.create_var(
+                            name="%s_out%d" % (name, i),
+                            dtype=value.dtype,
+                            type=core.VarDesc.VarType.LOD_TENSOR,
+                            persistable=False,
+                            stop_gradient=False)
+                    outputs[name].append(v)
+
+            block.append_op(
+                type=self.op_type,
+                inputs=inputs,
+                outputs=outputs,
+                attrs=self.attrs)
+
+            return outputs
 
+    def _calc_output(self, place, parallel=False, no_check_set=None):
         program = Program()
         block = program.global_block()
         self._append_ops(block)
@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase):
                                 place,
                                 atol,
                                 no_check_set=None,
-                                equal_nan=False):
+                                equal_nan=False,
+                                check_imperative=False):
+        if check_imperative:
+            imperative_outs = self._calc_imperative_output(
+                place, no_check_set=no_check_set)
         outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
+
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
                 continue
@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase):
                                          type(sub_out))
                 for item in sub_out:
                     sub_out_name, expect = item[0], item[1]
+                    if check_imperative:
+                        imperative_actual = imperative_outs[sub_out_name][0]
+                        imperative_actual_t = np.array(
+                            imperative_actual._ivar.value().get_tensor())
                     idx = find_actual(sub_out_name, fetch_list)
                     actual = outs[idx]
                     actual_t = np.array(actual)
@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase):
                             actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                         "Output (" + sub_out_name + ") has diff at " +
                         str(place))
+                    if check_imperative:
+                        self.assertTrue(
+                            np.allclose(
+                                imperative_actual_t,
+                                expect_t,
+                                atol=atol,
+                                equal_nan=equal_nan),
+                            "Output (" + sub_out_name + ") has diff at " +
+                            str(place) + " in imperative mode")
                     if isinstance(expect, tuple):
                         self.assertListEqual(
                             actual.recursive_sequence_lengths(), expect[1],
                             "Output (" + sub_out_name +
                             ") has different lod at " + str(place))
+                    if check_imperative:
+                        self.assertListEqual(
+                            imperative_actual._ivar.value().get_tensor()
+                            .recursive_sequence_lengths(), expect[1],
+                            "Output (" + out_name + ") has different lod at " +
+                            str(place) + " in imperative mode")
             else:
+                if check_imperative:
+                    imperative_actual = imperative_outs[out_name][0]
+                    imperative_actual_t = np.array(
+                        imperative_actual._ivar.value().get_tensor())
                 idx = find_actual(out_name, fetch_list)
                 actual = outs[idx]
                 actual_t = np.array(actual)
@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase):
                     "Output (" + out_name + ") has diff at " + str(place) +
                     "\nExpect " + str(expect_t) + "\n" + "But Got" +
                     str(actual_t) + " in class " + self.__class__.__name__)
+                if check_imperative:
+                    self.assertTrue(
+                        np.allclose(
+                            imperative_actual_t,
+                            expect_t,
+                            atol=atol,
+                            equal_nan=equal_nan),
+                        "Output (" + out_name + ") has diff at " + str(place) +
+                        "\nExpect " + str(expect_t) + "\n" + "But Got" +
+                        str(imperative_actual_t) + " in class " +
+                        self.__class__.__name__)
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.recursive_sequence_lengths(),
                                          expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
+                    if check_imperative:
+                        self.assertListEqual(
+                            imperative_actual._ivar.value().get_tensor()
+                            .recursive_sequence_lengths(), expect[1],
+                            "Output (" + out_name + ") has different lod at " +
+                            str(place) + " in imperative mode")
 
     def _get_places(self):
         if self.dtype == np.float16:
@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase):
             places.append(core.CUDAPlace(0))
         return places
 
-    def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
+    def check_output(self,
+                     atol=1e-5,
+                     no_check_set=None,
+                     equal_nan=False,
+                     check_imperative=False):
         places = self._get_places()
         for place in places:
-            self.check_output_with_place(place, atol, no_check_set, equal_nan)
+            self.check_output_with_place(place, atol, no_check_set, equal_nan,
+                                         check_imperative)
 
     def check_output_customized(self, checker):
         places = self._get_places()
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index a94487e67d..61fd9af127 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_ir_memory_optimize=True,
                                   enable_inplace=True,
                                   fuse_elewise_add_act_ops=False,
+                                  fuse_all_reduce_ops=False,
                                   fuse_relu_depthwise_conv=False,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
         build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
         build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
+        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
         # python memory optimization is conflict with inplace pass.
         # Use ir graph memory optimization after inplace pass is the correct way.
         build_strategy.enable_inplace = False if memory_opt else enable_inplace
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index d5a8385409..d587715d60 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+class TestAtan(TestActivation):
+    def setUp(self):
+        self.op_type = "atan"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.arctan(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestTanhShrink(TestActivation):
     def setUp(self):
         self.op_type = "tanh_shrink"
@@ -248,6 +265,23 @@ class TestCos(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+class TestAcos(TestActivation):
+    def setUp(self):
+        self.op_type = "acos"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arccos(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestSin(TestActivation):
     def setUp(self):
         self.op_type = "sin"
@@ -265,6 +299,23 @@ class TestSin(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+class TestAsin(TestActivation):
+    def setUp(self):
+        self.op_type = "asin"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arcsin(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestRound(TestActivation):
     def setUp(self):
         self.op_type = "round"
@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
 create_test_act_fp16_class(TestCeil, grad_check=False)
 create_test_act_fp16_class(TestFloor, grad_check=False)
 create_test_act_fp16_class(TestCos, grad_atol=0.85)
+create_test_act_fp16_class(TestAcos, grad_atol=0.85)
 create_test_act_fp16_class(TestSin)
+create_test_act_fp16_class(TestAsin)
+create_test_act_fp16_class(TestAtan)
 create_test_act_fp16_class(TestRound, grad_check=False)
 create_test_act_fp16_class(TestRelu)
 create_test_act_fp16_class(TestGelu)
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
new file mode 100644
index 0000000000..55029c18d6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from op_test import OpTest
+import unittest
+import numpy as np
+import six
+
+
+class CrossEntropy2OpTestBase(OpTest):
+    def initParameters(self):
+        return [32, 64], 'float32', -100
+
+    def calc_output(self, logits, label, ignore_index):
+        ret = np.zeros(shape=label.shape, dtype=logits.dtype)
+        match_x = np.zeros(shape=label.shape, dtype=logits.dtype)
+        for idx in six.moves.range(label.shape[0]):
+            if label[idx] == ignore_index:
+                continue
+            match_x[idx] = logits[idx][label[idx]]
+            ret[idx] = -np.log(match_x[idx])
+        return ret, match_x
+
+    def setUp(self):
+        self.shape, self.dtype, self.ignore_index = self.initParameters()
+        self.op_type = 'cross_entropy2'
+        feature_size = int(self.shape[-1])
+        batch_size = int(np.prod(self.shape) / feature_size)
+        logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
+        label = np.random.random_integers(
+            low=0, high=feature_size - 1,
+            size=self.shape[0:-1] + [1]).astype('int64')
+        outputs, match_x = self.calc_output(
+            np.reshape(logits, [batch_size, feature_size]),
+            np.reshape(label, [batch_size, 1]), self.ignore_index)
+        self.inputs = {'X': logits, 'Label': label}
+        self.outputs = {
+            'Y': np.reshape(outputs, label.shape),
+            'MatchX': np.reshape(match_x, label.shape),
+            'XShape': np.zeros(
+                shape=logits.shape, dtype=logits.dtype)
+        }
+        self.attrs = {'ignore_index': self.ignore_index}
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        self.check_grad(
+            inputs_to_check=['X'],
+            output_names=['Y'],
+            no_grad_set=['XShape', 'MatchX', 'Label'])
+
+
+class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
+    def initParameters(self):
+        return [32, 64], 'float64', 3
+
+
+class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
+    def initParameters(self):
+        return [4, 8, 16, 32], 'float32', -100
+
+
+class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
+    def initParameters(self):
+        return [4, 8, 16, 32], 'float32', 3
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
new file mode 100644
index 0000000000..377014510b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import time
+import six
+import unittest
+
+EPOCH_NUM = 60
+BATCH_SIZE = 32
+CLASS_NUM = 10
+
+
+def random_reader():
+    np.random.seed(1)
+    for i in range(BATCH_SIZE * 40):
+        image = np.random.random([784])
+        label = np.random.random_integers(low=0, high=CLASS_NUM - 1)
+        yield image, label
+
+
+def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
+    startup_prog = fluid.Program()
+    main_prog = fluid.Program()
+    startup_prog.random_seed = 1
+    main_prog.random_seed = 1
+
+    with fluid.unique_name.guard():
+        with fluid.program_guard(main_prog, startup_prog):
+            image = fluid.layers.data(
+                name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            py_reader = fluid.io.PyReader(
+                feed_list=[image, label],
+                capacity=4,
+                iterable=not use_legacy_py_reader,
+                use_double_buffer=use_double_buffer)
+            hidden = image
+            for hidden_size in [10, 20, 30]:
+                hidden = fluid.layers.fc(
+                    hidden,
+                    size=hidden_size,
+                    act='tanh',
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(value=1.0)))
+
+            predict_label = fluid.layers.fc(hidden,
+                                            size=CLASS_NUM,
+                                            act='softmax')
+            loss = fluid.layers.mean(
+                fluid.layers.cross_entropy(
+                    input=predict_label, label=label))
+
+            optimizer = fluid.optimizer.Adam()
+            optimizer.minimize(loss)
+    return startup_prog, main_prog, py_reader, loss
+
+
+class TestBase(unittest.TestCase):
+    def run_main(self, use_legacy_py_reader, with_data_parallel, places,
+                 use_double_buffer):
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            startup_prog, main_prog, py_reader, loss = simple_fc_net(
+                places, use_legacy_py_reader, use_double_buffer)
+
+            reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)
+
+            ps = places if use_double_buffer else fluid.cpu_places(len(places))
+
+            py_reader.decorate_sample_list_generator(
+                reader, places=ps if py_reader.iterable else None)
+
+            exe = fluid.Executor(place=places[0])
+            exe.run(startup_prog)
+
+            prog = fluid.CompiledProgram(main_prog)
+            if with_data_parallel:
+                prog = prog.with_data_parallel(
+                    loss_name=loss.name, places=places)
+
+            step = 0
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            if not py_reader.iterable:
+                for _ in six.moves.range(EPOCH_NUM):
+                    step = 0
+                    py_reader.start()
+                    while True:
+                        try:
+                            L, = exe.run(program=prog,
+                                         fetch_list=[loss],
+                                         use_program_cache=True)
+                            loss_list.append(np.mean(L))
+                            step += 1
+                        except fluid.core.EOFException:
+                            py_reader.reset()
+                            break
+                    step_list.append(step)
+            else:
+                for _ in six.moves.range(EPOCH_NUM):
+                    step = 0
+                    for d in py_reader():
+                        assert len(d) == len(places)
+                        for i, item in enumerate(d):
+                            image = item['image']
+                            label = item['label']
+                            assert image.shape() == [BATCH_SIZE, 784]
+                            assert label.shape() == [BATCH_SIZE, 1]
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
+                        L, = exe.run(program=prog,
+                                     feed=d,
+                                     fetch_list=[loss],
+                                     use_program_cache=True)
+                        loss_list.append(np.mean(L))
+                        step += 1
+                    step_list.append(step)
+            end_t = time.time()
+            ret = {
+                "time": end_t - start_t,
+                "step": step_list,
+                "loss": np.array(loss_list)
+            }
+            return ret
+
+    def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
+        places = []
+        if with_cpu:
+            places.append([fluid.CPUPlace()])
+            if with_data_parallel:
+                places.append([fluid.CPUPlace()] * 2)
+
+        if with_gpu and fluid.core.is_compiled_with_cuda():
+            tmp = fluid.cuda_places()
+            assert len(tmp) > 0, "no gpu detected"
+            if with_data_parallel:
+                places.append(tmp)
+            places.append([tmp[0]])
+        return places
+
+    def test_main(self):
+        for with_data_parallel in [True, False]:
+            for p in self.prepare_places(with_data_parallel):
+                for use_double_buffer in [False, True]:
+                    results = []
+                    for use_legacy_py_reader in [False, True]:
+                        ret = self.run_main(
+                            use_legacy_py_reader=use_legacy_py_reader,
+                            with_data_parallel=with_data_parallel,
+                            places=p,
+                            use_double_buffer=use_double_buffer)
+                        results.append(ret)
+                    if not use_double_buffer:
+                        diff = np.max(
+                            np.abs(results[0]['loss'] - results[1]['loss']))
+                        self.assertLess(diff, 1e-3)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
new file mode 100644
index 0000000000..fbeff20c63
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistMnistNCCL2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._nccl2_reduce_layer = True
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_allreduce_op.py", delta=1e-5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index f4d14d4024..969f5cb63c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -33,7 +33,10 @@ DEFAULT_BATCH_SIZE = 2
 
 
 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1):
+    def get_model(self,
+                  batch_size=DEFAULT_BATCH_SIZE,
+                  lr=0.1,
+                  single_device=False):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
 
@@ -76,8 +79,12 @@ class TestDistRunnerBase(object):
 
     def run_trainer(self, args):
         self.lr = args.lr
-        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-            self.get_model(batch_size=args.batch_size)
+        if args.nccl2_reduce_layer_local_run:
+            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+                self.get_model(batch_size=args.batch_size, single_device=True)
+        else:
+            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+                self.get_model(batch_size=args.batch_size)
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
@@ -87,7 +94,7 @@ class TestDistRunnerBase(object):
                                     args.endpoints, args.trainers,
                                     args.sync_mode, args.dc_asgd)
             trainer_prog = t.get_trainer_program()
-        elif args.update_method == "nccl2":
+        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
             # transpile for nccl2
             config = fluid.DistributeTranspilerConfig()
             config.mode = "nccl2"
@@ -103,16 +110,17 @@ class TestDistRunnerBase(object):
             trainer_prog = fluid.default_main_program()
 
         if args.use_cuda:
-            place = fluid.CUDAPlace(0)
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(device_id)
         else:
             place = fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
 
-        strategy = fluid.ExecutionStrategy()
-        strategy.num_threads = 1
-        strategy.allow_op_delay = False
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_threads = 1
+        exec_strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
         # FIXME force disable enable_inplace and memory_optimize
@@ -124,23 +132,25 @@ class TestDistRunnerBase(object):
         else:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
 
+        pass_builder = None
         if args.batch_merge_repeat > 1:
             pass_builder = build_stra._finalize_strategy_and_create_passes()
             mypass = pass_builder.insert_pass(
                 len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
             mypass.set("num_repeats", args.batch_merge_repeat)
 
-        if args.update_method == "nccl2":
+        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
             build_stra.num_trainers = len(args.endpoints.split(","))
             build_stra.trainer_id = args.trainer_id
         else:
+            # case args.update_method == "nccl2_reduce_layer":
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
         binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
             build_strategy=build_stra,
-            exec_strategy=strategy)
+            exec_strategy=exec_strategy)
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -182,7 +192,7 @@ def runtime_main(test_class):
         '--update_method',
         type=str,
         default="local",
-        choices=["pserver", "nccl2", "local"])
+        choices=["pserver", "nccl2", "local", "nccl2_reduce_layer"])
     parser.add_argument('--trainer_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument(
@@ -198,6 +208,11 @@ def runtime_main(test_class):
     parser.add_argument('--lr', required=False, type=float, default=0.001)
     parser.add_argument(
         '--batch_merge_repeat', required=False, type=int, default=1)
+    parser.add_argument(
+        '--nccl2_reduce_layer_local_run',
+        required=False,
+        type=bool,
+        default=False)
 
     args = parser.parse_args()
 
@@ -242,6 +257,12 @@ class TestDistBase(unittest.TestCase):
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
         self._nccl2_mode = False
+        self._mp_mode = False
+        # FIXME(typhoonzero): I added this stupid argument to enable
+        # testing allreduce layers, which users can call layers.allreduce
+        # to accumulate tensors at anywhere. Find a better way to do this
+        # test, reduce check this argument everywhere.
+        self._nccl2_reduce_layer = False
         self._lr = 0.001
         self._setup_config()
         self._after_setup_config()
@@ -307,10 +328,16 @@ class TestDistBase(unittest.TestCase):
             cmd += " --batch_size %d" % batch_size
         if batch_merge_repeat > 1:
             cmd += " --batch_merge_repeat %d" % batch_merge_repeat
+        if self._nccl2_reduce_layer:
+            cmd += " --nccl2_reduce_layer_local_run 1"
 
         if self.__use_cuda:
             cmd += " --use_cuda"
-            env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+            env_local = {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "PADDLE_TRAINERS_NUM": "1",
+                "PADDLE_TRAINER_ID": "0"
+            }
         else:
             env_local = {'CPU_NUM': '1'}
 
@@ -427,29 +454,30 @@ class TestDistBase(unittest.TestCase):
             sys.stderr.write("ps1 stderr: %s\n" % fn.read())
 
         # print log
-        if stat0 == 0:
-            sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
         with open("/tmp/tr0_err.log", "r") as fn:
             sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
-        if stat1 == 0:
-            sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
         with open("/tmp/tr1_err.log", "r") as fn:
             sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
 
         return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
-    def _run_cluster_nccl2(self, model, envs, check_error_log):
+    def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
+                           check_error_log):
         # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
         worker_endpoints = self._ps_endpoints.split(",")
         w0_ep, w1_ep = worker_endpoints
+        if nccl2_reduce_layer:
+            update_method = "nccl2_reduce_layer"
+        else:
+            update_method = "nccl2"
 
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
         tr0_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
-                   0, w0_ep, self._lr)
+                   0, w0_ep, update_method, self._lr)
         tr1_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
-                   1, w1_ep, self._lr)
+                   1, w1_ep, update_method, self._lr)
 
         if self._mem_opt:
             tr0_cmd += " --mem_opt"
@@ -463,12 +491,25 @@ class TestDistBase(unittest.TestCase):
         if self.__use_cuda:
             tr0_cmd += " --use_cuda"
             tr1_cmd += " --use_cuda"
-            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+            env0 = {
+                "CUDA_VISIBLE_DEVICES": "0",
+                # for test nccl2 layer
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ID": "0"
+            }
+            env1 = {
+                "CUDA_VISIBLE_DEVICES": "1",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ID": "1"
+            }
         else:
             env0 = {'CPU_NUM': '1'}
             env1 = {'CPU_NUM': '1'}
 
+        if self._mp_mode:
+            env0 = {"FLAGS_selected_gpus": "0"}
+            env1 = {"FLAGS_selected_gpus": "1"}
+
         env0.update(envs)
         env1.update(envs)
 
@@ -498,8 +539,6 @@ class TestDistBase(unittest.TestCase):
         # print log
         sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
         sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
-        sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
-        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
 
         return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
@@ -528,10 +567,14 @@ class TestDistBase(unittest.TestCase):
 
         local_losses\
             = self._run_local(model_file, required_envs,
-                                       check_error_log)
+                                check_error_log)
         if self._nccl2_mode:
-            tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                model_file, required_envs, check_error_log)
+            if self._nccl2_reduce_layer:
+                tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                    model_file, required_envs, True, check_error_log)
+            else:
+                tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                    model_file, required_envs, False, check_error_log)
         else:
             tr0_losses, tr1_losses = self._run_cluster(
                 model_file, required_envs, check_error_log)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index 49a2ca40e3..030860ec79 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
new file mode 100644
index 0000000000..38f7bb80d2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import os
+
+
+def skip_ci(func):
+    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
+
+    def __func__(*args, **kwargs):
+        if on_ci:
+            return
+        return func(*args, **kwargs)
+
+    return __func__
+
+
+class TestDistSeResneXtNCCL(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+
+    @skip_ci
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_se_resnext.py", delta=1e-5)
+
+
+class TestDistSeResneXtNCCLMP(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._mp_mode = True
+
+    @skip_ci
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "dist_se_resnext.py",
+                delta=1e-5,
+                need_envs={"NCCL_P2P_DISABLE": "1"})
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 25dcccc28d..3307caa8b2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -21,7 +21,7 @@ from test_dist_base import TestDistBase
 
 
 def download_files():
-    url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'
+    url_prefix = 'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
     vocab_url = url_prefix + 'vocab.bpe.32000'
     vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853'
     paddle.dataset.common.download(vocab_url, 'test_dist_transformer',
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 12132477d2..f81d4fda50 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
             'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
             'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
             'split_selected_rows', 'send', 'sequence_pool_grad',
             'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase):
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
             'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
             'lookup_table_grad', 'split_selected_rows', 'send',
             'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
             'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
             'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
             'split_selected_rows', 'send', 'sequence_pool_grad',
             'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
             'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
             'lookup_table_grad', 'split_selected_rows', 'send',
             'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
             'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
             'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
             'split_selected_rows', 'send', 'sequence_pool_grad',
             'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
index 603c8e7488..05cc41b96f 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
@@ -16,8 +16,7 @@ import os
 import unittest
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
 
-os.environ[
-    'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio'
+os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
 
 from test_parallel_executor_transformer import TestTransformer
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
new file mode 100644
index 0000000000..898d04ebe1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+os.environ['CPU_NUM'] = '2'
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['FLAGS_fast_eager_deletion_mode'] = '1'
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.executor import Executor
+import paddle.fluid.core as core
+from paddle.fluid.backward import append_backward
+import paddle.fluid.compiler as compiler
+import numpy
+import multiprocessing
+
+
+class TestEagerDeletionWhileOpBase(unittest.TestCase):
+    def test_main(self):
+        places = [core.CPUPlace(), ]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for p in places:
+            for with_data_parallel in [False, True]:
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(fluid.Scope()):
+                        self.run_main(p, with_data_parallel)
+
+    def run_main(self, place, with_data_parallel):
+        self.place = place
+        self.with_data_parallel = with_data_parallel
+
+        if not core.is_compiled_with_cuda() and isinstance(self.place,
+                                                           core.CUDAPlace):
+            return
+
+        if isinstance(self.place, core.CUDAPlace):
+            device_cnt = core.get_cuda_device_count(
+            ) if self.with_data_parallel else 1
+        else:
+            device_cnt = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count(
+                ))) if self.with_data_parallel else 1
+
+        d0 = layers.data(
+            "d0", shape=[10], append_batch_size=False, dtype='float32')
+        d1 = layers.data(
+            "d1", shape=[10], append_batch_size=False, dtype='float32')
+        d2 = layers.data(
+            "d2", shape=[10], append_batch_size=False, dtype='float32')
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+
+        init = layers.zeros(shape=[10], dtype='float32')
+        mem_array = layers.array_write(x=init, i=i)
+        data_array = layers.array_write(x=d0, i=i)
+
+        i = layers.increment(i)
+        layers.array_write(d1, i, array=data_array)
+
+        i = layers.increment(i)
+        layers.array_write(d2, i, array=data_array)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
+        array_len.stop_gradient = True
+        cond = layers.less_than(x=i, y=array_len)
+
+        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
+        j.stop_gradient = True
+
+        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len2.stop_gradient = True
+        cond2 = layers.less_than(x=j, y=array_len2)
+
+        while_op = layers.While(cond=cond)
+        while_op2 = layers.While(cond=cond2)
+        with while_op.block():
+            d = layers.array_read(array=data_array, i=i)
+            prev = layers.array_read(array=mem_array, i=i)
+            d = layers.reshape(d, shape=[10])
+            prev = layers.reshape(prev, shape=[10])
+            result = layers.sums(input=[d, prev])
+
+            i = layers.increment(x=i, in_place=True)
+            layers.array_write(result, i=i, array=mem_array)
+            layers.less_than(x=i, y=array_len, cond=cond)
+            with while_op2.block():
+                d2 = layers.array_read(array=data_array, i=j)
+                prev2 = layers.array_read(array=mem_array, i=j)
+                d2 = layers.reshape(d2, shape=[10])
+                prev2 = layers.reshape(prev2, shape=[10])
+                result2 = layers.sums(input=[d2, prev2])
+
+                j = layers.increment(x=j, in_place=True)
+                layers.array_write(result2, i=j, array=mem_array)
+                layers.less_than(x=j, y=array_len2, cond=cond2)
+
+        sum_result = layers.array_read(array=mem_array, i=j)
+        sum_result.persistable = True
+        tmp = layers.unsqueeze(sum_result, axes=[0])
+        tmp = layers.expand(tmp, expand_times=[10, 1])
+        fc = layers.fc(tmp, size=256)
+        loss = layers.mean(sum_result)
+
+        optim = fluid.optimizer.Adam(learning_rate=1e-3)
+        optim.minimize(loss)
+
+        exe = Executor(self.place)
+        exe.run(fluid.default_startup_program())
+
+        prog = compiler.CompiledProgram(fluid.default_main_program())
+        if self.with_data_parallel:
+            prog = prog.with_data_parallel()
+
+        for _ in range(5):
+            d = []
+            for i in range(3):
+                tmp = numpy.random.random(size=[10]).astype('float32')
+                if not self.with_data_parallel:
+                    d.append(tmp)
+                else:
+                    d.append(numpy.array([tmp] * device_cnt))
+
+            outs = exe.run(program=prog,
+                           feed={'d0': d[0],
+                                 'd1': d[1],
+                                 'd2': d[2]},
+                           fetch_list=[sum_result])
+            self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
new file mode 100644
index 0000000000..104e896b6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -0,0 +1,69 @@
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+import random
+
+
+class TestElementwiseModOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_floordiv"
+        self.dtype = np.int32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+class TestElementwiseModOp_scalar(TestElementwiseModOp):
+    def init_input_output(self):
+        scale_x = random.randint(0, 100000000)
+        scale_y = random.randint(1, 100000000)
+        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
+        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
new file mode 100644
index 0000000000..a354ba0177
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -0,0 +1,69 @@
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+import random
+
+
+class TestElementwiseModOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_mod"
+        self.dtype = np.int32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.mod(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+class TestElementwiseModOp_scalar(TestElementwiseModOp):
+    def init_input_output(self):
+        scale_x = random.randint(0, 100000000)
+        scale_y = random.randint(1, 100000000)
+        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
+        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
+        self.out = np.mod(self.x, self.y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 1bb4662e8d..0812b02b47 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -31,6 +31,99 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
+def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
+    scales = []
+    if not use_second_dim:
+        for i in range(x.shape[0]):
+            scales.append(np.max(np.abs(x[i])).astype("float32"))
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i, scale in enumerate(scales):
+            y[i] = np.round(x[i] / scale * max_range)
+    else:
+        for i in range(x.shape[0]):
+            s = []
+            for j in range(x.shape[1]):
+                s.append(np.max(np.abs(x[i][j])).astype("float32"))
+            scales.append(s)
+        scales = np.amax(np.array(scales), axis=0)
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i in range(x.shape[0]):
+            for j, scale in enumerate(scales):
+                y[i][j] = np.round(x[i][j] / scale * max_range)
+    return y, scales
+
+
+def channel_wise_dequantize_max_abs(x,
+                                    scales,
+                                    quant_bits,
+                                    activation_scale=None):
+    if activation_scale is None:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    else:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            for j in range(x.shape[1]):
+                y[i][j] = (scales[j] /
+                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
+        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
+    return y
+
+
+class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
+    def set_args(self):
+        self.quant_bits = [8, 8]
+        self.data_type = "float32"
+        self.activation_scale = 0.7861
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_channel_wise_dequantize_max_abs"
+        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
+        yq, scales = channel_wise_quantize_max_abs(
+            x, self.quant_bits[0], use_second_dim=True)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
+                                              self.activation_scale)
+
+        self.inputs = {
+            'X': yq,
+            'Scales': [("scales0", np.array(scales).astype(self.data_type)),
+                       ("scales1", np.array(
+                           [self.activation_scale]).astype(self.data_type))]
+        }
+        self.attrs = {'quant_bits': self.quant_bits}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
+    def set_args(self):
+        self.quant_bits = [8]
+        self.data_type = "float32"
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_channel_wise_dequantize_max_abs"
+        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
+        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
+
+        self.inputs = {
+            'X': yq,
+            'Scales': [("scales0", np.array(scales).astype(self.data_type))]
+        }
+        self.attrs = {'quant_bits': self.quant_bits}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFakeDequantizeMaxAbsOp(OpTest):
     def set_args(self):
         self.num_bits = 8
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 4582b2a0ee..07038b0441 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
 
 
 class TestFakeQuantizeOp(OpTest):
@@ -35,6 +36,30 @@ class TestFakeQuantizeOp(OpTest):
         self.check_output()
 
 
+class TestFakeChannelWiseQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_channel_wise_quantize_abs_max"
+        self.attrs = {'bit_length': 8}
+        self.inputs = {
+            'X': np.random.random((4, 3, 64, 64)).astype("float32"),
+        }
+        scales = []
+        for i in range(self.inputs['X'].shape[0]):
+            scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
+        outputs = self.inputs['X'].copy()
+        for i, scale in enumerate(scales):
+            outputs[i] = np.round(outputs[i] / scale * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1))
+
+        self.outputs = {
+            'Out': outputs,
+            'OutScale': np.array(scales).astype("float32"),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFakeQuantizeRangeAbsMaxOp(OpTest):
     def setUp(self):
         self.op_type = "fake_quantize_range_abs_max"
@@ -51,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
             'InScale': np.zeros(1).astype("float32")
         }
         scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+
         out_scales = np.zeros(self.attrs['window_size']).astype("float32")
         out_scales[0] = scale
         self.outputs = {
@@ -64,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
         self.check_output()
 
 
+class TestFakeQuantizeMovingOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_moving_average_abs_max"
+        self.attrs = {
+            'bit_length': int(5),
+            'moving_rate': float(0.9),
+            'is_test': False
+        }
+        accum = np.zeros(1).astype("float32")
+        accum[0] = 1
+        state = np.zeros(1).astype("float32")
+        state[0] = 1
+        scale = np.zeros(1).astype("float32")
+        scale[0] = 0.001
+        self.inputs = {
+            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'InScale': scale,
+            'InAccum': accum,
+            'InState': state,
+        }
+
+        out_accum = np.zeros(1).astype("float32")
+        out_state = np.zeros(1).astype("float32")
+        out_scale = np.zeros(1).astype("float32")
+        out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
+            np.abs(self.inputs['X'])).astype("float32")
+        out_state[0] = self.attrs['moving_rate'] * state[0] + 1
+        out_scale = out_accum / out_state
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / out_scale * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutAccum': out_accum,
+            'OutState': out_state,
+            'OutScale': out_scale,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
     def setUp(self):
         self.op_type = "fake_quantize_range_abs_max"
diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py
new file mode 100644
index 0000000000..6ad7418447
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def fsp_matrix(a, b):
+    batch = a.shape[0]
+    a_channel = a.shape[1]
+    b_channel = b.shape[1]
+    h = a.shape[2]
+    w = a.shape[3]
+    a_t = a.transpose([0, 2, 3, 1])
+    a_t = a_t.reshape([batch, h * w, a_channel])
+    b_t = b.transpose([0, 2, 3, 1]).reshape([batch, h * w, b_channel])
+    a_r = a_t.repeat(
+        b_channel, axis=1).reshape(
+            [batch, h * w, b_channel, a_channel]).transpose([0, 1, 3, 2])
+    b_r = b_t.repeat(
+        a_channel, axis=1).reshape([batch, h * w, a_channel, b_channel])
+    return np.mean(a_r * b_r, axis=1)
+
+
+class TestFSPOp(OpTest):
+    def setUp(self):
+        self.op_type = "fsp"
+        self.initTestCase()
+
+        feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32')
+        feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32')
+
+        self.inputs = {'X': feature_map_0, 'Y': feature_map_1}
+        self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)}
+
+    def initTestCase(self):
+        self.a_shape = (2, 16, 32, 31)
+        self.b_shape = (2, 28, 32, 31)
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
new file mode 100644
index 0000000000..ca8669bbc6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+
+def simple_fc_net(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(2):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _init_data(self, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._init_data(random_data)
+
+        def _optimizer(learning_rate=1e-6):
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=learning_rate,
+                regularization=fluid.regularizer.L2Decay(1e-6))
+            return optimizer
+
+        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=False,
+            memory_opt=False,
+            optimizer=_optimizer)
+        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=True,
+            memory_opt=False,
+            optimizer=_optimizer)
+
+        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_fc_with_fuse_op(self):
+        self._compare_fuse_all_reduce_ops(simple_fc_net, True)
+        self._compare_fuse_all_reduce_ops(simple_fc_net, False)
+
+    def test_batchnorm_fc_with_fuse_op(self):
+        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, True)
+        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 6606162733..848c9a4952 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(atol=1e-8)
+        self.check_output(atol=1e-8, check_imperative=True)
 
     def test_check_grad(self):
         self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 97fc1eab3d..4c44195a3d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -152,7 +152,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
             type='reduce_sum',
             inputs={'X': softmax_out},
             outputs={'Out': reduce_out},
-            attrs={'dim': None,
+            attrs={'dim': [],
                    'keep_dim': False,
                    'reduce_all': True})
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
new file mode 100644
index 0000000000..62c25f7345
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from paddle.fluid.imperative.base import to_variable
+
+
+class SimpleImgConvPool(fluid.imperative.Layer):
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 pool_stride,
+                 pool_padding=0,
+                 pool_type='max',
+                 global_pooling=False,
+                 conv_stride=1,
+                 conv_padding=0,
+                 conv_dilation=1,
+                 conv_groups=1,
+                 act=None,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(SimpleImgConvPool, self).__init__(name_scope)
+
+        self._conv2d = Conv2D(
+            self.full_name(),
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=conv_stride,
+            padding=conv_padding,
+            dilation=conv_dilation,
+            groups=conv_groups,
+            param_attr=None,
+            bias_attr=None,
+            use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(
+            self.full_name(),
+            pool_size=pool_size,
+            pool_type=pool_type,
+            pool_stride=pool_stride,
+            pool_padding=pool_padding,
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = self._pool2d(x)
+        return x
+
+
+class MNIST(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(MNIST, self).__init__(name_scope)
+
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+
+        pool_2_shape = 50 * 4 * 4
+        SIZE = 10
+        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        self._fc = FC(self.full_name(),
+                      10,
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.NormalInitializer(
+                              loc=0.0, scale=scale)),
+                      act="softmax")
+
+    def forward(self, inputs):
+        x = self._simple_img_conv_pool_1(inputs)
+        x = self._simple_img_conv_pool_2(x)
+        x = self._fc(x)
+        return x
+
+
+class TestImperativeCheckpoint(unittest.TestCase):
+    def save_load_persistables(self):
+        seed = 90
+        epoch_num = 1
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            dy_param_init_value = {}
+
+            step = 0
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label._stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss._numpy()
+
+                    avg_loss._backward()
+                    sgd.minimize(avg_loss)
+                    fluid.imperative.save_persistables(mnist, "save_dir")
+                    mnist.clear_gradients()
+
+                    for param in mnist.parameters():
+                        dy_param_init_value[param.name] = param._numpy()
+
+                    mnist.load_dict(
+                        fluid.imperative.load_persistables(mnist, "save_dir"))
+
+                    restore = mnist.parameters()
+
+                    self.assertEqual(len(dy_param_init_value), len(restore))
+                    for value in restore:
+                        self.assertTrue(
+                            np.allclose(value, dy_param_init_value[value.name]))
+                        self.assertTrue(np.isfinite(value.all()))
+                        self.assertFalse(np.isnan(value.any()))
+
+                    step += 1
+
+                    if step > 20:
+                        break
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
new file mode 100644
index 0000000000..af80ca6ce7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
+
+NUM_USERS = 100
+NUM_ITEMS = 1000
+
+BATCH_SIZE = 32
+NUM_BATCHES = 2
+
+
+class MLP(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+
+        self._user_layers = []
+        self._item_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._user_layers.append(
+                self.add_sublayer(
+                    'user_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+            self._item_layers.append(
+                self.add_sublayer(
+                    'item_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+
+        for ul, il in zip(self._user_layers, self._item_layers):
+            users = ul(users)
+            items = il(items)
+        return fluid.layers.elementwise_mul(users, items)
+
+
+class DMF(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(DMF, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._match_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._match_layers.append(
+                self.add_sublayer(
+                    'match_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+        self._mat
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+        match_vec = fluid.layers.concat(
+            [users, items], axis=len(users.shape) - 1)
+        for l in self._match_layers:
+            match_vec = l(match_vec)
+        return match_vec
+
+
+class DeepCF(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(DeepCF, self).__init__(name_scope)
+
+        self._user_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_USERS, 256])
+        self._item_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_ITEMS, 256])
+
+        self._mlp = MLP(self.full_name())
+        self._dmf = DMF(self.full_name())
+        self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')
+
+    def forward(self, users, items):
+        users_emb = self._user_emb(users)
+        items_emb = self._item_emb(items)
+
+        mlp_predictive = self._mlp(users_emb, items_emb)
+        dmf_predictive = self._dmf(users_emb, items_emb)
+        predictive = fluid.layers.concat(
+            [mlp_predictive, dmf_predictive],
+            axis=len(mlp_predictive.shape) - 1)
+        prediction = self._match_fc(predictive)
+        return prediction
+
+
+def get_data():
+    user_ids = []
+    item_ids = []
+    labels = []
+    for uid in range(NUM_USERS):
+        for iid in range(NUM_ITEMS):
+            # 10% positive
+            label = float(random.randint(1, 10) == 1)
+            user_ids.append(uid)
+            item_ids.append(iid)
+            labels.append(label)
+    indices = np.arange(NUM_USERS * NUM_ITEMS)
+    np.random.shuffle(indices)
+    users_np = np.array(user_ids, dtype=np.int64)[indices]
+    items_np = np.array(item_ids, dtype=np.int64)[indices]
+    labels_np = np.array(labels, dtype=np.float32)[indices]
+    return np.expand_dims(users_np, -1), \
+           np.expand_dims(items_np, -1), \
+           np.expand_dims(labels_np, -1)
+
+
+class TestImperativeDeepCF(unittest.TestCase):
+    def test_gan_float32(self):
+        seed = 90
+        users_np, items_np, labels_np = get_data()
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        main = fluid.Program()
+        main.random_seed = seed
+
+        scope = fluid.core.Scope()
+        with new_program_scope(main=main, startup=startup, scope=scope):
+            users = fluid.layers.data('users', [1], dtype='int64')
+            items = fluid.layers.data('items', [1], dtype='int64')
+            labels = fluid.layers.data('labels', [1], dtype='float32')
+
+            deepcf = DeepCF('deepcf')
+            prediction = deepcf(users, items)
+            loss = fluid.layers.reduce_sum(
+                fluid.layers.log_loss(prediction, labels))
+            adam = fluid.optimizer.AdamOptimizer(0.01)
+            adam.minimize(loss)
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            exe.run(startup)
+            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                static_loss = exe.run(
+                    main,
+                    feed={
+                        users.name: users_np[slice:slice + BATCH_SIZE],
+                        items.name: items_np[slice:slice + BATCH_SIZE],
+                        labels.name: labels_np[slice:slice + BATCH_SIZE]
+                    },
+                    fetch_list=[loss])[0]
+                sys.stderr.write('static loss %s\n' % static_loss)
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            deepcf = DeepCF('deepcf')
+            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                prediction = deepcf(
+                    to_variable(users_np[slice:slice + BATCH_SIZE]),
+                    to_variable(items_np[slice:slice + BATCH_SIZE]))
+                loss = fluid.layers.reduce_sum(
+                    fluid.layers.log_loss(prediction,
+                                          to_variable(labels_np[slice:slice +
+                                                                BATCH_SIZE])))
+                loss._backward()
+                adam = fluid.optimizer.AdamOptimizer(0.01)
+                adam.minimize(loss)
+                deepcf.clear_gradients()
+                dy_loss = loss._numpy()
+
+        self.assertEqual(static_loss, dy_loss)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index a80202d6dd..6024fb5f81 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
         return self._fc3(x)
 
 
-class TestImperativeMnist(unittest.TestCase):
+class TestImperativeGAN(unittest.TestCase):
     def test_gan_float32(self):
         seed = 90
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
new file mode 100644
index 0000000000..2086fab5c8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
+
+
+def gen_data():
+    pass
+
+
+class GraphConv(fluid.imperative.Layer):
+    def __init__(self, name_scope, in_features, out_features):
+        super(GraphConv, self).__init__(name_scope)
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self.weight = self.create_parameter(
+            attr=None,
+            dtype='float32',
+            shape=[self._in_features, self._out_features])
+        self.bias = self.create_parameter(
+            attr=None, dtype='float32', shape=[self._out_features])
+
+    def forward(self, features, adj):
+        support = fluid.layers.matmul(features, self.weight)
+        # TODO(panyx0718): sparse matmul?
+        return fluid.layers.matmul(adj, support) + self.bias
+
+
+class GCN(fluid.imperative.Layer):
+    def __init__(self, name_scope, num_hidden):
+        super(GCN, self).__init__(name_scope)
+        self.gc = GraphConv(self.full_name(), num_hidden, 32)
+        self.gc2 = GraphConv(self.full_name(), 32, 10)
+
+    def forward(self, x, adj):
+        x = fluid.layers.relu(self.gc(x, adj))
+        return self.gc2(x, adj)
+
+
+class TestImperativeGNN(unittest.TestCase):
+    def test_gnn_float32(self):
+        seed = 90
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        main = fluid.Program()
+        main.random_seed = seed
+
+        scope = fluid.core.Scope()
+        with new_program_scope(main=main, startup=startup, scope=scope):
+            features = fluid.layers.data(
+                name='features',
+                shape=[1, 100, 50],
+                dtype='float32',
+                append_batch_size=False)
+            # Use selected rows when it's supported.
+            adj = fluid.layers.data(
+                name='adj',
+                shape=[1, 100, 100],
+                dtype='float32',
+                append_batch_size=False)
+            labels = fluid.layers.data(
+                name='labels',
+                shape=[100, 1],
+                dtype='int64',
+                append_batch_size=False)
+
+            model = GCN('test_gcn', 50)
+            logits = model(features, adj)
+            logits = fluid.layers.reshape(logits, logits.shape[1:])
+            # In other example, it's nll with log_softmax. However, paddle's
+            # log_loss only supports binary classification now.
+            loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
+            loss = fluid.layers.reduce_sum(loss)
+
+            adam = AdamOptimizer(learning_rate=1e-3)
+            adam.minimize(loss)
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            exe.run(startup)
+            static_loss = exe.run(feed={
+                'features': np.zeros(
+                    [1, 100, 50], dtype=np.float32),
+                'adj': np.zeros(
+                    [1, 100, 100], dtype=np.float32),
+                'labels': np.zeros(
+                    [100, 1], dtype=np.int64)
+            },
+                                  fetch_list=[loss])[0]
+
+            static_weight = np.array(
+                scope.find_var(model.gc.weight.name).get_tensor())
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            features = np.zeros([1, 100, 50], dtype=np.float32)
+            # Use selected rows when it's supported.
+            adj = np.zeros([1, 100, 100], dtype=np.float32)
+            labels = np.zeros([100, 1], dtype=np.int64)
+
+            model = GCN('test_gcn', 50)
+            logits = model(to_variable(features), to_variable(adj))
+            logits = fluid.layers.reshape(logits, logits.shape[1:])
+            # In other example, it's nll with log_softmax. However, paddle's
+            # log_loss only supports binary classification now.
+            loss = fluid.layers.softmax_with_cross_entropy(logits,
+                                                           to_variable(labels))
+            loss = fluid.layers.reduce_sum(loss)
+            adam = AdamOptimizer(learning_rate=1e-3)
+            adam.minimize(loss)
+            self.assertEqual(static_loss, loss._numpy())
+            self.assertTrue(
+                np.allclose(static_weight, model.gc.weight._numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 3b602303ae..460ba65a48 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -59,7 +59,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 dtype="float32",
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
-            self.weight_1_arr.append(weight_1)
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
             bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
@@ -67,7 +67,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 shape=[self._hidden_size * 4],
                 dtype="float32",
                 default_initializer=fluid.initializer.Constant(0.0))
-            self.bias_arr.append(bias_1)
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
 
     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.cell_array = []
@@ -242,7 +242,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             dy_loss = None
             last_hidden = None
             last_cell = None
-            batch_num = 50
+            batch_num = 200
 
             for i in range(batch_num):
                 x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -264,8 +264,10 @@ class TestImperativePtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param._numpy()
                 dy_loss._backward()
                 sgd.minimize(dy_loss)
-                for param in ptb_model.parameters():
-                    dy_param_updated[param.name] = param._numpy()
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param._numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -323,25 +325,28 @@ class TestImperativePtbRnn(unittest.TestCase):
                               },
                               fetch_list=fetch_list)
                 static_loss_value = out[0]
-                static_last_cell_value = out[1]
-                static_last_hidden_value = out[2]
-                for k in range(3, len(out)):
-                    static_param_updated[static_param_name_list[k - 3]] = out[k]
+                static_last_hidden_value = out[1]
+                static_last_cell_value = out[2]
 
+                if i == batch_num - 1:
+                    for k in range(3, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    3]] = out[k]
+
+        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+        self.assertTrue(
+            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            # print("static_init name: {}, value {}".format(key, value))
+            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
+            self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5))
+        for key, value in six.iteritems(static_param_updated):
+            # print("static name: {}, value {}".format(key, value))
+            # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
             self.assertTrue(
-                np.allclose(static_loss_value.all(), dy_loss._numpy().all()))
-            self.assertTrue(
-                np.allclose(static_last_cell_value.all(),
-                            last_cell._numpy().all()))
-            self.assertTrue(
-                np.allclose(static_last_hidden_value.all(),
-                            last_hidden._numpy().all()))
-            for key, value in six.iteritems(static_param_init):
-                self.assertTrue(
-                    np.allclose(value.all(), dy_param_init[key].all()))
-            for key, value in six.iteritems(static_param_updated):
-                self.assertTrue(
-                    np.allclose(value.all(), dy_param_updated[key].all()))
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 94ac393315..ab9298890b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -277,7 +277,7 @@ class TestImperativeResnet(unittest.TestCase):
 
                 dy_grad_value = {}
                 for param in resnet.parameters():
-                    if not param.stop_gradient:
+                    if param.trainable:
                         np_array = np.array(param._ivar._grad_ivar().value()
                                             .get_tensor())
                         dy_grad_value[param.name + core.grad_var_suffix(
@@ -322,7 +322,7 @@ class TestImperativeResnet(unittest.TestCase):
             for param in resnet.parameters():
                 static_param_name_list.append(param.name)
             for param in resnet.parameters():
-                if not param.stop_gradient:
+                if param.trainable:
                     static_grad_name_list.append(param.name +
                                                  core.grad_var_suffix())
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
new file mode 100644
index 0000000000..b06d3e8894
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -0,0 +1,1096 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from test_imperative_base import new_program_scope
+from paddle.fluid import core
+import numpy as np
+import six
+np.set_printoptions(suppress=True)
+
+
+# Copy from models
+class TrainTaskConfig(object):
+    # support both CPU and GPU now.
+    use_gpu = True
+    # the epoch number to train.
+    pass_num = 30
+    # the number of sequences contained in a mini-batch.
+    # deprecated, set batch_size in args.
+    batch_size = 32
+    # the hyper parameters for Adam optimizer.
+    # This static learning_rate will be multiplied to the LearningRateScheduler
+    # derived learning rate the to get the final learning rate.
+    learning_rate = 2.0
+    beta1 = 0.9
+    beta2 = 0.997
+    eps = 1e-9
+    # the parameters for learning rate scheduling.
+    warmup_steps = 8000
+    # the weight used to mix up the ground-truth distribution and the fixed
+    # uniform distribution in label smoothing when training.
+    # Set this as zero if label smoothing is not wanted.
+    label_smooth_eps = 0.1
+    # the directory for saving trained models.
+    model_dir = "trained_models"
+    # the directory for saving checkpoints.
+    ckpt_dir = "trained_ckpts"
+    # the directory for loading checkpoint.
+    # If provided, continue training from the checkpoint.
+    ckpt_path = None
+    # the parameter to initialize the learning rate scheduler.
+    # It should be provided if use checkpoints, since the checkpoint doesn't
+    # include the training step counter currently.
+    start_step = 0
+    # the frequency to save trained models.
+    save_freq = 10000
+
+
+class InferTaskConfig(object):
+    use_gpu = True
+    # the number of examples in one run for sequence generation.
+    batch_size = 10
+    # the parameters for beam search.
+    beam_size = 5
+    max_out_len = 256
+    # the number of decoded sentences to output.
+    n_best = 1
+    # the flags indicating whether to output the special tokens.
+    output_bos = False
+    output_eos = False
+    output_unk = True
+    # the directory for loading the trained model.
+    model_path = "trained_models/pass_1.infer.model"
+
+
+class ModelHyperParams(object):
+    # These following five vocabularies related configurations will be set
+    # automatically according to the passed vocabulary path and special tokens.
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <bos> token
+    bos_idx = 0
+    # index for <eos> token
+    eos_idx = 1
+    # index for <unk> token
+    unk_idx = 2
+    # max length of sequences deciding the size of position encoding table.
+    max_length = 4
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 2048
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rates of different modules.
+    prepostprocess_dropout = 0.1
+    attention_dropout = 0.1
+    relu_dropout = 0.1
+    # to process before each sub-layer
+    preprocess_cmd = "n"  # layer normalization
+    # to process after each sub-layer
+    postprocess_cmd = "da"  # dropout + residual connection
+    # random seed used in dropout for CE.
+    dropout_seed = 1
+    # the flag indicating whether to share embedding and softmax weights.
+    # vocabularies in source and target should be same for weight sharing.
+    weight_sharing = True
+
+
+def merge_cfg_from_list(cfg_list, g_cfgs):
+    """
+    Set the above global configurations using the cfg_list.
+    """
+    assert len(cfg_list) % 2 == 0
+    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
+        for g_cfg in g_cfgs:
+            if hasattr(g_cfg, key):
+                try:
+                    value = eval(value)
+                except Exception:  # for file path
+                    pass
+                setattr(g_cfg, key, value)
+                break
+
+
+def position_encoding_init(n_position, d_pos_vec):
+    """
+    Generate the initial values for the sinusoid position encoding table.
+    """
+    channels = d_pos_vec
+    position = np.arange(n_position)
+    num_timescales = channels // 2
+    log_timescale_increment = (np.log(float(1e4) / float(1)) /
+                               (num_timescales - 1))
+    inv_timescales = np.exp(np.arange(
+        num_timescales)) * -log_timescale_increment
+    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
+                                                               0)
+    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
+    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
+    position_enc = signal
+    return position_enc.astype("float32")
+
+
+def create_data(is_static=False):
+    if is_static:
+        return [
+            src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
+            trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
+            lbl_weight_np
+        ]
+    else:
+        enc_inputs = [
+            to_variable(src_word_np), to_variable(src_pos_np),
+            to_variable(src_slf_attn_bias_np)
+        ]
+        dec_inputs = [
+            to_variable(trg_word_np), to_variable(trg_pos_np),
+            to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
+        ]
+        label = to_variable(lbl_word_np)
+        weight = to_variable(lbl_weight_np)
+        return enc_inputs, dec_inputs, label, weight
+
+
+def create_feed_dict_list(data, init=False):
+    if init:
+        data_input_names = encoder_data_input_fields + \
+                           decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
+    else:
+        data_input_names = encoder_data_input_fields + \
+                           decoder_data_input_fields[:-1] + label_data_input_fields
+    feed_dict_list = dict()
+    for i in range(len(data_input_names)):
+        feed_dict_list[data_input_names[i]] = data[i]
+    return feed_dict_list
+
+
+def make_all_inputs(input_fields):
+    """
+    Define the input data layers for the transformer model.
+    """
+    inputs = []
+    for input_field in input_fields:
+        input_var = fluid.layers.data(
+            name=input_field,
+            shape=input_descs[input_field][0],
+            dtype=input_descs[input_field][1],
+            lod_level=input_descs[input_field][2]
+            if len(input_descs[input_field]) == 3 else 0,
+            append_batch_size=False)
+        inputs.append(input_var)
+    return inputs
+
+
+# The placeholder for batch_size in compile time. Must be -1 currently to be
+# consistent with some ops' infer-shape output in compile time, such as the
+# sequence_expand op used in beamsearch decoder.
+batch_size = 32
+# The placeholder for squence length in compile time.
+seq_len = ModelHyperParams.max_length
+# Here list the data shapes and data types of all inputs.
+# The shapes here act as placeholder and are set to pass the infer-shape in
+# compile time.
+input_descs = {
+    # The actual data shape of src_word is:
+    # [batch_size, max_src_len_in_batch, 1]
+    "src_word": [(batch_size, seq_len, 1), "int64", 2],
+    # The actual data shape of src_pos is:
+    # [batch_size, max_src_len_in_batch, 1]
+    "src_pos": [(batch_size, seq_len, 1), "int64"],
+    # This input is used to remove attention weights on paddings in the
+    # encoder.
+    # The actual data shape of src_slf_attn_bias is:
+    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
+    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # The actual data shape of trg_word is:
+    # [batch_size, max_trg_len_in_batch, 1]
+    "trg_word": [(batch_size, seq_len, 1), "int64",
+                 2],  # lod_level is only used in fast decoder.
+    # The actual data shape of trg_pos is:
+    # [batch_size, max_trg_len_in_batch, 1]
+    "trg_pos": [(batch_size, seq_len, 1), "int64"],
+    # This input is used to remove attention weights on paddings and
+    # subsequent words in the decoder.
+    # The actual data shape of trg_slf_attn_bias is:
+    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
+    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # This input is used to remove attention weights on paddings of the source
+    # input in the encoder-decoder attention.
+    # The actual data shape of trg_src_attn_bias is:
+    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
+    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # This input is used in independent decoder program for inference.
+    # The actual data shape of enc_output is:
+    # [batch_size, max_src_len_in_batch, d_model]
+    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
+    # The actual data shape of label_word is:
+    # [batch_size * max_trg_len_in_batch, 1]
+    "lbl_word": [(batch_size * seq_len, 1), "int64"],
+    # This input is used to mask out the loss of paddding tokens.
+    # The actual data shape of label_weight is:
+    # [batch_size * max_trg_len_in_batch, 1]
+    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
+    # This input is used in beam-search decoder.
+    "init_score": [(batch_size, 1), "float32", 2],
+    # This input is used in beam-search decoder for the first gather
+    # (cell states updation)
+    "init_idx": [(batch_size, ), "int32"],
+}
+
+# Names of word embedding table which might be reused for weight sharing.
+word_emb_param_names = (
+    "src_word_emb_table",
+    "trg_word_emb_table", )
+# Names of position encoding table which will be initialized externally.
+pos_enc_param_names = (
+    "src_pos_enc_table",
+    "trg_pos_enc_table", )
+# separated inputs for different usages.
+encoder_data_input_fields = (
+    "src_word",
+    "src_pos",
+    "src_slf_attn_bias", )
+decoder_data_input_fields = (
+    "trg_word",
+    "trg_pos",
+    "trg_slf_attn_bias",
+    "trg_src_attn_bias",
+    "enc_output", )
+label_data_input_fields = (
+    "lbl_word",
+    "lbl_weight", )
+# In fast decoder, trg_pos (only containing the current time step) is generated
+# by ops and trg_slf_attn_bias is not needed.
+fast_decoder_data_input_fields = (
+    "trg_word",
+    "init_score",
+    "init_idx",
+    "trg_src_attn_bias", )
+# if we use py_reader
+use_py_reader = False
+
+# if we run sync mode
+sync = False
+
+# how many batches we use
+batch_num = 2
+
+np.random.seed = 1
+src_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size, seq_len, 1),
+    dtype='int64')
+src_pos_np = np.random.randint(
+    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+
+trg_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size, seq_len, 1),
+    dtype='int64')
+trg_pos_np = np.random.randint(
+    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+
+lbl_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size * seq_len, 1),
+    dtype='int64')
+lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
+
+# np.random.seed = 1
+# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
+# src_pos_np = np.random.randint(
+#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+#
+# trg_word_np =  np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
+# trg_pos_np = np.random.randint(
+#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+#
+# lbl_word_np =  np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
+# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
+#
+pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
+                                  ModelHyperParams.d_model)
+pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
+                                  ModelHyperParams.d_model)
+
+
+class PrePostProcessLayer(Layer):
+    def __init__(self, name_scope, process_cmd, shape_len=None):
+        super(PrePostProcessLayer, self).__init__(name_scope)
+        for cmd in process_cmd:
+            if cmd == "n":
+                self._layer_norm = LayerNorm(
+                    name_scope=self.full_name(),
+                    begin_norm_axis=shape_len - 1,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(1.)),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(0.)))
+
+    def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
+        for cmd in process_cmd:
+            if cmd == "a":  # add residual connection
+                out = out + prev_out if prev_out else out
+            elif cmd == "n":  # add layer normalization
+                out = self._layer_norm(out)
+            elif cmd == "d":  # add dropout
+                if dropout_rate:
+                    out = fluid.layers.dropout(
+                        out,
+                        dropout_prob=dropout_rate,
+                        seed=ModelHyperParams.dropout_seed,
+                        is_test=False)
+        return out
+
+
+class PositionwiseFeedForwardLayer(Layer):
+    def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
+        super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
+        self._i2h = FC(name_scope=self.full_name(),
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act="relu")
+        self._h2o = FC(name_scope=self.full_name(),
+                       size=d_hid,
+                       num_flatten_dims=2)
+        self._dropout_rate = dropout_rate
+
+    def forward(self, x):
+        hidden = self._i2h(x)
+        if self._dropout_rate:
+            hidden = fluid.layers.dropout(
+                hidden,
+                dropout_prob=self._dropout_rate,
+                seed=ModelHyperParams.dropout_seed,
+                is_test=False)
+        out = self._h2o(hidden)
+        return out
+
+
+class MultiHeadAttentionLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 d_key,
+                 d_value,
+                 d_model,
+                 n_head=1,
+                 dropout_rate=0.,
+                 cache=None,
+                 gather_idx=None,
+                 static_kv=False):
+        super(MultiHeadAttentionLayer, self).__init__(name_scope)
+        self._n_head = n_head
+        self._d_key = d_key
+        self._d_value = d_value
+        self._d_model = d_model
+        self._dropout_rate = dropout_rate
+        self._q_fc = FC(name_scope=self.full_name(),
+                        size=d_key * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._k_fc = FC(name_scope=self.full_name(),
+                        size=d_key * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._v_fc = FC(name_scope=self.full_name(),
+                        size=d_value * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._proj_fc = FC(name_scope=self.full_name(),
+                           size=self._d_model,
+                           bias_attr=False,
+                           num_flatten_dims=2)
+
+    def forward(self, queries, keys, values, attn_bias):
+        # compute q ,k ,v
+        keys = queries if keys is None else keys
+        values = keys if values is None else values
+
+        q = self._q_fc(queries)
+        k = self._k_fc(keys)
+        v = self._v_fc(values)
+
+        # split head
+        reshaped_q = fluid.layers.reshape(
+            x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
+        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
+        reshaped_k = fluid.layers.reshape(
+            x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
+        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
+        reshaped_v = fluid.layers.reshape(
+            x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
+        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
+
+        #scale dot product attention
+        product = fluid.layers.matmul(
+            x=transpose_q,
+            y=transpose_k,
+            transpose_y=True,
+            alpha=self._d_model**-0.5)
+        if attn_bias:
+            product += attn_bias
+        weights = fluid.layers.softmax(product)
+        if self._dropout_rate:
+            weights_droped = fluid.layers.dropout(
+                weights,
+                dropout_prob=self._dropout_rate,
+                seed=ModelHyperParams.dropout_seed,
+                is_test=False)
+            out = fluid.layers.matmul(weights_droped, transpose_v)
+        else:
+            out = fluid.layers.matmul(weights, transpose_v)
+
+        # combine heads
+        if len(out.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
+        final_out = fluid.layers.reshape(
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=False)
+
+        # fc to output
+        proj_out = self._proj_fc(final_out)
+        return proj_out
+
+
+class EncoderSubLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da"):
+
+        super(EncoderSubLayer, self).__init__(name_scope)
+        self._preprocess_cmd = preprocess_cmd
+        self._postprocess_cmd = postprocess_cmd
+        self._prepostprocess_dropout = prepostprocess_dropout
+
+        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
+                                                     self._preprocess_cmd, 3)
+        self._multihead_attention_layer = MultiHeadAttentionLayer(
+            self.full_name(), d_key, d_value, d_model, n_head,
+            attention_dropout)
+        self._postprocess_layer = PrePostProcessLayer(
+            self.full_name(), self._postprocess_cmd, None)
+        self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
+                                                      self._preprocess_cmd, 3)
+        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
+            self.full_name(), d_inner_hid, d_model, relu_dropout)
+        self._postprocess_layer2 = PrePostProcessLayer(
+            self.full_name(), self._postprocess_cmd, None)
+
+    def forward(self, enc_input, attn_bias):
+        pre_process_multihead = self._preprocess_layer(
+            None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
+        attn_output = self._multihead_attention_layer(pre_process_multihead,
+                                                      None, None, attn_bias)
+        attn_output = self._postprocess_layer(enc_input, attn_output,
+                                              self._postprocess_cmd,
+                                              self._prepostprocess_dropout)
+        pre_process2_output = self._preprocess_layer2(
+            None, attn_output, self._preprocess_cmd,
+            self._prepostprocess_dropout)
+        ffd_output = self._positionwise_feed_forward(pre_process2_output)
+        return self._postprocess_layer2(attn_output, ffd_output,
+                                        self._postprocess_cmd,
+                                        self._prepostprocess_dropout)
+
+
+class EncoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da"):
+
+        super(EncoderLayer, self).__init__(name_scope)
+        self._preprocess_cmd = preprocess_cmd
+        self._encoder_sublayers = list()
+        self._prepostprocess_dropout = prepostprocess_dropout
+        self._n_layer = n_layer
+        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
+                                                     self._preprocess_cmd, 3)
+        for i in range(n_layer):
+            self._encoder_sublayers.append(
+                self.add_sublayer(
+                    'esl_%d' % i,
+                    EncoderSubLayer(
+                        self.full_name(), n_head, d_key, d_value, d_model,
+                        d_inner_hid, prepostprocess_dropout, attention_dropout,
+                        relu_dropout, preprocess_cmd, postprocess_cmd)))
+
+    def forward(self, enc_input, attn_bias):
+        for i in range(self._n_layer):
+            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
+            enc_input = enc_output
+
+        return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
+                                      self._prepostprocess_dropout)
+
+
+class PrepareEncoderDecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 src_vocab_size,
+                 src_emb_dim,
+                 src_max_len,
+                 dropout_rate,
+                 word_emb_param_name=None,
+                 pos_enc_param_name=None):
+        super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
+        self._src_max_len = src_max_len
+        self._src_emb_dim = src_emb_dim
+        self._src_vocab_size = src_vocab_size
+        self._dropout_rate = dropout_rate
+        self._input_emb = Embedding(
+            name_scope=self.full_name(),
+            size=[src_vocab_size, src_emb_dim],
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                name=word_emb_param_name,
+                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+
+        if pos_enc_param_name is pos_enc_param_names[0]:
+            pos_inp = pos_inp1
+        else:
+            pos_inp = pos_inp2
+        self._pos_emb = Embedding(
+            name_scope=self.full_name(),
+            size=[self._src_max_len, src_emb_dim],
+            param_attr=fluid.ParamAttr(
+                name=pos_enc_param_name,
+                initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
+                trainable=False))
+
+        # use in imperative_mode to fit different length batch
+        # self._pos_emb._w = to_variable(
+        #     position_encoding_init(self._src_max_len, self._src_emb_dim))
+
+    def forward(self, src_word, src_pos):
+        src_word_emb = self._input_emb(src_word)
+        src_word_emb = fluid.layers.scale(
+            x=src_word_emb, scale=self._src_emb_dim**0.5)
+        # # TODO change this to fit dynamic length input
+        src_pos_emb = self._pos_emb(src_pos)
+        src_pos_emb.stop_gradient = True
+        enc_input = src_word_emb + src_pos_emb
+        return fluid.layers.dropout(
+            enc_input,
+            dropout_prob=self._dropout_rate,
+            seed=ModelHyperParams.dropout_seed,
+            is_test=False) if self._dropout_rate else enc_input
+
+
+class WrapEncoderLayer(Layer):
+    def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head,
+                 d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+                 attention_dropout, relu_dropout, preprocess_cmd,
+                 postprocess_cmd, weight_sharing):
+        """
+        The wrapper assembles together all needed layers for the encoder.
+        """
+        super(WrapEncoderLayer, self).__init__(name_cope)
+
+        self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
+            self.full_name(),
+            src_vocab_size,
+            d_model,
+            max_length,
+            prepostprocess_dropout,
+            word_emb_param_name=word_emb_param_names[0],
+            pos_enc_param_name=pos_enc_param_names[0])
+        self._encoder = EncoderLayer(
+            self.full_name(), n_layer, n_head, d_key, d_value, d_model,
+            d_inner_hid, prepostprocess_dropout, attention_dropout,
+            relu_dropout, preprocess_cmd, postprocess_cmd)
+
+    def forward(self, enc_inputs):
+        src_word, src_pos, src_slf_attn_bias = enc_inputs
+        enc_input = self._prepare_encoder_layer(src_word, src_pos)
+        enc_output = self._encoder(enc_input, src_slf_attn_bias)
+        return enc_output
+
+
+class DecoderSubLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 cache=None,
+                 gather_idx=None):
+        super(DecoderSubLayer, self).__init__(name_scope)
+        self._postprocess_cmd = postprocess_cmd
+        self._preprocess_cmd = preprocess_cmd
+        self._prepostprcess_dropout = prepostprocess_dropout
+        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
+                                                      preprocess_cmd, 3)
+        self._multihead_attention_layer = MultiHeadAttentionLayer(
+            self.full_name(),
+            d_key,
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            cache=cache,
+            gather_idx=gather_idx)
+        self._post_process_layer = PrePostProcessLayer(self.full_name(),
+                                                       postprocess_cmd, None)
+        self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
+                                                       preprocess_cmd, 3)
+        self._multihead_attention_layer2 = MultiHeadAttentionLayer(
+            self.full_name(),
+            d_key,
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            cache=cache,
+            gather_idx=gather_idx,
+            static_kv=True)
+        self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
+                                                        postprocess_cmd, None)
+        self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
+                                                       preprocess_cmd, 3)
+        self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
+            self.full_name(), d_inner_hid, d_model, relu_dropout)
+        self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
+                                                        postprocess_cmd, None)
+
+    def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
+        pre_process_rlt = self._pre_process_layer(
+            None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
+        slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
+                                                          None, slf_attn_bias)
+        slf_attn_output_pp = self._post_process_layer(
+            dec_input, slf_attn_output, self._postprocess_cmd,
+            self._prepostprcess_dropout)
+        pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
+                                                    self._preprocess_cmd,
+                                                    self._prepostprcess_dropout)
+        enc_attn_output_pp = self._multihead_attention_layer2(
+            pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
+        enc_attn_output = self._post_process_layer2(
+            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            self._prepostprcess_dropout)
+        pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
+                                                    self._preprocess_cmd,
+                                                    self._prepostprcess_dropout)
+        ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
+        dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
+                                               self._postprocess_cmd,
+                                               self._prepostprcess_dropout)
+        return dec_output
+
+
+class DecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 caches=None,
+                 gather_idx=None):
+        super(DecoderLayer, self).__init__(name_scope)
+        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
+                                                      preprocess_cmd, 3)
+        self._decoder_sub_layers = list()
+        self._n_layer = n_layer
+        self._preprocess_cmd = preprocess_cmd
+        self._prepostprocess_dropout = prepostprocess_dropout
+        for i in range(n_layer):
+            self._decoder_sub_layers.append(
+                self.add_sublayer(
+                    'dsl_%d' % i,
+                    DecoderSubLayer(
+                        self.full_name(),
+                        n_head,
+                        d_key,
+                        d_value,
+                        d_model,
+                        d_inner_hid,
+                        prepostprocess_dropout,
+                        attention_dropout,
+                        relu_dropout,
+                        preprocess_cmd,
+                        postprocess_cmd,
+                        cache=None if caches is None else caches[i],
+                        gather_idx=gather_idx)))
+
+    def forward(self, dec_input, enc_output, dec_slf_attn_bias,
+                dec_enc_attn_bias):
+        for i in range(self._n_layer):
+            tmp_dec_output = self._decoder_sub_layers[i](
+                dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
+            dec_input = tmp_dec_output
+
+        dec_output = self._pre_process_layer(None, tmp_dec_output,
+                                             self._preprocess_cmd,
+                                             self._prepostprocess_dropout)
+        return dec_output
+
+
+class WrapDecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 trg_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 weight_sharing,
+                 caches=None,
+                 gather_idx=None):
+        """
+        The wrapper assembles together all needed layers for the encoder.
+        """
+        super(WrapDecoderLayer, self).__init__(name_scope)
+
+        self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
+            self.full_name(),
+            trg_vocab_size,
+            d_model,
+            max_length,
+            prepostprocess_dropout,
+            word_emb_param_name=word_emb_param_names[1],
+            pos_enc_param_name=pos_enc_param_names[1])
+        self._decoder_layer = DecoderLayer(
+            self.full_name(),
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            prepostprocess_dropout,
+            attention_dropout,
+            relu_dropout,
+            preprocess_cmd,
+            postprocess_cmd,
+            caches=caches,
+            gather_idx=gather_idx)
+        self._weight_sharing = weight_sharing
+        if not weight_sharing:
+            self._fc = FC(self.full_name(),
+                          size=trg_vocab_size,
+                          bias_attr=False)
+
+    def forward(self, dec_inputs=None, enc_output=None):
+        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
+        dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
+        dec_output = self._decoder_layer(dec_input, enc_output,
+                                         trg_slf_attn_bias, trg_src_attn_bias)
+
+        dec_output_reshape = fluid.layers.reshape(
+            dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
+
+        if self._weight_sharing:
+            predict = fluid.layers.matmul(
+                x=dec_output_reshape,
+                y=self._prepare_decoder_layer._input_emb._w,
+                transpose_y=True)
+        else:
+            predict = self._fc(dec_output_reshape)
+
+        if dec_inputs is None:
+            # Return probs for independent decoder program.
+            predict_out = fluid.layers.softmax(predict)
+            return predict_out
+        return predict
+
+
+class TransFormer(Layer):
+    def __init__(self,
+                 name_scope,
+                 src_vocab_size,
+                 trg_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 weight_sharing,
+                 label_smooth_eps,
+                 use_py_reader=False,
+                 is_test=False):
+        super(TransFormer, self).__init__(name_scope)
+        self._label_smooth_eps = label_smooth_eps
+        self._trg_vocab_size = trg_vocab_size
+        if weight_sharing:
+            assert src_vocab_size == trg_vocab_size, (
+                "Vocabularies in source and target should be same for weight sharing."
+            )
+        self._wrap_encoder_layer = WrapEncoderLayer(
+            self.full_name(), src_vocab_size, max_length, n_layer, n_head,
+            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
+            weight_sharing)
+        self._wrap_decoder_layer = WrapDecoderLayer(
+            self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
+            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
+            weight_sharing)
+
+        if weight_sharing:
+            self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
+
+    def forward(self, enc_inputs, dec_inputs, label, weights):
+        enc_output = self._wrap_encoder_layer(enc_inputs)
+        predict = self._wrap_decoder_layer(dec_inputs, enc_output)
+        if self._label_smooth_eps:
+            label_out = fluid.layers.label_smooth(
+                label=fluid.layers.one_hot(
+                    input=label, depth=self._trg_vocab_size),
+                epsilon=self._label_smooth_eps)
+
+        cost = fluid.layers.softmax_with_cross_entropy(
+            logits=predict,
+            label=label_out,
+            soft_label=True if self._label_smooth_eps else False)
+        weighted_cost = cost * weights
+        sum_cost = fluid.layers.reduce_sum(weighted_cost)
+        token_num = fluid.layers.reduce_sum(weights)
+        token_num.stop_gradient = True
+        avg_cost = sum_cost / token_num
+        return sum_cost, avg_cost, predict, token_num
+
+
+class TestImperativeTransformer(unittest.TestCase):
+    def test_transformer_float32(self):
+        seed = 90
+        with guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            transformer = TransFormer(
+                'transformer',
+                ModelHyperParams.src_vocab_size,
+                ModelHyperParams.trg_vocab_size,
+                ModelHyperParams.max_length + 1,
+                ModelHyperParams.n_layer,
+                ModelHyperParams.n_head,
+                ModelHyperParams.d_key,
+                ModelHyperParams.d_value,
+                ModelHyperParams.d_model,
+                ModelHyperParams.d_inner_hid,
+                ModelHyperParams.prepostprocess_dropout,
+                ModelHyperParams.attention_dropout,
+                ModelHyperParams.relu_dropout,
+                ModelHyperParams.preprocess_cmd,
+                ModelHyperParams.postprocess_cmd,
+                ModelHyperParams.weight_sharing,
+                TrainTaskConfig.label_smooth_eps,
+                use_py_reader=use_py_reader,
+                is_test=False)
+            if sync:
+                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
+                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
+                with fluid.default_main_program()._lr_schedule_guard():
+                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=learning_rate,
+                    beta1=TrainTaskConfig.beta1,
+                    beta2=TrainTaskConfig.beta2,
+                    epsilon=TrainTaskConfig.eps)
+            else:
+                optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            dy_param_init = dict()
+            dy_param_updated = dict()
+            for i in range(batch_num):
+                enc_inputs, dec_inputs, label, weights = create_data()
+                dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
+                    enc_inputs, dec_inputs, label, weights)
+                if i == 0:
+                    for param in transformer.parameters():
+                        dy_param_init[param.name] = param._numpy()
+
+                dy_avg_cost._backward()
+                optimizer.minimize(dy_avg_cost)
+                transformer.clear_gradients()
+                if i == batch_num - 1:
+                    for param in transformer.parameters():
+                        dy_param_updated[param.name] = param._numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            transformer = TransFormer(
+                'transformer',
+                ModelHyperParams.src_vocab_size,
+                ModelHyperParams.trg_vocab_size,
+                ModelHyperParams.max_length + 1,
+                ModelHyperParams.n_layer,
+                ModelHyperParams.n_head,
+                ModelHyperParams.d_key,
+                ModelHyperParams.d_value,
+                ModelHyperParams.d_model,
+                ModelHyperParams.d_inner_hid,
+                ModelHyperParams.prepostprocess_dropout,
+                ModelHyperParams.attention_dropout,
+                ModelHyperParams.relu_dropout,
+                ModelHyperParams.preprocess_cmd,
+                ModelHyperParams.postprocess_cmd,
+                ModelHyperParams.weight_sharing,
+                TrainTaskConfig.label_smooth_eps,
+                use_py_reader=use_py_reader,
+                is_test=False)
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+
+            data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
+                                                                                     -1] + label_data_input_fields
+            all_inputs = make_all_inputs(data_input_names)
+            enc_inputs_len = len(encoder_data_input_fields)
+            dec_inputs_len = len(decoder_data_input_fields[:-1])
+            enc_inputs = all_inputs[0:enc_inputs_len]
+            dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
+                                    dec_inputs_len]
+            label = all_inputs[-2]
+            weights = all_inputs[-1]
+            static_param_updated = dict()
+            static_param_init = dict()
+            static_param_name_list = list()
+            static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
+                enc_inputs, dec_inputs, label, weights)
+
+            optimizer.minimize(static_avg_cost)
+            for param in transformer.parameters():
+                static_param_name_list.append(param.name)
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+            for i in range(len(static_param_name_list)):
+                static_param_init[static_param_name_list[i]] = out[i]
+            static_sum_cost_value = None
+            static_avg_cost_value = None
+            static_predict_value = None
+            static_token_num_value = None
+            for i in range(batch_num):
+                feed_dict = create_feed_dict_list(create_data(True))
+                fetch_list = [
+                    static_sum_cost, static_avg_cost, static_predict,
+                    static_token_num
+                ]
+                fetch_list.extend(static_param_name_list)
+
+                out = exe.run(fluid.default_main_program(),
+                              feed=feed_dict,
+                              fetch_list=fetch_list)
+                static_sum_cost_value = out[0]
+                static_avg_cost_value = out[1]
+                static_predict_value = out[2]
+                static_token_num_value = out[3]
+                if i == batch_num - 1:
+                    for k in range(4, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    4]] = out[k]
+
+        self.assertTrue(
+            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+        self.assertTrue(
+            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+        self.assertTrue(
+            np.allclose(
+                static_predict_value, dy_predict._numpy(), atol=1e-5))
+        self.assertTrue(
+            np.allclose(static_token_num_value, dy_token_num._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            self.assertTrue(np.allclose(value, dy_param_init[key]))
+        for key, value in six.iteritems(static_param_updated):
+            self.assertTrue(
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-4))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index b29ad25870..1429d24107 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -70,6 +70,34 @@ class LayerTest(unittest.TestCase):
 
 
 class TestLayer(LayerTest):
+    def test_layer_norm(self):
+        inp = np.ones([3, 32, 32], dtype='float32')
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            ret = layers.layer_norm(t)
+            static_ret = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret])[0]
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            lm = nn.LayerNorm('layer_norm')
+            ret = lm(t)
+            static_ret2 = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret])[0]
+        with self.dynamic_graph():
+            lm = nn.LayerNorm('layer_norm')
+            dy_ret = lm(base.to_variable(inp))
+
+        self.assertTrue(np.allclose(static_ret, static_ret2))
+        self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
+
     def test_relu(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
@@ -84,6 +112,27 @@ class TestLayer(LayerTest):
 
         self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
 
+    def test_matmul(self):
+        with self.static_graph():
+            t = layers.data(name='t', shape=[3, 3], dtype='float32')
+            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
+            ret = layers.matmul(t, t2)
+            static_ret = self.get_static_graph_result(
+                feed={
+                    't': np.ones(
+                        [3, 3], dtype='float32'),
+                    't2': np.ones(
+                        [3, 3], dtype='float32')
+                },
+                fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            t = np.ones([3, 3], dtype='float32')
+            t2 = np.ones([3, 3], dtype='float32')
+            dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
+
+        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+
     def test_conv2d(self):
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
@@ -112,6 +161,101 @@ class TestLayer(LayerTest):
         self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
+    def test_gru_unit(self):
+        lod = [[2, 4, 3]]
+        D = 5
+        T = sum(lod[0])
+        N = len(lod[0])
+
+        input = np.random.rand(T, 3 * D).astype('float32')
+        hidden_input = np.random.rand(T, D).astype('float32')
+
+        with self.static_graph():
+            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
+            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
+            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
+                input=x, hidden=hidden, size=D * 3)
+            static_ret = self.get_static_graph_result(
+                feed={'x': input,
+                      'hidden': hidden_input},
+                fetch_list=[updated_hidden, reset_hidden_pre, gate])
+
+        with self.static_graph():
+            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
+            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
+            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
+                input=x, hidden=hidden, size=D * 3)
+            gru = nn.GRUUnit('gru', size=D * 3)
+            updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
+
+            static_ret2 = self.get_static_graph_result(
+                feed={'x': input,
+                      'hidden': hidden_input},
+                fetch_list=[updated_hidden, reset_hidden_pre, gate])
+
+        with self.dynamic_graph():
+            gru = nn.GRUUnit('gru', size=D * 3)
+            dy_ret = gru(
+                base.to_variable(input), base.to_variable(hidden_input))
+
+        for i in range(len(static_ret)):
+            self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
+            self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
+
+    def test_elementwise_math(self):
+        n = np.ones([3, 3], dtype='float32')
+        n2 = np.ones([3, 3], dtype='float32') * 1.1
+        n3 = np.ones([3, 3], dtype='float32') * 2
+        n4 = np.ones([3, 3], dtype='float32') * 3
+        n5 = np.ones([3, 3], dtype='float32') * 4
+        n6 = np.ones([3, 3], dtype='float32') * 5
+
+        with self.static_graph():
+            t = layers.data(name='t', shape=[3, 3], dtype='float32')
+            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
+            t3 = layers.data(name='t3', shape=[3, 3], dtype='float32')
+            t4 = layers.data(name='t4', shape=[3, 3], dtype='float32')
+            t5 = layers.data(name='t5', shape=[3, 3], dtype='float32')
+            t6 = layers.data(name='t6', shape=[3, 3], dtype='float32')
+
+            ret = layers.elementwise_add(t, t2)
+            ret = layers.elementwise_pow(ret, t3)
+            ret = layers.elementwise_div(ret, t4)
+            ret = layers.elementwise_sub(ret, t5)
+            ret = layers.elementwise_mul(ret, t6)
+
+            static_ret = self.get_static_graph_result(
+                feed={
+                    't': n,
+                    't2': n2,
+                    't3': n3,
+                    't4': n4,
+                    't5': n5,
+                    't6': n6
+                },
+                fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            ret = layers.elementwise_add(n, n2)
+            ret = layers.elementwise_pow(ret, n3)
+            ret = layers.elementwise_div(ret, n4)
+            ret = layers.elementwise_sub(ret, n5)
+            dy_ret = layers.elementwise_mul(ret, n6)
+        self.assertTrue(
+            np.allclose(static_ret, dy_ret._numpy()),
+            '%s vs %s' % (static_ret, dy_ret._numpy()))
+
+    def test_elementwise_minmax(self):
+        n = np.ones([3, 3], dtype='float32')
+        n2 = np.ones([3, 3], dtype='float32') * 2
+
+        with self.dynamic_graph():
+            min_ret = layers.elementwise_min(n, n2)
+            max_ret = layers.elementwise_max(n, n2)
+
+        self.assertTrue(np.allclose(n, min_ret._numpy()))
+        self.assertTrue(np.allclose(n2, max_ret._numpy()))
+
 
 class TestBook(unittest.TestCase):
     def test_fit_a_line(self):
@@ -1124,6 +1268,14 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_range(self):
+        program = Program()
+        with program_guard(program):
+            layers.range(0, 10, 2, 'int32')
+            layers.range(0.1, 10.0, 0.2, 'float32')
+
+        print(str(program))
+
     def test_spectral_norm(self):
         program = Program()
         with program_guard(program):
@@ -1145,6 +1297,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_fsp(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            y = layers.data(name="Y", shape=[8, 4, 4], dtype="float32")
+            out = layers.fsp_matrix(x, y)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 9548598d75..1f23fae92c 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 
 from __future__ import print_function
+import os
+os.environ['FLAGS_fuse_parameter_memory_size'] = "131072"
+os.environ['FLAGS_fuse_parameter_groups_size'] = "3"
 
 import paddle.fluid as fluid
 import paddle.fluid.layers.ops as ops
@@ -22,7 +25,6 @@ import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
-import os
 import numpy as np
 
 # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
@@ -312,17 +314,59 @@ class TestResnet(TestParallelExecutorBase):
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
 
+    def _compare_with_fused_all_reduce(self,
+                                       model,
+                                       use_cuda,
+                                       iter=20,
+                                       delta2=1e-5):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_bn
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
     def test_seresnext_with_learning_rate_decay(self):
         self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
         self._check_resnet_convergence(
             model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
 
-    def test_seresnext_with_new_strategy(self):
+    def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
             model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
         self._compare_reduce_and_allreduce(
             model=SE_ResNeXt50Small, use_cuda=False, iter=5)
 
+    def test_seresnext_with_fused_all_reduce(self):
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
new file mode 100644
index 0000000000..7607189454
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55"
+
+os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'
+
+from test_parallel_executor_transformer import TestTransformer
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
new file mode 100644
index 0000000000..4efca5e2aa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import math
+import unittest
+import numpy as np
+import os
+
+os.environ['CPU_NUM'] = '1'
+
+
+def random_reader(sample_num):
+    def __impl__():
+        for _ in range(sample_num):
+            yield np.random.random(
+                size=[784]).astype('float32'), np.random.random_integers(
+                    low=0, high=9, size=[1]).astype('int64')
+
+    return paddle.reader.cache(__impl__)
+
+
+class TestCaseBase(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 165
+
+    def generate_all_data(self, reader):
+        ret = []
+        for d in reader():
+            slots = [[], []]
+            for item in d:
+                slots[0].append(item[0])
+                slots[1].append(item[1])
+            slots = [np.array(slot) for slot in slots]
+            ret.append(slots)
+        return ret
+
+    def run_main(self, reader, use_sample_generator, iterable, drop_last):
+        image = fluid.layers.data(name='image', dtype='float32', shape=[784])
+        label = fluid.layers.data(name='label', dtype='int64', shape=[1])
+        py_reader = fluid.io.PyReader(
+            feed_list=[image, label],
+            capacity=16,
+            iterable=iterable,
+            use_double_buffer=False)
+
+        batch_reader = paddle.batch(reader, self.batch_size, drop_last)
+        all_datas = self.generate_all_data(batch_reader)
+
+        if not use_sample_generator:
+            py_reader.decorate_sample_list_generator(
+                batch_reader, places=fluid.cpu_places())
+        else:
+            py_reader.decorate_sample_generator(
+                reader, self.batch_size, drop_last, places=fluid.cpu_places())
+
+        if drop_last:
+            batch_num = int(self.sample_num / self.batch_size)
+        else:
+            batch_num = math.ceil(float(self.sample_num) / self.batch_size)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_startup_program())
+        for _ in range(self.epoch_num):
+            if py_reader.iterable:
+                step = 0
+                for data in py_reader():
+                    img, lbl = exe.run(feed=data, fetch_list=[image, label])
+                    self.assertArrayEqual(img, all_datas[step][0])
+                    self.assertArrayEqual(lbl, all_datas[step][1])
+                    step += 1
+                self.assertEqual(step, len(all_datas))
+            else:
+                step = 0
+                try:
+                    py_reader.start()
+                    while True:
+                        img, lbl = exe.run(fetch_list=[image, label])
+                        self.assertArrayEqual(img, all_datas[step][0])
+                        self.assertArrayEqual(lbl, all_datas[step][1])
+                        step += 1
+                except fluid.core.EOFException:
+                    py_reader.reset()
+                    self.assertEqual(step, len(all_datas))
+                    break
+
+    def assertArrayEqual(self, arr1, arr2):
+        self.assertEqual(arr1.shape, arr2.shape)
+        self.assertTrue((arr1 == arr2).all())
+
+    def test_main(self):
+        reader = random_reader(self.sample_num)
+        for use_sample_generator in [False, True]:
+            for iterable in [False, True]:
+                for drop_last in [False, True]:
+                    with fluid.program_guard(fluid.Program(), fluid.Program()):
+                        self.run_main(reader, use_sample_generator, iterable,
+                                      drop_last)
+
+
+class TestCase1(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 10
+        self.sample_num = 160
+
+
+class TestCase2(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 200
+
+
+class TestCase3(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 159
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py
new file mode 100644
index 0000000000..f129ae78cb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_range.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRangeOp(OpTest):
+    def setUp(self):
+        self.op_type = "range"
+        self.init_config()
+        self.inputs = {
+            'Start': np.array([self.case[0]]).astype(self.dtype),
+            'End': np.array([self.case[1]]).astype(self.dtype),
+            'Step': np.array([self.case[2]]).astype(self.dtype)
+        }
+
+        self.outputs = {
+            'Out': np.arange(self.case[0], self.case[1],
+                             self.case[2]).astype(self.dtype)
+        }
+
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 1, 0.2)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFloatRangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 5, 1)
+
+
+class TestInt32RangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (0, 5, 2)
+
+
+class TestInt32RangeOpCase1(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (10, 1, -2)
+
+
+class TestInt32RangeOpCase2(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (-1, -10, -2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 4e6ed3a74b..aefd8cb6d3 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp):
         self.out = self.input[-3:3, 0:100, :, 2:-1]
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFP16(TestSliceOp):
+    def config(self):
+        self.dtype = "float16"
+        self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['Input'], 'Out', max_relative_error=0.006)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFP16_2(TestSliceOp):
+    def config(self):
+        self.dtype = "float16"
+        self.input = np.random.random([3, 4, 5]).astype(self.dtype)
+        self.starts = [0]
+        self.ends = [1]
+        self.axes = [1]
+        self.out = self.input[:, 0:1, :]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Out',
+                max_relative_error=0.006,
+                numeric_grad_delta=0.5)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
new file mode 100644
index 0000000000..f6a658cb1b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -0,0 +1,159 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import os
+import six
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler
+
+
+class TestSyncBatchNormOpTraining(unittest.TestCase):
+    def setUp(self):
+        #self.dtype = np.float32
+        self.dtype = np.float64
+        self.N = 32
+        self.C = 16
+        self.H = 64
+        self.W = 32
+        self.dshape = [self.N, self.C, self.H, self.W]
+
+    def build_program(self,
+                      place,
+                      layout,
+                      seed,
+                      sync_bn=False,
+                      only_forward=False):
+        main = fluid.Program()
+        startup = fluid.Program()
+        main.random_seed = seed
+        startup.random_seed = seed
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                data = fluid.layers.data(
+                    name='input',
+                    shape=self.dshape,
+                    dtype=self.dtype,
+                    append_batch_size=False)
+                conv = fluid.layers.conv2d(
+                    input=data,
+                    num_filters=32,
+                    filter_size=1,
+                    param_attr=fluid.ParamAttr(name='conv2d_weight'),
+                    bias_attr=False,
+                    use_cudnn=False)
+                bn = fluid.layers.batch_norm(
+                    conv,
+                    param_attr=fluid.ParamAttr(name='bn_scale'),
+                    bias_attr=fluid.ParamAttr(name='bn_bias'),
+                    moving_mean_name='bn_moving_mean',
+                    moving_variance_name='bn_moving_variance',
+                    data_layout=layout,
+                    is_test=only_forward)
+                sigmoid = fluid.layers.sigmoid(bn)
+                out = fluid.layers.reduce_sum(sigmoid)
+                if not sync_bn:
+                    out = out / core.get_cuda_device_count()
+                if not only_forward:
+                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
+                    sgd_opt.backward(out)
+        return main, startup, [out, conv, bn]
+
+    def compare(self, place, layout, only_forward):
+        seed = 10
+        os.environ['FLAGS_cudnn_deterministic'] = "1"
+        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
+        # Single-GPU, N = 32 per GPU
+        main, startup, outs = self.build_program(place, layout, seed, False,
+                                                 only_forward)
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        fetch_names = [v.name for v in outs] + [
+            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
+        ]
+        if not only_forward:
+            others = [
+                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
+                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
+            ]
+            fetch_names += others
+        bn_fetches = exe.run(program=main,
+                             feed={'input': data},
+                             fetch_list=fetch_names)
+
+        #####################################################################
+        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
+        main, startup, outs = self.build_program(place, layout, seed, True,
+                                                 only_forward)
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        fetch_names = [v.name for v in outs] + [
+            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
+        ]
+        if not only_forward:
+            others = [
+                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
+                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
+            ]
+            fetch_names += others
+        for nm in fetch_names:
+            fv = fluid.framework._get_var(str(nm), program=main)
+            fv.persistable = True
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.sync_batch_norm = True
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
+        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
+            outs[0].name if not only_forward else None,
+            build_strategy=build_strategy)
+        sync_bn_fetches = exe.run(program=comp_prog,
+                                  feed={'input': data},
+                                  fetch_list=fetch_names)
+
+        for i in six.moves.xrange(1, len(sync_bn_fetches)):
+            bn_val = bn_fetches[i]
+            sync_bn_val = sync_bn_fetches[i]
+            if sync_bn_val.shape != bn_val.shape:
+                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
+            self.assertTrue(
+                np.allclose(
+                    bn_val, sync_bn_val, atol=1e-3),
+                "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
+                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
+
+    def test_train(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        places = [core.CUDAPlace(0)]
+        for place in places:
+            for layout in ["NCHW", "NHWC"]:
+                self.compare(place, layout, False)
+
+    def test_infer(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        places = [core.CUDAPlace(0)]
+        for place in places:
+            for layout in ["NCHW", "NHWC"]:
+                self.compare(place, layout, True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
new file mode 100644
index 0000000000..416e6ea9f4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -0,0 +1,117 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+from paddle.fluid import core
+
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def YoloBox(x, img_size, attrs):
+    n, c, h, w = x.shape
+    anchors = attrs['anchors']
+    an_num = int(len(anchors) // 2)
+    class_num = attrs['class_num']
+    conf_thresh = attrs['conf_thresh']
+    downsample = attrs['downsample']
+    input_size = downsample * h
+
+    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+
+    pred_box = x[:, :, :, :, :4].copy()
+    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
+    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
+    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
+    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
+
+    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
+    anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1))
+    anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1))
+    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
+    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
+
+    pred_conf = sigmoid(x[:, :, :, :, 4:5])
+    pred_conf[pred_conf < conf_thresh] = 0.
+    pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf
+    pred_box = pred_box * (pred_conf > 0.).astype('float32')
+
+    pred_box = pred_box.reshape((n, -1, 4))
+    pred_box[:, :, :2], pred_box[:, :, 2:4] = \
+        pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \
+        pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0
+    pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis]
+    pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis]
+    pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis]
+    pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis]
+
+    for i in range(len(pred_box)):
+        pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf)
+        pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf)
+        pred_box[i, :, 2] = np.clip(pred_box[i, :, 2], -np.inf,
+                                    img_size[i, 1] - 1)
+        pred_box[i, :, 3] = np.clip(pred_box[i, :, 3], -np.inf,
+                                    img_size[i, 0] - 1)
+
+    return pred_box, pred_score.reshape((n, -1, class_num))
+
+
+class TestYoloBoxOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'yolo_box'
+        x = np.random.random(self.x_shape).astype('float32')
+        img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32')
+
+        self.attrs = {
+            "anchors": self.anchors,
+            "class_num": self.class_num,
+            "conf_thresh": self.conf_thresh,
+            "downsample": self.downsample,
+        }
+
+        self.inputs = {
+            'X': x,
+            'ImgSize': img_size,
+        }
+        boxes, scores = YoloBox(x, img_size, self.attrs)
+        self.outputs = {
+            "Boxes": boxes,
+            "Scores": scores,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        an_num = int(len(self.anchors) // 2)
+        self.batch_size = 32
+        self.class_num = 2
+        self.conf_thresh = 0.5
+        self.downsample = 32
+        self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13)
+        self.imgsize_shape = (self.batch_size, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index 020c113923..e4d6edc72c 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -23,8 +23,8 @@ from op_test import OpTest
 from paddle.fluid import core
 
 
-def l2loss(x, y):
-    return 0.5 * (y - x) * (y - x)
+def l1loss(x, y):
+    return abs(x - y)
 
 
 def sce(x, label):
@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2):
     return inter_area / union
 
 
-def YOLOv3Loss(x, gtbox, gtlabel, attrs):
+def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
     n, c, h, w = x.shape
     b = gtbox.shape[1]
     anchors = attrs['anchors']
@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
     mask_num = len(anchor_mask)
     class_num = attrs["class_num"]
     ignore_thresh = attrs['ignore_thresh']
-    downsample = attrs['downsample']
-    input_size = downsample * h
+    downsample_ratio = attrs['downsample_ratio']
+    use_label_smooth = attrs['use_label_smooth']
+    input_size = downsample_ratio * h
     x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
     loss = np.zeros((n)).astype('float32')
 
+    label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
+    label_neg = 1.0 / class_num if use_label_smooth else 0.0
+
     pred_box = x[:, :, :, :, :4].copy()
     grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
     grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
     pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
     pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
 
-    x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
-                                 np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
-                                 class_num)
-
     mask_anchors = []
     for m in anchor_mask:
         mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
             ty = gtbox[i, j, 1] * w - gj
             tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
             th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
-            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3])
+            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j]
             loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
             loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
-            loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale
-            loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale
+            loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale
+            loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale
 
-            objness[i, an_idx * h * w + gj * w + gi] = 1.0
+            objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
 
             for label_idx in range(class_num):
-                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
-                               float(label_idx == gtlabel[i, j]))
+                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos
+                               if label_idx == gtlabel[i, j] else
+                               label_neg) * gtscore[i, j]
 
         for j in range(mask_num * h * w):
             if objness[i, j] > 0:
-                loss[i] += sce(pred_obj[i, j], 1.0)
+                loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j]
             elif objness[i, j] == 0:
                 loss[i] += sce(pred_obj[i, j], 0.0)
 
@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest):
             "anchor_mask": self.anchor_mask,
             "class_num": self.class_num,
             "ignore_thresh": self.ignore_thresh,
-            "downsample": self.downsample,
+            "downsample_ratio": self.downsample_ratio,
+            "use_label_smooth": self.use_label_smooth,
         }
 
         self.inputs = {
@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest):
             'GTBox': gtbox.astype('float32'),
             'GTLabel': gtlabel.astype('int32'),
         }
-        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
+
+        gtscore = np.ones(self.gtbox_shape[:2]).astype('float32')
+        if self.gtscore:
+            gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32')
+            self.inputs['GTScore'] = gtscore
+
+        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore,
+                                               self.attrs)
         self.outputs = {
             'Loss': loss,
             'ObjectnessMask': objness,
@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest):
 
     def test_check_output(self):
         place = core.CPUPlace()
-        self.check_output_with_place(place, atol=1e-3)
+        self.check_output_with_place(place, atol=2e-3)
 
     def test_check_grad_ignore_gtbox(self):
         place = core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['X'],
-            'Loss',
-            no_grad_set=set(["GTBox", "GTLabel"]),
-            max_relative_error=0.3)
+        self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2)
+
+    def initTestCase(self):
+        self.anchors = [
+            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
+            373, 326
+        ]
+        self.anchor_mask = [0, 1, 2]
+        self.class_num = 5
+        self.ignore_thresh = 0.7
+        self.downsample_ratio = 32
+        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
+        self.gtbox_shape = (3, 5, 4)
+        self.gtscore = True
+        self.use_label_smooth = True
+
+
+class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp):
+    def initTestCase(self):
+        self.anchors = [
+            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
+            373, 326
+        ]
+        self.anchor_mask = [0, 1, 2]
+        self.class_num = 5
+        self.ignore_thresh = 0.7
+        self.downsample_ratio = 32
+        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
+        self.gtbox_shape = (3, 5, 4)
+        self.gtscore = True
+        self.use_label_smooth = False
+
 
+class TestYolov3LossNoGTScore(TestYolov3LossOp):
     def initTestCase(self):
-        self.anchors = [10, 13, 16, 30, 33, 23]
-        self.anchor_mask = [1, 2]
+        self.anchors = [
+            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
+            373, 326
+        ]
+        self.anchor_mask = [0, 1, 2]
         self.class_num = 5
-        self.ignore_thresh = 0.5
-        self.downsample = 32
+        self.ignore_thresh = 0.7
+        self.downsample_ratio = 32
         self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
         self.gtbox_shape = (3, 5, 4)
+        self.gtscore = False
+        self.use_label_smooth = True
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
index 678026cf95..b55a6298f6 100644
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
@@ -38,9 +38,8 @@ items. It can be any function with no parameter that creates a iterable
 Element produced from the iterable should be a **single** entry of data,
 **not** a mini batch. That entry of data could be a single item, or a tuple of
 items.
-Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
-/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
-array of float32, int, list of int)
+Item should be of supported type (e.g., numpy array or list/tuple of float 
+or int).
 
 An example implementation for single item data reader creator:
 
@@ -62,8 +61,6 @@ An example implementation for multiple item data reader creator:
                 yield numpy.random.uniform(-1, 1, size=width*height), label
     return reader
 
-
-TODO(yuyang18): Should we add whole design doc here?
 """
 
 import paddle.reader.decorator
diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py
index c861020225..353aca92f4 100644
--- a/python/paddle/reader/creator.py
+++ b/python/paddle/reader/creator.py
@@ -44,8 +44,11 @@ def text_file(path):
     Creates a data reader that outputs text line by line from given text file.
     Trailing new line ('\\\\n') of each line will be removed.
 
-    :path: path of the text file.
-    :returns: data reader of text file
+    Args:
+        path (str): path of the text file.
+    
+    Returns: 
+        callable: data reader of text file.
     """
 
     def reader():
@@ -59,10 +62,15 @@ def text_file(path):
 
 def recordio(paths, buf_size=100):
     """
-    Creates a data reader from given RecordIO file paths separated by ",",
-        glob pattern is supported.
-    :path: path of recordio files, can be a string or a string list.
-    :returns: data reader of recordio files.
+    Creates a data reader from given RecordIO file paths separated 
+    by ",", glob pattern is supported.
+
+    Args:
+        paths (str|list(str)): path of recordio files.
+        buf_size (int): prefetched buffer size. 
+
+    Returns:
+        callable: data reader of recordio files.
     """
 
     import recordio as rec
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index b2ef9f7580..f8c5ae0eaf 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 __all__ = [
-    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
     'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
     'multiprocess_reader', 'Fake'
 ]
@@ -33,6 +33,30 @@ import zlib
 import paddle.compat as cpt
 
 
+def cache(reader):
+    """
+    Cache the reader data into memory. 
+
+    Be careful that this method may take long time to process, 
+    and consume lots of memory. :code:`reader()` would only 
+    call once. 
+
+    Args:
+        reader (generator): a reader object which yields 
+            data each time.
+
+    Returns:
+        generator: a decorated reader object which yields data from cached memory.
+    """
+    all_data = tuple(reader())
+
+    def __impl__():
+        for item in all_data:
+            yield item
+
+    return __impl__
+
+
 def map_readers(func, *readers):
     """
     Creates a data reader that outputs return value of function using
@@ -242,20 +266,18 @@ class XmapEndSignal():
 
 def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     """
-    Use multiprocess to map samples from reader by a mapper defined by user.
-    And this function contains a buffered decorator.
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param reader: the data reader to read from
-    :type reader: callable
-    :param process_num: process number to handle original sample
-    :type process_num: int
-    :param buffer_size: max buffer size
-    :type buffer_size: int
-    :param order: keep the order of reader
-    :type order: bool
-    :return: the decarated reader
-    :rtype: callable
+    Use multi-threads to map samples from reader by a mapper defined by user.
+
+    Args:
+        mapper (callable): a function to map the data from reader.
+        reader (callable): a data reader which yields the data. 
+        process_num (int): thread number to handle original sample.
+        buffer_size (int): size of the queue to read data in. 
+        order (bool): whether to keep the data order from original reader. 
+            Default False.
+
+    Returns:
+        callable: a decorated reader with data mapping. 
     """
     end = XmapEndSignal()
 
@@ -477,7 +499,7 @@ class PipeReader:
         """
         :param cut_lines: cut buffer to lines
         :type cut_lines: bool
-        :param line_break: line break of the file, like \n or \r
+        :param line_break: line break of the file, like '\\\\n' or '\\\\r'
         :type line_break: string
 
         :return: one line or a buffer of bytes
diff --git a/python/requirements.txt b/python/requirements.txt
index 36bd5d4261..ce56462fac 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,3 +12,4 @@ six
 funcsigs
 pyyaml
 decorator
+prettytable
diff --git a/python/setup.py.in b/python/setup.py.in
index a7c1e91f9c..9f87f5644f 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -117,6 +117,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.graph',
           'paddle.fluid.contrib.slim.prune',
           'paddle.fluid.contrib.slim.quantization',
+          'paddle.fluid.contrib.slim.distillation',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
diff --git a/tools/diff_api.py b/tools/diff_api.py
index ec51711d68..fe6a2aa819 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -30,6 +30,6 @@ if error:
         '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
     1. cd ${paddle_path}, compile paddle;
     2. pip install build/python/dist/(build whl package);
-    3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
+    3. run "python tools/print_signatures.py paddle.fluid,paddle.reader > paddle/fluid/API.spec"'''
     )
     sys.exit(1)
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index c2fd743f62..c37a9a92e6 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -52,7 +52,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 
-RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
index 1b0059a8c6..3be94a42d5 100644
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -153,3 +153,9 @@ done
 
 # Restore LD_LIBRARY_PATH
 LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
+
+# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html
+# we should install new version ar with 64-bit supported here
+wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz
+tar xzf binutils-2.27.tar.gz && cd binutils-2.27
+./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index c56f30f724..d32b247342 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
             all = (args, doc)
             member_dict[cur_name] = all
         except TypeError:  # special for PyBind method
+            if cur_name in check_modules_list:
+                return
             member_dict[cur_name] = "  ".join([
                 line.strip() for line in pydoc.render_doc(member).split('\n')
                 if "->" in line
@@ -78,6 +80,7 @@ def visit_all_module(mod):
             visit_member(mod.__name__, instance)
 
 
+check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
 modules = sys.argv[1].split(",")
 for m in modules:
     visit_all_module(importlib.import_module(m))
diff --git a/tools/timeline.py b/tools/timeline.py
index 7879666417..44c1c09b80 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
         event['args'] = args
         self._events.append(event)
 
+    def emit_counter(self, category, name, pid, timestamp, counter, value):
+        """Emits a record for a single counter.
+
+        Args:
+            category: The event category as string
+            name: The event name as string
+            pid: Identifier of the process generating this event as integer
+            timestamp: The timestamps of this event as long integer
+            counter: Name of the counter as string
+            value: Value of the counter as integer
+            tid: Thread id of the allocation as integer
+        """
+        event = self._create_event('C', category, name, pid, 0, timestamp)
+        event['args'] = {counter: value}
+        self._events.append(event)
+
     def format_to_string(self, pretty=False):
         """Formats the chrome trace to a string.
 
@@ -117,6 +133,7 @@ class Timeline(object):
         self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
+        self._mem_devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
 
     def _allocate_pid(self):
@@ -143,6 +160,47 @@ class Timeline(object):
                         self._devices[(k, event.device_id, "GPUKernel")] = pid
                         self._chrome_trace.emit_pid("%s:gpu:%d" %
                                                     (k, event.device_id), pid)
+            if not hasattr(profile_pb, "mem_events"):
+                continue
+            for mevent in profile_pb.mem_events:
+                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
+                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
+                            pid)
+                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
+                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
+                            pid)
+                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
+                    if (k, mevent.device_id, "CUDAPinnedPlace"
+                        ) not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id,
+                                           "CUDAPinnedPlace")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:cudapinnedplace:%d" %
+                            (k, mevent.device_id), pid)
+                if (k, 0, "CPU") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "CPU")] = pid
+                    self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
+                                                (k, 0), pid)
+                if (k, 0, "GPU") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "GPU")] = pid
+                    self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
+                                                (k, 0), pid)
+                if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
+                    self._chrome_trace.emit_pid(
+                        "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
 
     def _allocate_events(self):
         for k, profile_pb in six.iteritems(self._profile_dict):
@@ -155,7 +213,7 @@ class Timeline(object):
                 args = {'name': event.name}
                 if event.memcopy.bytes > 0:
                     args['mem_bytes'] = event.memcopy.bytes
-                if event.detail_info:
+                if hasattr(event, "detail_info") and event.detail_info:
                     args['detail_info'] = event.detail_info
                 # TODO(panyx0718): Chrome tracing only handles ms. However, some
                 # ops takes micro-seconds. Hence, we keep the ns here.
@@ -163,9 +221,59 @@ class Timeline(object):
                     event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
                     event.sub_device_id, 'Op', event.name, args)
 
+    def _allocate_memory_event(self):
+        if not hasattr(profiler_pb2, "MemEvent"):
+            return
+        place_to_str = {
+            profiler_pb2.MemEvent.CPUPlace: "CPU",
+            profiler_pb2.MemEvent.CUDAPlace: "GPU",
+            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
+        }
+        for k, profile_pb in six.iteritems(self._profile_dict):
+            mem_list = []
+            end_profiler = 0
+            for mevent in profile_pb.mem_events:
+                crt_info = dict()
+                crt_info['time'] = mevent.start_ns
+                crt_info['size'] = mevent.bytes
+                if mevent.place in place_to_str:
+                    place = place_to_str[mevent.place]
+                else:
+                    place = "UnDefine"
+                crt_info['place'] = place
+                pid = self._mem_devices[(k, mevent.device_id, place)]
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                mem_list.append(crt_info)
+                crt_info = dict()
+                crt_info['place'] = place
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                crt_info['time'] = mevent.end_ns
+                crt_info['size'] = -mevent.bytes
+                mem_list.append(crt_info)
+                end_profiler = max(end_profiler, crt_info['time'])
+            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
+            i = 0
+            total_size = 0
+            while i < len(mem_list):
+                total_size += mem_list[i]['size']
+                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
+                        i + 1]['time']:
+                    total_size += mem_list[i + 1]['size']
+                    i += 1
+
+                self._chrome_trace.emit_counter(
+                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
+                    0, total_size)
+                i += 1
+
     def generate_chrome_trace(self):
         self._allocate_pids()
         self._allocate_events()
+        self._allocate_memory_event()
         return self._chrome_trace.format_to_string()