From 22715487dc222bc52aa88beb8b41137f7724068f Mon Sep 17 00:00:00 2001
From: zhhsplendid <zhhsplendid@gmail.com>
Date: Tue, 19 Mar 2019 11:39:47 +0000
Subject: [PATCH 01/11] add allocator flags test=develop

---
 CMakeLists.txt                                |   2 +
 paddle/fluid/API.spec                         |  21 +-
 .../fluid/framework/details/graph_test_base.h |  10 +-
 paddle/fluid/framework/details/op_registry.h  |   6 +-
 paddle/fluid/framework/grad_op_desc_maker.h   |   8 +-
 paddle/fluid/framework/ir/CMakeLists.txt      |   9 +-
 .../fluid/framework/ir/cpu_quantize_pass.cc   | 239 ++++++++++++++
 paddle/fluid/framework/ir/cpu_quantize_pass.h |  66 ++++
 .../framework/ir/cpu_quantize_pass_tester.cc  | 211 +++++++++++++
 .../ir/cpu_quantize_placement_pass.cc         |  58 ++++
 .../ir/cpu_quantize_placement_pass.h          |  34 ++
 .../ir/cpu_quantize_placement_pass_tester.cc  | 129 ++++++++
 .../framework/ir/graph_pattern_detector.cc    |  51 ++-
 .../framework/ir/graph_pattern_detector.h     |  29 ++
 paddle/fluid/framework/ir/graph_test.cc       |  14 +-
 .../ir/runtime_context_cache_pass.cc          |  39 +++
 .../framework/ir/runtime_context_cache_pass.h |  32 ++
 paddle/fluid/framework/op_desc.cc             |   4 +-
 paddle/fluid/framework/operator.cc            |  28 +-
 paddle/fluid/framework/operator.h             |  11 +
 paddle/fluid/framework/tensor_util.cc         |   5 +
 paddle/fluid/framework/type_defs.h            |   3 +-
 paddle/fluid/framework/var_type_inference.h   | 117 ++++++-
 .../framework/var_type_inference_test.cc      |  12 +-
 paddle/fluid/imperative/CMakeLists.txt        |   1 +
 paddle/fluid/imperative/layer.cc              | 100 ++++--
 paddle/fluid/imperative/layer.h               | 201 ++++++++++--
 paddle/fluid/imperative/profiler.cc           |  62 ++++
 paddle/fluid/imperative/profiler.h            |  25 ++
 paddle/fluid/imperative/tracer.cc             |  78 ++---
 paddle/fluid/imperative/tracer.h              |   2 +-
 paddle/fluid/imperative/type_defs.h           |   1 +
 paddle/fluid/inference/CMakeLists.txt         |   2 +-
 paddle/fluid/inference/analysis/argument.h    |   6 +
 .../inference/analysis/ir_pass_manager.cc     |  11 +-
 paddle/fluid/inference/api/analysis_config.cc |  17 +-
 .../inference/api/paddle_analysis_config.h    |  26 ++
 .../fluid/inference/tests/api/CMakeLists.txt  |   2 +-
 .../tests/api/analyzer_pyramid_dnn_tester.cc  |   1 +
 .../tests/api/analyzer_transformer_tester.cc  |  20 +-
 .../inference/tests/api/config_printer.h      |   3 +-
 .../allocation/allocator_facade_test.cc       |  37 ++-
 .../memory/allocation/legacy_allocator.cc     |  20 +-
 paddle/fluid/memory/detail/buddy_allocator.cc |  26 +-
 paddle/fluid/memory/detail/buddy_allocator.h  |   2 +
 .../fluid/memory/detail/system_allocator.cc   |  20 +-
 paddle/fluid/operators/CMakeLists.txt         |   6 +-
 .../fluid/operators/beam_search_decode_op.cc  |  21 +-
 paddle/fluid/operators/beam_search_op.cc      |  15 +-
 paddle/fluid/operators/concat_op.cc           |  16 +-
 .../operators/controlflow/get_places_op.cc    |   8 +-
 .../controlflow/tensor_array_read_write_op.cc |  15 +-
 .../fluid/operators/controlflow/while_op.cc   |  17 +-
 paddle/fluid/operators/conv_op.cc             |   7 +
 .../fluid/operators/detection/CMakeLists.txt  |   1 +
 .../fluid/operators/detection/yolo_box_op.cc  | 167 ++++++++++
 .../fluid/operators/detection/yolo_box_op.cu  | 120 +++++++
 .../fluid/operators/detection/yolo_box_op.h   | 149 +++++++++
 .../operators/distributed_ops/fake_init_op.cc |   3 +-
 .../operators/distributed_ops/merge_ids_op.cc |   9 +-
 .../operators/distributed_ops/split_ids_op.cc |  11 +-
 paddle/fluid/operators/fc_op.cc               |  27 +-
 paddle/fluid/operators/fc_op.h                |  16 +
 paddle/fluid/operators/fill_constant_op.cc    |   9 +-
 .../fused/fused_embedding_seq_pool_op.cc      |  17 +-
 .../get_tensor_from_selected_rows_op.cc       |  15 +-
 paddle/fluid/operators/hash_op.cc             |   3 +-
 .../operators/hierarchical_sigmoid_op.cc      |  24 +-
 paddle/fluid/operators/lod_rank_table_op.cc   |   8 +-
 .../fluid/operators/lod_tensor_to_array_op.cc |   7 +-
 paddle/fluid/operators/lookup_table_op.cc     |  14 +-
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  |   1 +
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc |  24 +-
 .../operators/mkldnn/transpose_mkldnn_op.cc   |  28 +-
 paddle/fluid/operators/nccl/nccl_op.cc        |   9 +-
 paddle/fluid/operators/nce_op.cc              |  14 +-
 .../operators/ngraph/ngraph_engine_op.cc      |   3 +-
 paddle/fluid/operators/optimizers/adam_op.h   |  49 +--
 .../operators/optimizers/lars_momentum_op.cc  |   7 +-
 .../fluid/operators/optimizers/momentum_op.cc |  18 +-
 .../fluid/operators/optimizers/momentum_op.h  |  19 +-
 .../fluid/operators/optimizers/rmsprop_op.h   |  18 +-
 paddle/fluid/operators/optimizers/sgd_op.cc   |  14 +-
 paddle/fluid/operators/pool_op.cc             |   7 +
 paddle/fluid/operators/py_func_op.cc          |  41 ++-
 .../reader/create_custom_reader_op.cc         |  23 +-
 paddle/fluid/operators/reader/read_op.cc      |  17 +-
 .../operators/reader/reader_op_registry.cc    |  21 +-
 .../operators/reader/reader_op_registry.h     |   8 +-
 paddle/fluid/operators/save_op.cc             |   9 +-
 paddle/fluid/operators/scale_op.cc            |  15 +-
 .../sequence_ops/sequence_enumerate_op.cc     |  10 +-
 .../sequence_ops/sequence_enumerate_op.h      |  39 ++-
 paddle/fluid/operators/slice_op.cu            |  14 +-
 .../softmax_with_cross_entropy_op.cu          |   3 +-
 .../fluid/operators/split_selected_rows_op.cc |   9 +-
 paddle/fluid/operators/squeeze_op.cc          |   1 +
 paddle/fluid/operators/sum_op.cc              |  32 +-
 .../operators/tensor_array_to_tensor_op.cc    |   7 +-
 .../operators/tensorrt/tensorrt_engine_op.cc  |   3 +-
 paddle/fluid/operators/uniform_random_op.cc   |  15 +-
 paddle/fluid/platform/device_context.cc       |   2 +
 paddle/fluid/platform/device_context.h        |   4 +
 paddle/fluid/platform/gpu_info.cc             |  61 +++-
 paddle/fluid/platform/gpu_info.h              |   6 +
 paddle/fluid/pybind/CMakeLists.txt            |   2 +-
 paddle/fluid/pybind/imperative.cc             |   6 +-
 paddle/fluid/pybind/inference_api.cc          |   4 +
 paddle/fluid/pybind/pybind.cc                 |   8 +-
 paddle/testing/paddle_gtest_main.cc           |   2 +
 python/paddle/fluid/__init__.py               |   6 +-
 .../fluid/contrib/utils/lookup_table_utils.py | 294 ++++++++++++++----
 python/paddle/fluid/data_feeder.py            |   6 +-
 python/paddle/fluid/executor.py               |  20 +-
 python/paddle/fluid/framework.py              |   5 +
 python/paddle/fluid/imperative/__init__.py    |   4 +
 python/paddle/fluid/imperative/profiler.py    |  30 ++
 python/paddle/fluid/layers/detection.py       |  78 +++++
 python/paddle/fluid/layers/nn.py              |  67 +++-
 python/paddle/fluid/tests/test_detection.py   |  10 +
 .../mkldnn/test_transpose_int8_mkldnn_op.py   |  78 +++++
 .../tests/unittests/test_imperative_gnn.py    | 144 +++++++++
 .../fluid/tests/unittests/test_layers.py      |  75 +++++
 .../fluid/tests/unittests/test_slice_op.py    |  26 ++
 .../fluid/tests/unittests/test_yolo_box_op.py | 117 +++++++
 python/paddle/reader/__init__.py              |   7 +-
 python/paddle/reader/creator.py               |  20 +-
 python/paddle/reader/decorator.py             |  28 +-
 tools/manylinux1/build_scripts/build.sh       |   6 +
 129 files changed, 3392 insertions(+), 708 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass.cc
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass.h
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
 create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
 create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.cc
 create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.h
 create mode 100644 paddle/fluid/imperative/profiler.cc
 create mode 100644 paddle/fluid/imperative/profiler.h
 create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cc
 create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cu
 create mode 100644 paddle/fluid/operators/detection/yolo_box_op.h
 create mode 100644 python/paddle/fluid/imperative/profiler.py
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_gnn.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_yolo_box_op.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e7ffe72b5..6bb0e5f51f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
         "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+message(STATUS "AR tools: ${CMAKE_AR}")
+
 if(WIN32)
     set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index fdd23681af..9a6d0d1c08 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -12,7 +12,7 @@ paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], va
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor
 paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee'))
 paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29'))
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e'))
 paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a'))
 paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6'))
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d'))
@@ -331,6 +331,7 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
 paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
+paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
@@ -392,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'
 paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
 paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
-paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7'))
-paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196'))
-paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8'))
+paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
+paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
+paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
 paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
 paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
@@ -493,7 +494,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne
 paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753'))
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62'))
 paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca'))
 paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85'))
 paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -517,11 +518,11 @@ paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', de
 paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
 paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
 paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
-paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560'))
+paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
 paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69'))
+paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
 paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
 paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
-paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a'))
-paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8'))
+paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2'))
+paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2'))
diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h
index 126959bcd8..d139f84883 100644
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ b/paddle/fluid/framework/details/graph_test_base.h
@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
 
 class DummyVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
-    auto& inputs = op_desc.Input("X");
-    auto type = block->Var(inputs.front())->GetType();
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(type);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& inputs = ctx->Input("X");
+    auto type = ctx->GetType(inputs.front());
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, type);
   }
 };
 
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 0901e59f97..e13ff99f3f 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <string>
 #include <tuple>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
@@ -127,9 +129,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
 template <typename T>
 struct OpInfoFiller<T, kVarTypeInference> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
+    info->infer_var_type_ = [](InferVarTypeContext* context) {
       T inference;
-      inference(fwd_op, block);
+      inference(context);
     };
   }
 };
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index 9bccb1a32b..f2f4c53eea 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
@@ -55,11 +57,11 @@ class GradOpDescMakerBase {
                    std::back_inserter(ret_val),
                    [this](const std::string& fwd_var_name) -> std::string {
                      auto g_name = GradVarName(fwd_var_name);
-                     if (no_grad_set_.count(g_name)) {
-                       return kEmptyVarName;
-                     } else {
+                     if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
                        (*this->grad_to_var_)[g_name] = fwd_var_name;
                        return g_name;
+                     } else {
+                       return kEmptyVarName;
                      }
                    });
     if (!drop_empty_grad) {
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index faf7768a7b..a79a53867d 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,6 +46,8 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
+pass_library(cpu_quantize_placement_pass base)
+pass_library(cpu_quantize_pass inference)
 pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
@@ -68,6 +70,7 @@ pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
+pass_library(runtime_context_cache_pass base)
 
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -102,8 +105,12 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
 cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
+if(NOT WIN32)
+    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+endif()
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
new file mode 100644
index 0000000000..edfaf47f01
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -0,0 +1,239 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace {
+
+void UnlinkNodes(ir::Node* a, ir::Node* b) {  // drops every a -> b edge
+  auto& outs = a->outputs;
+  auto& ins = b->inputs;
+  outs.erase(std::remove(outs.begin(), outs.end(), b), outs.end());
+  ins.erase(std::remove(ins.begin(), ins.end(), a), ins.end());
+}
+
+}  // namespace
+
+enum { U8_MAX = 255, S8_MAX = 127 };
+
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
+using string::PrettyLogDetail;
+
+void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
+                                    std::string input_name, double scale_to_one,
+                                    bool is_unsigned,
+                                    std::string scale_attr_name) const {
+  // Inserts a quantize op and its output variable between `input` and `op`.
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;  // 255 or 127
+  float scale = scale_to_one * max;  // value written to the "Scale" attribute
+  // Create the variable node that will hold the quantize op's output.
+  VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+  auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+
+  // Create a quantize op node reading `input` and writing that variable.
+  OpDesc q_desc;
+  q_desc.SetType("quantize");
+  q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
+  q_desc.SetOutput("Output",
+                   std::vector<std::string>({quantize_out_node->Name()}));
+  q_desc.SetAttr("Scale", scale);
+  q_desc.SetAttr("is_negative_input", !is_unsigned);
+  auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
+
+  // Make op's `input_name` slot read the quantized variable.
+  op->Op()->SetInput(input_name,
+                     std::vector<std::string>({quantize_out_node->Name()}));
+
+  // Rewire the graph: input -> quantize_op -> quantize_out_node -> op.
+  UnlinkNodes(input, op);
+  IR_NODE_LINK_TO(input, quantize_op);
+  IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+  IR_NODE_LINK_TO(quantize_out_node, op);
+  // When a scale attribute name is given, store the same scale on op as well.
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
+                                       std::string output_name,
+                                       double scale_to_one, bool is_unsigned,
+                                       std::string scale_attr_name) const {
+  // Inserts a dequantize op and its input variable between `op` and `output`.
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;  // 255 or 127
+  float scale = scale_to_one * max;  // value written to the "Scale" attribute
+  // Create the variable node that op will write and the dequantize op reads.
+  VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+  auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+
+  // Create a dequantize op node reading that variable and writing `output`.
+  OpDesc deq_desc;
+  deq_desc.SetType("dequantize");
+  deq_desc.SetInput("Input",
+                    std::vector<std::string>({dequantize_in_node->Name()}));
+  deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
+  deq_desc.SetAttr("Scale", scale);
+  auto dequantize_op = g->CreateOpNode(&deq_desc);  // OpDesc will be copied.
+
+  // Make op's `output_name` slot write the new intermediate variable.
+  op->Op()->SetOutput(output_name,
+                      std::vector<std::string>({dequantize_in_node->Name()}));
+
+  // Rewire the graph: op -> dequantize_in_node -> dequantize_op -> output.
+  UnlinkNodes(op, output);
+  IR_NODE_LINK_TO(op, dequantize_in_node);
+  IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+  IR_NODE_LINK_TO(dequantize_op, output);
+  // When a scale attribute name is given, store the same scale on op as well.
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::QuantizeConv(Graph* graph,
+                                   bool with_residual_data) const {
+  // Quantizes every conv2d matched by patterns::ConvResidual whose
+  // use_quantizer attribute is true; with_residual_data selects the variant.
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::ConvResidual conv_pattern{pattern, name_scope_};
+  conv_pattern(with_residual_data);
+  int quantize_conv_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize conv2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    auto* conv_op_desc = conv_op->Op();
+    // skip if should not be quantized
+    if (!conv_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[conv_input->Name()].first;
+    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
+                  is_input_unsigned, "Scale_in");
+
+    // Scale into a local vector: the tensor buffer is presumably shared with
+    // the "quant_var_scales" attribute, so it must not be multiplied in place.
+    auto filter_scale_tensor = scales[conv_filter->Name()].second;
+    const double* filter_data = filter_scale_tensor.data<double>();
+    std::vector<float> filter_scale(filter_scale_tensor.numel());
+    for (size_t i = 0; i < filter_scale.size(); ++i)
+      filter_scale[i] = static_cast<float>(filter_data[i] * S8_MAX);
+
+    conv_op->Op()->SetAttr("Scale_weights", filter_scale);
+
+    if (with_residual_data) {
+      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
+                                conv_pattern);
+      auto residual_scale =
+          scales[conv_residual_data->Name()].second.data<double>()[0];
+      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
+
+      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
+                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
+    }
+
+    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[conv_output->Name()].first;
+    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
+                     is_output_unsigned, "Scale_out");
+
+    ++quantize_conv_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_conv_count);
+
+  std::stringstream msg_ss;
+  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
+  if (with_residual_data) msg_ss << " with residual connection";
+  PrettyLogDetail(msg_ss.str().c_str());
+}
+
+void CPUQuantizePass::QuantizePool(Graph* graph) const {
+  // Quantizes every matched pool2d whose use_quantizer attribute is true.
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Pool pool_pattern{pattern, name_scope_};
+  pool_pattern();
+  int quantize_pool_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize pool2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
+    auto* pool_op_desc = pool_op->Op();
+
+    // Skip ops that lack use_quantizer or have it set to false.
+    if (!pool_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+    // No scale attribute name is passed below, so pool_op gets no scale attr.
+    auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[pool_input->Name()].first;
+    QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
+
+    auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[pool_output->Name()].first;
+    DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
+                     is_output_unsigned);
+
+    ++quantize_pool_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_pool_count);
+
+  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
+}
+
+std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Quantizing the graph.";
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  // The pass refuses to run without a parameter scope attached to the graph.
+  PADDLE_ENFORCE(param_scope());
+  // Convs with ResidualData first, then those without a fused residual.
+  QuantizeConv(graph.get(), true /* with_residual_data */);
+  QuantizeConv(graph.get());
+  QuantizePool(graph.get());
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
+    .RequirePassAttr("quant_var_scales");
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.h b/paddle/fluid/framework/ir/cpu_quantize_pass.h
new file mode 100644
index 0000000000..9873bb04e1
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
+ * bool denotes whether quantization of the variable should be done to unsigned
+ * type.
+ */
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
+
+/*
+ * Quantize conv2d and pool2d operators whose use_quantizer attribute is true.
+ */
+class CPUQuantizePass : public FusePassBase {
+ public:
+  virtual ~CPUQuantizePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+  // Quantizes conv2d ops; the flag selects the ResidualData pattern variant.
+  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
+  // Quantizes pool2d ops.
+  void QuantizePool(Graph* graph) const;
+  // Inserts a quantize op between `input` and `op` (slot `input_name`).
+  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
+                     double scale_to_one, bool is_unsigned,
+                     std::string scale_attr_name = "") const;
+  // Inserts a dequantize op between `op` (slot `output_name`) and `output`.
+  void DequantizeOutput(Graph* g, Node* op, Node* output,
+                        std::string output_name, double scale_to_one,
+                        bool is_unsigned,
+                        std::string scale_attr_name = "") const;
+
+  const std::string name_scope_{"quantize"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
new file mode 100644
index 0000000000..89601be7d1
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           bool use_quantizer = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  if (type == "conv2d") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2)
+      op->SetInput("Bias", {inputs[2]});
+    else
+      op->SetInput("Bias", {});
+    if (inputs.size() > 3) {
+      op->SetInput("ResidualData", {inputs[3]});
+      op->SetAttr("fuse_residual_connection", true);
+    } else {
+      op->SetInput("ResidualData", {});
+      op->SetAttr("fuse_residual_connection", false);
+    }
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("Scale_in", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
+    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+  } else if (type == "dropout") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+  } else if (type == "fc") {
+    op->SetInput("Input", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
+    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
+    op->SetOutput("Out", {outputs[0]});
+  }
+}
+
+static const std::initializer_list<std::string> variable_names{
+    "a", "w1", "c",  "d", "w2", "e",  "f", "g",
+    "h", "w3", "b1", "i", "j",  "w4", "b2"};
+// (a,w1)->Conv1->c and c->Pool1->d
+//
+// (d,w2)->Conv2->e and e->Pool2->f
+//
+// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
+//
+// (c,w4, b2)->Conv4->i
+ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v.find("w") == 0 || v.find("b") == 0) {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
+        use_quantizer);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
+
+  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
+        use_quantizer);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
+
+  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
+  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
+  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
+        use_quantizer);
+
+  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
+        use_quantizer);
+
+  return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto x = scope->Var(var_name);
+  auto tensor = x->GetMutable<LoDTensor>();
+  tensor->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int quant_count, int dequant_count, int added_nodes_count,
+              float scale) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  // Init scope, as it is used in pass
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  exe.CreateVariables(prog, 0, true, &scope);
+
+  auto* scales = new VarQuantScale();
+
+  for (auto& v : variable_names) {
+    InitTensorHolder(&scope, place, v.c_str());
+    LoDTensor tensor;
+    tensor.Resize({1});
+    auto* ptr = tensor.mutable_data<double>(place);
+    ptr[0] = 2.0;
+
+    (*scales)[v] = std::make_pair(false, std::move(tensor));
+  }
+
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
+  pass->Set("quant_var_scales", scales);
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int conv2d_nodes_count = 0;
+  int pool2d_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        conv2d_nodes_count++;
+        auto op_name = boost::get<std::string>(op->GetAttr("name"));
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
+            << "Scale_in for node '" + op_name + "'.";
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
+            << "Scale_out for node '" + op_name + "'.";
+        EXPECT_EQ(
+            boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
+            scale)
+            << "Scale_weights for node '" + op_name + "'.";
+      } else if (op->Type() == "pool2d") {
+        pool2d_nodes_count++;
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(conv2d_nodes_count, conv_count);
+  EXPECT_EQ(pool2d_nodes_count, pool_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, quantize) {
+  bool use_mkldnn = true;
+  bool use_quantizer = true;
+  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
+  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
+  //
+  // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
+  // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
+  //
+  // d->Dropout1->g and g->Fc1->h and
+  // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
+  //
+  // (c->QUANT7->IN7,w4, b2)->Conv4->OUT6->DEQUANT6->i
+  // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
+  int added_nodes = 7 + 7 + 6 + 6;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
+           2.0f * 127);  // scale 2.0 set in MainTest, signed range => 127
+}
+
+TEST(CpuQuantizePass, do_not_quantize) {
+  bool use_mkldnn = true;
+  bool use_quantizer = false;
+  int added_nodes = 0;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
+           1.0f);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_pass);
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
new file mode 100644
index 0000000000..50bbe4915b
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include <string>
+#include <unordered_set>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // Sets use_quantizer=true on every op that supports the attribute, is not
+  // listed in quantize_excluded_op_ids and whose type is enabled (an empty
+  // quantize_enabled_op_types set enables all types).
+  VLOG(3) << "Marks operators which are to be quantized.";
+  const auto& excluded_ids_list =
+      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
+  for (const Node* n : graph->Nodes()) {
+    if (!n->IsOp()) continue;
+    // Hash lookups instead of std::find: a linear scan over an unordered_set
+    // throws away its constant-time membership test.
+    if (excluded_ids_list.count(n->id()) != 0) continue;
+    auto* op = n->Op();
+    if (!op->HasAttr("use_quantizer") && !op->HasProtoAttr("use_quantizer"))
+      continue;
+    // The node name is matched against the enabled operator types.
+    if (op_types_list.empty() || op_types_list.count(n->Name()) != 0) {
+      op->SetAttr("use_quantizer", true);
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_placement_pass,
+              paddle::framework::ir::CPUQuantizePlacementPass)
+    // std::unordered_set<std::string> of op types to quantize ("conv2d" etc.)
+    .RequirePassAttr("quantize_enabled_op_types")
+    // std::unordered_set<int> of node ids to be excluded from quantization
+    .RequirePassAttr("quantize_excluded_op_ids");
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
new file mode 100644
index 0000000000..ef3861b249
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Specifies which operators should be quantized.
+ */
+class CPUQuantizePlacementPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
new file mode 100644
index 0000000000..5a4d622645
--- /dev/null
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+
+#include <gtest/gtest.h>
+#include <boost/logic/tribool.hpp>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           boost::tribool use_quantizer) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+
+  if (!boost::indeterminate(use_quantizer))
+    op->SetAttr("use_quantizer", use_quantizer);
+
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+  op->SetOutput("Out", {outputs[0]});
+}
+
+// operator                      use_quantizer
+// ---------------------------------------
+// (a,b)->concat->c              none
+// (c,weights,bias)->conv->f     false
+// f->relu->g                    none
+// g->pool->h                    false
+// (h,weights2,bias2)->conv->k   false
+// k->pool->l                    false
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  // NOTE(review): weights2/bias2 are not marked persistable below - confirm.
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+  // boost::indeterminate leaves use_quantizer unset on the op (see SetOp).
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
+  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
+  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
+
+  return prog;
+}
+
+void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
+              std::initializer_list<int> quantize_excluded_op_ids,
+              unsigned expected_use_quantizer_true_count) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
+  pass->Set("quantize_enabled_op_types",
+            new std::unordered_set<std::string>(quantize_enabled_op_types));
+  pass->Set("quantize_excluded_op_ids",
+            new std::unordered_set<int>(quantize_excluded_op_ids));
+
+  graph = pass->Apply(std::move(graph));
+
+  unsigned use_quantizer_true_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->HasAttr("use_quantizer") &&
+          boost::get<bool>(op->GetAttr("use_quantizer"))) {
+        ++use_quantizer_true_count;
+      }
+    }
+  }
+
+  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+}
+
+TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
+
+TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
+  MainTest({"conv2d"}, {4}, 1);
+}
+
+TEST(QuantizerPlacementPass, excluded_none) {
+  // 2 conv + 2 pool
+  MainTest({}, {}, 4);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_placement_pass);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 08354b526a..b653e5a521 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph,
   ValidateByNodeRole(&subgraphs);
 
   if (subgraphs.empty()) return;
-  PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
+  PrettyLogEndl(Style::detail(), "---  detected %d subgraphs",
+                subgraphs.size());
   int id = 0;
   for (auto &g : subgraphs) {
     VLOG(3) << "optimizing #" << id++ << " subgraph";
@@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() {
                         ->AsOutput()
                         ->assert_is_op_output("conv2d", "Output");
 
-  conv_op->LinksFrom({input_var, filter_var});
-  conv_op->LinksTo({output_var});
+  conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
+  return output_var;
+}
+
+PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  // The plain variant only matches convs with no fused residual connection.
+  if (!with_residual_data)
+    conv_op->assert_op_attr("fuse_residual_connection", false);
+
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+
+  std::vector<PDNode *> links_from{input_var, filter_var};
+  // The residual variant additionally requires a ResidualData input.
+  if (with_residual_data) {
+    auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
+                            ->AsInput()
+                            ->assert_is_op_input("conv2d", "ResidualData");
+    links_from.push_back(res_conn_var);
+  }
+  // Returns the pattern node of the conv "Output" variable.
+  conv_op->LinksFrom(links_from).LinksTo({output_var});
+  return output_var;
+}
+
+PDNode *patterns::Pool::operator()() {
+  auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
+
+  auto input_var = pattern->NewNode(pool_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("pool2d", "X");
+
+  auto output_var = pattern->NewNode(pool_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("pool2d", "Out");
 
+  pool_op->LinksFrom({input_var}).LinksTo({output_var});
   return output_var;
 }
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 3db4bba10d..fc30b5b21c 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -659,6 +659,35 @@ struct Conv : public PatternBase {
   PATTERN_DECL_NODE(conv_output);
 };
 
+// conv2d op with Input/Filter and, on request, a ResidualData input.
+struct ConvResidual : public PatternBase {
+  ConvResidual(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_residual") {}
+  // with_residual_data adds the ResidualData input to the pattern.
+  PDNode* operator()(bool with_residual_data);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);  // only created when requested
+  PATTERN_DECL_NODE(conv_output);
+};
+
+// Pool op
+// Forward pass for pooling.
+// pool_input is the input.
+// pool_output is a result of the operator.
+struct Pool : public PatternBase {
+  Pool(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "pooling") {}
+
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(pool_op);
+  PATTERN_DECL_NODE(pool_input);
+  PATTERN_DECL_NODE(pool_output);
+};
+
 // ElementwiseAdd used in residual connections.
 // y_var is used and convolution output.
 // The operator is removed, when residual
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 7ed2f96eb2..a95588a57b 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
 
@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker {
 
 class DummyOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
new file mode 100644
index 0000000000..67b29512c4
--- /dev/null
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Applies Runtime Context Cache strategy.";
+  for (const Node* n : graph->Nodes()) {
+    // Guard: skip op nodes whose Op() is null (presumably no OpDesc attached).
+    if (!n->IsOp() || n->Op() == nullptr) continue;
+    n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(runtime_context_cache_pass,
+              paddle::framework::ir::RuntimeContextCachePass);
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
new file mode 100644
index 0000000000..a6cf1a9ae5
--- /dev/null
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class RuntimeContextCachePass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 0e7b0cbeb9..8f9c6cb5e9 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
 namespace framework {
@@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const {
   // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
-    info.infer_var_type_(*this, block);
+    InferVarTypeContext context(this, block);
+    info.infer_var_type_(&context);
   }
 }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 44821aadf6..ab96201b33 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -874,9 +874,23 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
   return kernel_configs;
 }
 
+// Returns the context to run with. runtime_ctx_ always owns it, so the
+// non-cached path no longer hands out an unowned `new` allocation; the context
+// is rebuilt unless caching is enabled and the scope matches the previous call.
+RuntimeContext* OperatorWithKernel::GetRuntimeContext(
+    const Scope& scope) const {
+  const Scope* cur_scope = &scope;
+  if (!HasAttr(kEnableCacheRuntimeContext) || !runtime_ctx_ ||
+      pre_scope_ != cur_scope) {
+    runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+    pre_scope_ = cur_scope;
+  }
+  return runtime_ctx_.get();
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeContext ctx(Inputs(), Outputs(), scope);
+  auto runtime_ctx = GetRuntimeContext(scope);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -891,7 +905,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   OpKernelMap& kernels = kernels_iter->second;
 
   auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -915,8 +929,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
+  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
+                                     &transfered_inplace_vars, runtime_ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
@@ -927,13 +941,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 
   if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
-    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(
-      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
+                                       *runtime_ctx, kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 822bf5c9ce..ca5f0e27b3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
+/// RuntimeContext relates the input/output names of an Operator to the
+/// corresponding variables in a Scope.
+/// If an Op has the attribute kEnableCacheRuntimeContext, its RuntimeContext
+/// is created on the first run and reused for as long as the Op keeps running
+/// in the same Scope, because its input/output names do not change during
+/// execution. This saves the time of rebuilding the context on every run.
+constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
+
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@@ -456,6 +464,7 @@ class OperatorWithKernel : public OperatorBase {
   // same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
+  RuntimeContext* GetRuntimeContext(const Scope& scope) const;
 
   /**
    * Transfer data from scope to a transfered scope. If there is no data need to
@@ -474,6 +483,8 @@ class OperatorWithKernel : public OperatorBase {
 
  protected:
   mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index a7f09df491..5f21dae605 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
               << dst_place;
       return;
     }
+#ifdef PADDLE_WITH_MKLDNN
+    if (src.layout() == DataLayout::kMKLDNN) {
+      dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
+    }
+#endif
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index d02c699b97..f55520901c 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -27,6 +27,7 @@ namespace framework {
 class OperatorBase;
 class OpDesc;
 class InferShapeContext;
+class InferVarTypeContext;
 class BlockDesc;
 class Variable;
 
@@ -53,7 +54,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
     const std::vector<BlockDesc*>& grad_block)>;
 
 using InferVarTypeFN =
-    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+    std::function<void(framework::InferVarTypeContext* /*context*/)>;
 
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 64236b78d2..2e9c64d3e6 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -21,26 +23,123 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OpDesc;
+class BlockDesc;
+// Default InferVarTypeContext: answers queries through an OpDesc and BlockDesc.
+class InferVarTypeContext {
+ public:
+  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
+      : op_(op), block_(block) {}
+
+  virtual ~InferVarTypeContext() {}
+
+  virtual Attribute GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->GetAttr(name);
+  }
+
+  virtual bool HasVar(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindVarRecursive(name) != nullptr;
+  }
+
+  virtual bool HasInput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Inputs().count(name) > 0;
+  }
+
+  virtual bool HasOutput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Outputs().count(name) > 0;
+  }
+
+  virtual const std::vector<std::string>& Input(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Input(name);
+  }
+
+  virtual const std::vector<std::string>& Output(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Output(name);
+  }
+
+  virtual proto::VarType::Type GetType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetType();
+  }
+
+  virtual void SetType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetType(type);
+  }
+
+  virtual proto::VarType::Type GetDataType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataType();
+  }
+
+  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataType(type);
+  }
+
+  virtual std::vector<proto::VarType::Type> GetDataTypes(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataTypes();
+  }
+
+  virtual void SetDataTypes(
+      const std::string& name,
+      const std::vector<proto::VarType::Type>& multiple_data_type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type);
+  }
+
+  virtual std::vector<int64_t> GetShape(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetShape();
+  }
+
+  virtual void SetShape(const std::string& name,
+                        const std::vector<int64_t>& dims) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetShape(dims);
+  }
+
+  virtual int32_t GetLoDLevel(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetLoDLevel();
+  }
+
+  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level);
+  }
+
+ protected:
+  const OpDesc* op_;
+  BlockDesc* block_;
+};
+
 class VarTypeInference {
  public:
   virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
+  virtual void operator()(InferVarTypeContext* context) const = 0;  // NOLINT
 };
 
 class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const final {
+  void operator()(framework::InferVarTypeContext* ctx) const final {  // NOLINT
     auto in_out_var_names = this->GetInputOutputWithSameType();
 
     for (auto& i_o_n : in_out_var_names) {
-      auto& x_name = op_desc.Input(i_o_n.first).at(0);
-      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+      auto& x_name = ctx->Input(i_o_n.first).at(0);
+      auto& out_name = ctx->Output(i_o_n.second).at(0);
 
-      auto& x = block->FindRecursiveOrCreateVar(x_name);
-      auto& out = block->FindRecursiveOrCreateVar(out_name);
-      out.SetType(x.GetType());
-      out.SetDataType(x.GetDataType());
+      ctx->SetType(out_name, ctx->GetType(x_name));
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
     }
   }
 
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 2a75394fca..6bbb25a573 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
 }  // namespace framework
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index ec8dedd605..0d116a6495 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_PYTHON)
 cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
+cc_library(imperative_profiler SRCS profiler.cc)
 endif()
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 5530823b90..3d1de95f58 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -214,13 +214,11 @@ framework::LoDTensor& VarBase::GradValue() {
 }
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    VLOG(3) << "op with no grad: " << Type();
-    return {};
-  }
+  PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
+                 "%s has no backward implementation", Type());
 
   VLOG(3) << "apply op grad: " << Type();
-  std::vector<framework::VariableValueMap> tmp_grad_outputs;
+  std::vector<VarBasePtrMap> tmp_grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
     tmp_grad_outputs.resize(1);
@@ -239,30 +237,66 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
       VLOG(3) << "apply grad op " << grad_op_desc->Type();
 
       // Allocate tmp grad output variable
-      for (auto it : grad_output_variable_map) {
+      for (const auto& it : grad_output_variable_map) {
         auto& outputs = tmp_grad_outputs[k][it.first];
         outputs.reserve(it.second.size());
         for (size_t i = 0; i < it.second.size(); ++i) {
+          VarBase* origin_grad_var_base = it.second[i];
+
           // Allocate a new variable
-          Variable* tmp_var = new framework::Variable();
-          tmp_var->GetMutable<framework::LoDTensor>();
-          outputs.emplace_back(tmp_var);
+          VarBase* tmp_grad_var_base = new VarBase(
+              string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
+              origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
+              place_, true, false);
+          outputs.emplace_back(tmp_grad_var_base);
         }
       }
 
-      // Run grad op
-      framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]);
-
       // No need to do compile time infer shape here.
       // grad_op_desc_->InferShape(*block_);
       // grad_op_desc->InferVarType(block_);
 
       std::unique_ptr<framework::OperatorBase> opbase =
           framework::OpRegistry::CreateOp(*grad_op_desc);
+
+      auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
+      if (info.infer_var_type_) {
+        RuntimeInferVarTypeContext infer_var_type_ctx(
+            &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_);
+        info.infer_var_type_(&infer_var_type_ctx);
+      }
+
       framework::OperatorWithKernel* op_kernel =
           dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
       PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
 
+      // Run grad op
+      framework::VariableValueMap grad_invars_map;
+      framework::VariableValueMap grad_outvars_map;
+
+      for (const auto& it : grad_input_vars_[k]) {
+        auto& grad_invars = grad_invars_map[it.first];
+        grad_invars.reserve(it.second.size());
+        for (const VarBase* grad_inp : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
+                                  grad_op_desc->Type(), grad_inp->Name());
+
+          grad_invars.emplace_back(grad_inp->var_);
+        }
+      }
+
+      for (const auto& it : tmp_grad_outputs[k]) {
+        auto& grad_outvars = grad_outvars_map[it.first];
+        grad_outvars.reserve(it.second.size());
+        for (VarBase* grad_out : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
+                                  grad_op_desc->Type(), grad_out->Name());
+
+          grad_outvars.emplace_back(grad_out->var_);
+        }
+      }
+
+      framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
       framework::Scope scope;
       PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
       p.op.RuntimeInferShape(scope, place_, ctx);
@@ -273,14 +307,14 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
 
   // Add tmp grad outputs to original grad vars
   for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (auto it : grad_output_vars_[k]) {
+    for (const auto& it : grad_output_vars_[k]) {
       auto& outputs = tmp_grad_outputs[k][it.first];
-      auto& origin_outputs = it.second;
+      const auto& origin_outputs = it.second;
       PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
 
       for (size_t i = 0; i < outputs.size(); ++i) {
-        framework::Variable* grad = outputs[i];
-        framework::Variable* orig_grad = origin_outputs[i];
+        framework::Variable* grad = outputs[i]->var_;
+        framework::Variable* orig_grad = origin_outputs[i]->var_;
         AddTo(grad, orig_grad, place_);
         delete grad;
       }
@@ -328,28 +362,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
 
 int PyLayer::NumFuncs() { return py_funcs_.size(); }
 
-std::vector<Variable*> PyLayer::Apply(int func_id,
-                                      const std::vector<VarBase*>& inputs) {
-  std::vector<framework::Variable*> invars;
-  for (const VarBase* in : inputs) {
-    invars.push_back(in->var_);
-  }
+std::vector<framework::Variable*> PyLayer::Apply(
+    int func_id, const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], invars);
+  return CallPythonFunc(py_funcs_[func_id], inputs);
 }
 
-std::vector<Variable*> PyLayer::ApplyGrad(
-    int func_id, const std::vector<framework::Variable*>& inputs) {
+std::vector<VarBase*> PyLayer::ApplyGrad(int func_id,
+                                         const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], inputs);
+  auto rets = CallPythonFunc(py_funcs_[func_id], inputs);
+
+  std::vector<VarBase*> outs;
+  outs.reserve(rets.size());
+  for (size_t i = 0U; i != rets.size(); ++i) {
+    outs.emplace_back(new VarBase(
+        string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut),
+                        i),
+        rets[i], nullptr, true));
+  }
+
+  return outs;
 }
 
 std::vector<framework::Variable*> PyLayer::CallPythonFunc(
-    const py::object& callable, const std::vector<framework::Variable*>& ins) {
+    const py::object& callable, const std::vector<VarBase*>& ins) {
   py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
-    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
+    const framework::LoDTensor& t = ins[i]->var_->Get<framework::LoDTensor>();
     in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
   }
   VLOG(3) << "pyfunc in " << py::len(in_args);
@@ -359,6 +400,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
   auto ret_tuple = py::cast<py::tuple>(ret);
   size_t ret_num = py::len(ret_tuple);
   std::vector<framework::Variable*> outs;
+  outs.reserve(ret_num);
   VLOG(3) << "pyfunc out " << ret_num;
   for (size_t i = 0; i < ret_num; ++i) {
     try {
@@ -369,7 +411,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
       auto* tensor = var->GetMutable<framework::LoDTensor>();
       tensor->ShareDataWith(*py_out_tensor);
       tensor->set_lod(py_out_tensor->lod());
-      outs.push_back(var);
+      outs.emplace_back(var);
     } catch (py::cast_error&) {
       PADDLE_THROW("The %d-th output must be LoDTensor", i);
     }
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 618a5b7a03..72c548d5e9 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -18,14 +18,16 @@
 #include "paddle/fluid/framework/python_headers.h"
 // clang-format on
 
-#include <map>     // NOLINT
-#include <string>  // NOLINT
-#include <vector>  // NOLINT
-#include <memory>  // NOLINT
+#include <map>            // NOLINT
+#include <string>         // NOLINT
+#include <vector>         // NOLINT
+#include <memory>         // NOLINT
+#include <unordered_map>  // NOLINT
 
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -135,13 +137,13 @@ class VarBase {
                 persistable) {}
 
  private:
+  // TODO(minqiyang): need support SelectedRows
   VarBase(const std::string& name, framework::proto::VarType::Type dtype,
           const framework::DDim& shape, const platform::Place& place,
           framework::Variable* var, VarBase* grad, bool stop_gradient,
           bool persistable)
       : name_(name),
-        dtype_(dtype),
-        place_(place),
+        type_(framework::proto::VarType::LOD_TENSOR),
         var_(var),
         grads_(grad),
         stop_gradient_(stop_gradient),
@@ -151,10 +153,12 @@ class VarBase {
         pre_op_out_idx_(-1) {
     if (!var_) {
       var_ = new framework::Variable();
-      auto tensor = var_->GetMutable<framework::LoDTensor>();
-      tensor->Resize(shape);
-      tensor->mutable_data(place_, dtype_);
     }
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->Resize(shape);
+    tensor->mutable_data(place, dtype);
+    VLOG(10) << "create varbase: " << name_ << " type: " << dtype
+             << " place: " << place;
   }
 
  public:
@@ -184,7 +188,23 @@ class VarBase {
     }
   }
 
-  inline framework::proto::VarType::Type DType() const { return dtype_; }
+  inline framework::DDim Dims() const {
+    return var_->Get<framework::LoDTensor>().dims();
+  }
+
+  // data type, e.g. FP32
+  inline void SetDataType(framework::proto::VarType::Type type) {
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(tensor->place(), type);
+  }
+  // Element dtype of the held LoDTensor (read in place, without copying it).
+  inline framework::proto::VarType::Type DataType() const {
+    return var_->Get<framework::LoDTensor>().type();
+  }
+
+  // tensor type, e.g. LoDTensor
+  inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
+  inline framework::proto::VarType::Type Type() const { return type_; }
 
   inline void SetStopGradient(bool stop_gradient) {
     stop_gradient_ = stop_gradient;
@@ -238,7 +258,7 @@ class VarBase {
   }
 
   std::string name_;
-  framework::proto::VarType::Type dtype_;
+  framework::proto::VarType::Type type_;
   platform::Place place_;
 
   framework::Variable* var_;
@@ -294,17 +314,23 @@ class PYBIND11_HIDDEN OpBase {
 
   void InvokeBackwardHooks();
 
-  void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
-    if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
-      VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
-              << inp_name;
-      pre_ops_[inp_name].push_back(inp_var->PreOp());
-      pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
-    } else {
-      VLOG(3) << "no pre op in slot " << inp_name
-              << " input var stop_gradient: " << inp_var->IsStopGradient();
-      pre_ops_[inp_name].push_back(nullptr);
-      // pre_ops_out_idx_[inp_name].push_back(-1);
+  void TrackPreOp(const std::string& inp_name,
+                  const std::vector<VarBase*>& inputs) {
+    auto& pre_ops_list = pre_ops_[inp_name];
+    pre_ops_list.reserve(inputs.size());
+    auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
+    for (VarBase* inp_var : inputs) {
+      if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
+        VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
+                << inp_name;
+        pre_ops_list.emplace_back(inp_var->PreOp());
+        pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
+      } else {
+        VLOG(3) << "no pre op in slot " << inp_name
+                << " input var stop_gradient: " << inp_var->IsStopGradient();
+        pre_ops_list.emplace_back(nullptr);
+        // pre_ops_out_idx_list.push_back(-1);
+      }
     }
   }
 
@@ -328,11 +354,13 @@ class PYBIND11_HIDDEN OpBase {
   std::map<std::string, std::vector<int>> pre_ops_out_idx_;
 
   // Inputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_input_vars_;
+  std::vector<VarBasePtrMap> grad_input_vars_;
   // Outputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_output_vars_;
+  std::vector<VarBasePtrMap> grad_output_vars_;
 
   std::vector<py::object> backward_hooks_;
+
+  framework::AttributeMap attrs_;
 };
 
 class Layer {
@@ -359,12 +387,131 @@ class PyLayer {
   static std::vector<framework::Variable*> Apply(
       int func_id, const std::vector<VarBase*>& inputs);
 
-  static std::vector<framework::Variable*> ApplyGrad(
-      int func_id, const std::vector<framework::Variable*>& inputs);
+  static std::vector<VarBase*> ApplyGrad(int func_id,
+                                         const std::vector<VarBase*>& inputs);
 
  private:
   static std::vector<framework::Variable*> CallPythonFunc(
-      const py::object& callable, const std::vector<framework::Variable*>& ins);
+      const py::object& callable, const std::vector<VarBase*>& ins);
+};
+
+// InferVarTypeContext backed by live VarBase inputs/outputs (imperative mode).
+class PYBIND11_HIDDEN RuntimeInferVarTypeContext
+    : public framework::InferVarTypeContext {
+ public:
+  RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs,
+                             imperative::VarBasePtrMap* outputs,
+                             const framework::AttributeMap* attrs_map)
+      : InferVarTypeContext(nullptr, nullptr),
+        inputs_(inputs),
+        outputs_(outputs),
+        attrs_(attrs_map) {
+    PADDLE_ENFORCE_NOT_NULL(inputs_);  // both maps are dereferenced below
+    PADDLE_ENFORCE_NOT_NULL(outputs_);
+    input_names_.reserve(inputs_->size());
+    for (auto& it : *inputs_) {
+      auto& names = input_names_[it.first];  // keep empty slots queryable
+      for (imperative::VarBase* var : it.second) {
+        names.emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+    output_names_.reserve(outputs_->size());
+    for (auto& it : *outputs_) {
+      auto& names = output_names_[it.first];  // keep empty slots queryable
+      for (imperative::VarBase* var : it.second) {
+        names.emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+  }
+
+  virtual ~RuntimeInferVarTypeContext() {}
+
+  framework::Attribute GetAttr(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(attrs_);
+    return attrs_->at(name);
+  }
+
+  bool HasVar(const std::string& name) const override {
+    return var_set_.count(name) > 0;
+  }
+
+  bool HasInput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(inputs_);
+    return inputs_->count(name) > 0;
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(outputs_);
+    return outputs_->count(name) > 0;
+  }
+
+  const std::vector<std::string>& Input(
+      const std::string& name) const override {
+    return input_names_.at(name);
+  }
+
+  const std::vector<std::string>& Output(
+      const std::string& name) const override {
+    return output_names_.at(name);
+  }
+
+  framework::proto::VarType::Type GetType(
+      const std::string& name) const override {
+    return var_set_.at(name)->Type();
+  }
+
+  void SetType(const std::string& name,
+               framework::proto::VarType::Type type) override {
+    var_set_.at(name)->SetType(type);  // throws on unknown name
+  }
+
+  framework::proto::VarType::Type GetDataType(
+      const std::string& name) const override {
+    return var_set_.at(name)->DataType();
+  }
+
+  void SetDataType(const std::string& name,
+                   framework::proto::VarType::Type type) override {
+    var_set_.at(name)->SetDataType(type);  // throws on unknown name
+  }
+
+  std::vector<framework::proto::VarType::Type> GetDataTypes(
+      const std::string& name) const override {
+    PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType");
+  }
+
+  void SetDataTypes(const std::string& name,
+                    const std::vector<framework::proto::VarType::Type>&
+                        multiple_data_type) override {
+    PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType");
+  }
+
+  std::vector<int64_t> GetShape(const std::string& name) const override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+
+  void SetShape(const std::string& name,
+                const std::vector<int64_t>& dims) override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+
+  int32_t GetLoDLevel(const std::string& name) const override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+
+  void SetLoDLevel(const std::string& name, int32_t lod_level) override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+
+ private:
+  const imperative::VarBasePtrMap* inputs_;  // not owned
+  imperative::VarBasePtrMap* outputs_;       // not owned
+  const framework::AttributeMap* attrs_;     // not owned; checked on use
+  std::unordered_map<std::string, std::vector<std::string>> input_names_;
+  std::unordered_map<std::string, std::vector<std::string>> output_names_;
+  std::unordered_map<std::string, imperative::VarBase*> var_set_;  // by name
+};
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
new file mode 100644
index 0000000000..34570b3a60
--- /dev/null
+++ b/paddle/fluid/imperative/profiler.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/profiler.h"
+
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
+
+DEFINE_string(
+    tracer_profile_fname, "xxgperf",  // NOTE(review): default is non-empty
+    "Profiler filename for imperative tracer, which is generated by gperftools."
+    " Only valid when compiled `WITH_PROFILER=ON`. Empty to disable.");
+
+namespace paddle {
+namespace imperative {
+
+static std::once_flag gTracerProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gTracerProfilerStarted = false;
+#endif
+
+// One-shot profiler start (std::call_once); no-op when the flag is empty.
+void StartProfile() {
+  if (FLAGS_tracer_profile_fname.empty()) return;
+  std::call_once(gTracerProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+    ProfilerStart(FLAGS_tracer_profile_fname.c_str());
+    gTracerProfilerStarted = true;
+#else
+    LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                    "FLAGS_tracer_profile_fname will be ignored";
+#endif
+  });
+}
+
+void StopProfile() {  // flush only; ProfilerStop is not called here
+#ifdef WITH_GPERFTOOLS
+  if (gTracerProfilerStarted) ProfilerFlush();  // skip if never started
+#else
+  LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                  "FLAGS_tracer_profile_fname will be ignored";
+#endif
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h
new file mode 100644
index 0000000000..d52aeed4e8
--- /dev/null
+++ b/paddle/fluid/imperative/profiler.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace imperative {
+
+extern void StartProfile();
+
+extern void StopProfile();
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 7ee92b4d8c..0cfdea030e 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -19,38 +19,26 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-
-DEFINE_string(
-    tracer_profile_fname, "",
-    "Profiler filename for imperative tracer, which generated by gperftools."
-    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
-
 namespace paddle {
 namespace imperative {
 
-static std::once_flag gTracerProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gTracerProfilerStarted = false;
-#endif
-
 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
                   std::vector<framework::OpDesc*>* grad_op_descs,
                   std::unordered_map<std::string, std::string>* grad_to_var) {
   PADDLE_ENFORCE(grad_op_descs->empty());
-  std::vector<std::unique_ptr<framework::OpDesc>> descs =
-      framework::OpInfoMap::Instance()
-          .Get(op_desc.Type())
-          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  const framework::OpInfo& op_info =
+      framework::OpInfoMap::Instance().Get(op_desc.Type());
+  if (!op_info.grad_op_maker_) return;
 
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
+      op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
   for (auto& desc : descs) {
     grad_op_descs->emplace_back(desc.release());
   }
@@ -145,31 +133,13 @@ framework::VariableNameMap CreateOutputVarNameMap(
   return result;
 }
 
-Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
-  if (!FLAGS_tracer_profile_fname.empty()) {
-    std::call_once(gTracerProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
-      gTracerProfilerStarted = true;
-#else
-      LOG(WARNING) << "Paddle is not compiled with gperftools. "
-                      "FLAGS_tracer_profile_fname will be ignored";
-#endif
-    });
-  }
-}
+Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
 
 std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
-                                    const VarBasePtrMap& outputs,
+                                    VarBasePtrMap* outputs,
                                     framework::AttributeMap attrs_map,
                                     const platform::Place expected_place,
                                     const bool stop_gradient) {
-#ifdef WITH_GPERFTOOLS
-  if (gTracerProfilerStarted) {
-    ProfilerFlush();
-  }
-#endif
-
   framework::VariableValueMap invars_map;
   framework::VariableValueMap outvars_map;
 
@@ -184,7 +154,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                               inp->Name());
 
       invars.emplace_back(inp->var_);
-      op->TrackPreOp(inp, it.first);
       if (!stop_gradient) {
         current_vars_map[inp->Name()] = inp;
       }
@@ -192,9 +161,10 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
               << " inited: " << inp->var_->IsInitialized()
               << " stop_grad: " << inp->IsStopGradient();
     }
+    op->TrackPreOp(it.first, it.second);
   }
 
-  op->output_vars_ = outputs;
+  op->output_vars_ = *outputs;
   for (auto it : op->output_vars_) {
     auto& outvars = outvars_map[it.first];
     const std::vector<VarBase*>& outputs = it.second;
@@ -217,7 +187,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   framework::VariableNameMap invars_name_map =
       CreateInputVarNameMap(op, inputs);
   framework::VariableNameMap outvars_name_map =
-      CreateOutputVarNameMap(op, outputs);
+      CreateOutputVarNameMap(op, *outputs);
 
   auto& info = framework::OpInfoMap::Instance().Get(op->Type());
   if (info.Checker() != nullptr) {
@@ -228,6 +198,11 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
                                       outvars_name_map, attrs_map);
 
+  if (info.infer_var_type_) {
+    RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map);
+    info.infer_var_type_(&infer_var_type_ctx);
+  }
+
   // TODO(minqiyang): Support infer var type in imperative mode
   // Run forward op
   VLOG(3) << "tracer running " << op->Type();
@@ -252,6 +227,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
     VLOG(5) << "start construct backward op";
 
     // construct grad op descs
+    op->attrs_ = attrs_map;
     std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
         op->Type(), invars_name_map, outvars_name_map, attrs_map));
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
@@ -278,12 +254,12 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
             auto fwd_var_it = current_vars_map.find(grad_invar);
             PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
             // Forward inputs or outputs.
-            grad_in_vars.emplace_back(fwd_var_it->second->var_);
+            grad_in_vars.emplace_back(fwd_var_it->second);
           } else {
             VarBase* var = current_vars_map[var_it->second];
             InitGrad(var, prepared_op.GetDeviceContext());
             // Douts.
-            grad_in_vars.emplace_back(var->grads_->var_);
+            grad_in_vars.emplace_back(var->grads_);
           }
 
           vars_saved_for_backward.insert(it.first);
@@ -300,7 +276,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                          op->Type());
           VarBase* var = current_vars_map[var_it->second];
           InitGrad(var, prepared_op.GetDeviceContext());
-          grad_out_vars.push_back(var->grads_->var_);
+          grad_out_vars.push_back(var->grads_);
         }
       }
     }
@@ -319,9 +295,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   std::vector<framework::Variable*> ret_vars =
       PyLayer::Apply(op->forward_id_, inputs);
 
-  for (VarBase* inp : inputs) {
-    op->TrackPreOp(inp, PyLayer::kFwdInp);
-  }
+  op->TrackPreOp(PyLayer::kFwdInp, inputs);
 
   std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
   outputs.reserve(ret_vars.size());
@@ -342,23 +316,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     auto& grad_output_vars =
         op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
 
-    for (const VarBase* inp : inputs) {
-      grad_input_vars.push_back(inp->var_);
+    for (VarBase* inp : inputs) {
+      grad_input_vars.push_back(inp);
     }
     for (VarBase* out : outputs) {
-      grad_input_vars.push_back(out->var_);
+      grad_input_vars.push_back(out);
     }
 
     // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
     platform::CPUPlace place;
     for (VarBase* out : outputs) {
       InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
-      grad_input_vars.push_back(out->grads_->var_);
+      grad_input_vars.push_back(out->grads_);
     }
 
     for (VarBase* inp : inputs) {
       InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
-      grad_output_vars.push_back(inp->grads_->var_);
+      grad_output_vars.push_back(inp->grads_);
     }
   }
   return outputs;
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 7b65d55e9e..a87f3b8009 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -48,7 +48,7 @@ class Tracer {
   virtual ~Tracer() {}
 
   std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
-                              const VarBasePtrMap& outputs,
+                              VarBasePtrMap* outputs,  // NOLINT
                               framework::AttributeMap attrs_map,
                               const platform::Place expected_place,
                               const bool stop_gradient = false);
diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h
index fc9e42f8d0..c51ce931de 100644
--- a/paddle/fluid/imperative/type_defs.h
+++ b/paddle/fluid/imperative/type_defs.h
@@ -25,6 +25,7 @@ class VarBase;
 class OpBase;
 
 typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
+typedef std::map<std::string, std::vector<const VarBase*>> ConstVarBasePtrMap;
 typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
 
 }  // namespace imperative
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 762640d6d1..d27ef8fe3c 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -91,5 +91,5 @@ if(WITH_TESTING)
   add_subdirectory(tests/book)
   if(WITH_INFERENCE_API_TEST)
     add_subdirectory(tests/api)
-  endif()  
+  endif()
 endif()
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 89e934ae27..321deccf86 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -27,6 +27,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
@@ -38,7 +39,10 @@
 namespace paddle {
 namespace inference {
 namespace analysis {
+
 using framework::ir::Graph;
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
 
 /*
  * The argument definition of both Pass and PassManagers.
@@ -127,6 +131,8 @@ struct Argument {
   // Pass a set of op types to enable its mkldnn kernel
   DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                       std::unordered_set<std::string>);
+  // Scales for variables to be quantized
+  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
 
   // Passed from config.
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 1cdb4881fb..8fd86b2cc5 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument,
                                   ".dot";
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
-    }
-    if (pass_name == "mkldnn_placement_pass") {
+    } else if (pass_name == "mkldnn_placement_pass") {
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(
                     argument->mkldnn_enabled_op_types()));
-    }
-
-    if (pass_name == "tensorrt_subgraph_pass") {
+    } else if (pass_name == "cpu_quantize_pass") {
+      pass->Set("quant_var_scales",
+                new VarQuantScale(argument->quant_var_scales()));
+    } else if (pass_name == "tensorrt_subgraph_pass") {
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7741111222..4cad8a9dfc 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -118,6 +118,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
   CP_MEMBER(serialized_info_cache_);
 
+  // framework related.
+  CP_MEMBER(enable_runtime_context_cache_);
+
   if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -219,12 +222,23 @@ void AnalysisConfig::Update() {
   }
 
   if (enable_memory_optim_) {
-    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+    auto analysis_passes = pass_builder()->AnalysisPasses();
+    auto memory_opti_pass_name = "memory_optimize_pass";
+    bool already_exists =
+        std::find(analysis_passes.begin(), analysis_passes.end(),
+                  memory_opti_pass_name) != analysis_passes.end();
+    if (!already_exists) {
+      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
+    }
   }
 
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
+
+  if (enable_runtime_context_cache_) {
+    pass_builder()->AppendPass("runtime_context_cache_pass");
+  }
 }
 
 std::string AnalysisConfig::SerializeInfoCache() {
@@ -258,6 +272,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
 
   ss << specify_input_name_;
   ss << cpu_math_library_num_threads_;
+  ss << enable_runtime_context_cache_;
 
   return ss.str();
 }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 9b05c33504..3b7faa5400 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -194,6 +194,23 @@ struct AnalysisConfig {
   /** Tell whether the memory optimization is activated. */
   bool enable_memory_optim() const;
 
+  // framework related
+  /** \brief Control whether to perform runtime context cache optimization.
+   *
+   * If turned off, a RuntimeContext is created on every execution of an Op to
+   * relate the Op's input/output names to the corresponding variables in
+   * Scope.
+   */
+  void SwitchRuntimeContextCache(int x = true) {
+    enable_runtime_context_cache_ = x;
+  }
+  /** Tell whether the runtime context cache optimization is
+   * activated.
+   */
+  bool runtime_context_cache_enabled() const {
+    return enable_runtime_context_cache_;
+  }
+
   friend class ::paddle::AnalysisPredictor;
 
   /** NOTE just for developer, not an official API, easily to be broken.
@@ -254,6 +271,15 @@ struct AnalysisConfig {
 
   int cpu_math_library_num_threads_{1};
 
+  // framework related
+  // RuntimeContext relates the input/output names of an Operator to the
+  // corresponding variables in Scope.
+  // If enable_runtime_context_cache_ is true, then within the same Scope,
+  // because the input/output names of an Op do not change during execution,
+  // the RuntimeContext is created only on the first iteration of that Op's
+  // execution, saving time on later iterations.
+  bool enable_runtime_context_cache_{false};
+
   // A runtime cache, shouldn't be transferred to others.
   std::string serialized_info_cache_;
 
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 8f7b6f31de..d9ac73b063 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -110,7 +110,7 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8)
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL)
 
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 5157bd280d..e1787a7177 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  cfg->SwitchRuntimeContextCache();
   if (FLAGS_zero_copy) {
     cfg->SwitchUseFeedFetchOps(false);
   }
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
index 9d17f38ab7..f765f55611 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -183,10 +183,13 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_Transformer, profile) {
+void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   std::vector<PaddleTensor> outputs;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 }
 
+TEST(Analyzer_Transformer, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_Transformer, fuse_statis) {
   AnalysisConfig cfg;
@@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Transformer, compare) {
+void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index b0c23fbd53..b7b39d4dd4 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -72,7 +72,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
   }
   os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
      << "\n";
-  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
+  os << GenSpaces(num_spaces)
+     << "use_runtime_context_cache: " << config.runtime_context_cache_enabled()
      << "\n";
   os << GenSpaces(num_spaces)
      << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_test.cc
index 802d79e15d..2df1486c91 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -19,6 +19,8 @@
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(gpu_init_memory_in_mb);
+DECLARE_uint64(gpu_reallocate_memory_in_mb);
 DECLARE_int64(gpu_allocator_retry_time);
 #endif
 
@@ -26,13 +28,8 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
+//! Run allocate test cases for different places
+void AllocateTestCases() {
   auto &instance = AllocatorFacade::Instance();
   platform::Place place;
   size_t size = 1024;
@@ -82,6 +79,32 @@ TEST(allocator, allocator) {
 #endif
 }
 
+TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
+TEST(allocator, specify_gpu_memory) {
+#ifdef PADDLE_WITH_CUDA
+  // Set to 0.0 to test FLAGS_gpu_init_memory_in_mb and
+  // FLAGS_gpu_reallocate_memory_in_mb
+  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
+  // 512 MB
+  FLAGS_gpu_init_memory_in_mb = 512;
+  // 4 MB
+  FLAGS_gpu_reallocate_memory_in_mb = 4;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index c233bf4edf..5f30b2221d 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(gpu_init_memory_in_mb);
+DECLARE_uint64(gpu_reallocate_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
@@ -153,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
                                     platform::GpuMinChunkSize(),
                                     platform::GpuMaxChunkSize());
 
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
+      VLOG(10) << "\n\nNOTE:\n"
+               << "You can set GFlags environment variable "
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
+               << "or 'FLAGS_gpu_init_memory_in_mb' "
+               << "or 'FLAGS_gpu_reallocate_memory_in_mb' "
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_gpu_init_memory_in_mb' value is "
+               << FLAGS_gpu_init_memory_in_mb
+               << ". Current 'FLAGS_gpu_reallocate_memory_in_mb' value is "
+               << FLAGS_gpu_reallocate_memory_in_mb << "\n\n";
     }
   });
 
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 26ef27c3ca..2898185a34 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
+
+#include <utility>  // for std::move
+
 #include "glog/logging.h"
 
 DEFINE_bool(free_idle_memory, false,
@@ -185,18 +188,27 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+  size_t allocate_bytes = max_chunk_size_;
+  size_t index = 0;
+
 #ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
+      // Compute the allocation size for gpu for the first allocation.
       max_chunk_size_ = platform::GpuMaxChunkSize();
+      allocate_bytes = platform::GpuInitAllocSize();
+    } else {
+      // Later refills use realloc_size_, queried on first use while still 0.
+      if (realloc_size_ == 0) {
+        realloc_size_ = platform::GpuReallocSize();
+      }
+      allocate_bytes = realloc_size_;
     }
   }
 #endif
 
-  // Allocate a new maximum sized block
-  size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  // Allocate a new block
+  void* p = system_allocator_->Alloc(&index, allocate_bytes);
 
   if (p == nullptr) return pool_.end();
 
@@ -204,7 +216,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
            << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     allocate_bytes, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
@@ -212,10 +224,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += max_chunk_size_;
+  total_free_ += allocate_bytes;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 3f86a51f0d..7a785daba8 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -89,6 +89,8 @@ class BuddyAllocator {
   size_t min_chunk_size_;  // the minimum size of each chunk
   size_t max_chunk_size_;  // the maximum size of each chunk
 
+  size_t realloc_size_ = 0;  // the size of re-allocated chunk
+
  private:
   /**
    * \brief A list of free allocation
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 197d1c2f21..70fa203df7 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -32,6 +32,9 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(gpu_init_memory_in_mb);
+DECLARE_uint64(gpu_reallocate_memory_in_mb);
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING)
-        << "Cannot malloc " << size / 1024.0 / 1024.0
-        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
-           "environment variable to a lower value. Current value is "
-        << FLAGS_fraction_of_gpu_memory_to_use;
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB GPU memory. Please shrink "
+                    "FLAGS_fraction_of_gpu_memory_to_use or "
+                    "FLAGS_gpu_init_memory_in_mb or "
+                    "FLAGS_gpu_reallocate_memory_in_mb "
+                    "environment variable to a lower value. "
+                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
+                 << FLAGS_fraction_of_gpu_memory_to_use
+                 << ". Current FLAGS_gpu_init_memory_in_mb value is "
+                 << FLAGS_gpu_init_memory_in_mb
+                 << ". Current FLAGS_gpu_reallocate_memory_in_mb value is "
+                 << FLAGS_gpu_reallocate_memory_in_mb;
     return nullptr;
   }
 }
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 2f8e0b3a30..651c5e6e75 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -58,8 +58,10 @@ if (WITH_GPU)
         op_library(conv_fusion_op)
         file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
     endif()
-    op_library(sync_batch_norm_op)
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    if (NOT WIN32)
+        op_library(sync_batch_norm_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    endif()
 else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index cf78c83297..4cef49280d 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for
 each source sentence by walking back along the LoDTensorArray Input(ids)
 whose lods can be used to restore the path in the beam search tree.
 
-The Output(SentenceIds) and Output(SentenceScores) separately contain the 
-generated id sequences and the corresponding scores. The shapes and lods of the 
-two LodTensor are same. The lod level is 2 and the two levels separately 
-indicate how many hypotheses each source sentence has and how many ids each 
+The Output(SentenceIds) and Output(SentenceScores) separately contain the
+generated id sequences and the corresponding scores. The shapes and lods of the
+two LodTensor are same. The lod level is 2 and the two levels separately
+indicate how many hypotheses each source sentence has and how many ids each
 hypothesis has.
 )DOC");
   }
@@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
 
 class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    for (auto& o : op_desc.Output("SentenceIds")) {
-      auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
-      sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    for (auto& o : ctx->Output("SentenceIds")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
-    for (auto& o : op_desc.Output("SentenceScores")) {
-      auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
-      sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
+    for (auto& o : ctx->Output("SentenceScores")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index fa6b09b4e7..a6aa35e056 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
 
     AddComment(R"DOC(
-This operator does the search in beams for one time step. 
+This operator does the search in beams for one time step.
 Specifically, it selects the top-K candidate word ids of current step from
 Input(ids) according to their Input(scores) for all source sentences,
 where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
@@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel {
 
 class BeamSearchInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o : op_desc.Output("selected_ids")) {
-      auto &selected_ids = block->FindRecursiveOrCreateVar(o);
-      selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o : ctx->Output("selected_ids")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
-    for (auto &o : op_desc.Output("selected_scores")) {
-      auto &selected_scores = block->FindRecursiveOrCreateVar(o);
-      selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
+    for (auto &o : ctx->Output("selected_scores")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 194f9cf503..5d5ad9e711 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -50,9 +50,19 @@ class ConcatOp : public framework::OperatorWithKernel {
         if (j == axis) {
           out_dims[axis] += ins[i][j];
         } else {
-          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                            "Input tensors should have the same "
-                            "elements except the specify axis.");
+          if (ctx->IsRuntime()) {
+            // Run time: sizes on every non-axis dim must match.
+            PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                              "Input tensors should have the same "
+                              "elements except the specify axis.");
+          } else {
+            // Compile time: only compare sizes when neither one is -1.
+            if (out_dims[j] != -1 && ins[i][j] != -1) {
+              PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                                "Input tensors should have the same "
+                                "elements except the specify axis.");
+            }
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index 1a157688f3..fa77f97419 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -93,11 +93,9 @@ execution.
 
 class GetPlacesInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o_name : op_desc.Output("Out")) {
-      block->FindRecursiveOrCreateVar(o_name).SetType(
-          framework::proto::VarType::PLACE_LIST);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o_name : ctx->Output("Out")) {
+      ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST);
     }
   }
 };
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
index fa18ade323..45f18ac925 100644
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
 
 class WriteToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto x_name = ctx->Input("X")[0];
+    auto out_name = ctx->Output("Out")[0];
     VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
-    auto *x = block->FindVarRecursive(x_name);
-    if (x != nullptr) {
-      out.SetDataType(x->GetDataType());
+    ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY);
+    if (ctx->HasVar(x_name)) {
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
     }
   }
 };
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 8352ba4f2b..deb8ec3bb2 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -365,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
 
 class WhileGradOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto p_names = op_desc.Input(kX);
-    auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto p_names = ctx->Input(kX);
+    auto pg_ig_names = ctx->Output(framework::GradVarName(kX));
 
     for (size_t i = 0; i < p_names.size(); ++i) {
-      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
-      auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
-      if (g_var != nullptr) {  // Gradient could be @EMPTY@
+      if (ctx->HasVar(pg_ig_names[i])) {
         VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-                << " type: " << p_var.GetType();
-        g_var->SetType(p_var.GetType());
-        g_var->SetDataType(p_var.GetDataType());
+                << " type: " << ctx->GetType(p_names[i]);
+        ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i]));
+        ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i]));
       }
     }
   }
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index ca6bc4df0f..c6121d00da 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/conv_op.h"
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index c87837e694..94a2016aa5 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 
 if(WITH_GPU)
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
new file mode 100644
index 0000000000..e0d7e25d94
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -0,0 +1,167 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class YoloBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImgSize"),
+                   "Input(ImgSize) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Boxes"),
+                   "Output(Boxes) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Scores"),
+                   "Output(Scores) of YoloBoxOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_imgsize = ctx->GetInputDim("ImgSize");
+    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    int anchor_num = anchors.size() / 2;
+    auto class_num = ctx->Attrs().Get<int>("class_num");
+
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], anchor_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
+    PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
+                      "Input(ImgSize) should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        dim_imgsize[0], dim_x[0],
+        "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.");
+    PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2.");
+    PADDLE_ENFORCE_GT(anchors.size(), 0,
+                      "Attr(anchors) length should be greater than 0.");
+    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
+                      "Attr(anchors) length should be even integer.");
+    PADDLE_ENFORCE_GT(class_num, 0,
+                      "Attr(class_num) should be an integer greater than 0.");
+
+    int box_num = dim_x[2] * dim_x[3] * anchor_num;
+    std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes));
+
+    std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
+    ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of YoloBox operator is a 4-D tensor with "
+             "shape of [N, C, H, W]. The second dimension(C) stores "
+             "box locations, confidence score and classification one-hot "
+             "keys of each anchor box. Generally, X should be the output "
+             "of YOLOv3 network.");
+    AddInput("ImgSize",
+             "The image size tensor of YoloBox operator, "
+             "This is a 2-D tensor with shape of [N, 2]. This tensor holds "
+             "height and width of each input image used for resizing output "
+             "box in input image scale.");
+    AddOutput("Boxes",
+              "The output tensor of detection boxes of YoloBox operator, "
+              "This is a 3-D tensor with shape of [N, M, 4], N is the "
+              "batch num, M is output box number, and the 3rd dimension "
+              "stores [xmin, ymin, xmax, ymax] coordinates of boxes.");
+    AddOutput("Scores",
+              "The output tensor of detection boxes scores of YoloBox "
+              "operator, This is a 3-D tensor with shape of "
+              "[N, M, :attr:`class_num`], N is the batch num, M is "
+              "output box number.");
+
+    AddAttr<int>("class_num", "The number of classes to predict.");
+    AddAttr<std::vector<int>>("anchors",
+                              "The anchor width and height, "
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YoloBox operator "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and third YoloBox operators.")
+        .SetDefault(32);
+    AddAttr<float>("conf_thresh",
+                   "The confidence scores threshold of detection boxes. "
+                   "Boxes with confidence scores under threshold should "
+                   "be ignored.")
+        .SetDefault(0.01);
+    AddComment(R"DOC(
+         This operator generates YOLO detection boxes from output of YOLOv3 network.
+         
+         The output of the previous network has shape [N, C, H, W], where H and W
+         should be the same and specify the grid size. Each grid point predicts a
+         given number of boxes; this number, denoted S below, is specified by the
+         number of anchors. In the second (channel) dimension, C should be equal
+         to S * (5 + class_num), where class_num is the number of object
+         categories in the source dataset (such as 80 in the COCO dataset). So,
+         apart from the 4 box location coordinates x, y, w, h, the channel
+         dimension also includes the confidence score of the box and the class
+         one-hot key of each anchor box.
+
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box 
+         predictions should be as follows:
+
+         $$
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
+         b_w = p_w e^{t_w}
+         $$
+         $$
+         b_h = p_h e^{t_h}
+         $$
+
+         in the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.
+
+         The logistic regression value of the 5th channel of each anchor prediction box
+         represents the confidence score of that prediction box, and the logistic
+         regression values of the last :attr:`class_num` channels of each anchor prediction
+         box represent the classification scores. Boxes with confidence scores less than
+         :attr:`conf_thresh` should be ignored, and the final score of a box is the product
+         of its confidence score and classification scores.
+
+         $$
+         score_{pred} = score_{conf} * score_{class}
+         $$
+
+         )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
+                       ops::YoloBoxKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
new file mode 100644
index 0000000000..5a882958e6
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
@@ -0,0 +1,120 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
+                            T* scores, const float conf_thresh,
+                            const int* anchors, const int n, const int h,
+                            const int w, const int an_num, const int class_num,
+                            const int box_num, int input_size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  T box[4];
+  for (; tid < n * box_num; tid += stride) {
+    int grid_num = h * w;
+    int i = tid / box_num;
+    int j = (tid % box_num) / grid_num;
+    int k = (tid % grid_num) / w;
+    int l = tid % w;
+
+    int an_stride = (5 + class_num) * grid_num;
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
+
+    int obj_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
+    T conf = sigmoid<T>(input[obj_idx]);
+    if (conf < conf_thresh) {
+      continue;
+    }
+
+    int box_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0);
+    GetYoloBox<T>(box, input, anchors, l, k, j, h, input_size, box_idx,
+                  grid_num, img_height, img_width);
+    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
+    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width);
+
+    int label_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5);
+    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
+    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
+                      grid_num);
+  }
+}
+
+template <typename T>
+class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* img_size = ctx.Input<Tensor>("ImgSize");
+    auto* boxes = ctx.Output<Tensor>("Boxes");
+    auto* scores = ctx.Output<Tensor>("Scores");
+
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int box_num = boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    int input_size = downsample_ratio * h;
+
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    const size_t bytes = sizeof(int) * anchors.size();  // anchor buffer size
+    auto anchors_ptr = allocator.Allocate(bytes);
+    int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    const auto cplace = platform::CPUPlace();
+    memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes,
+                 dev_ctx.stream());
+
+    const T* input_data = input->data<T>();
+    const int* imgsize_data = img_size->data<int>();
+    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
+    T* scores_data =
+        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(dev_ctx, boxes, static_cast<T>(0));
+    set_zero(dev_ctx, scores, static_cast<T>(0));
+
+    int grid_dim = (n * box_num + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    KeYoloBoxFw<T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+        input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
+        anchors_data, n, h, w, an_num, class_num, box_num, input_size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
+                        ops::YoloBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h
new file mode 100644
index 0000000000..8b7c7df0f3
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolo_box_op.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+HOSTDEVICE inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
+                                  int j, int an_idx, int grid_size,
+                                  int input_size, int index, int stride,
+                                  int img_height, int img_width) {
+  box[0] = (i + sigmoid<T>(x[index])) * img_width / grid_size;
+  box[1] = (j + sigmoid<T>(x[index + stride])) * img_height / grid_size;
+  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
+           input_size;
+  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
+           img_height / input_size;
+}
+
+HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
+                                    int an_num, int an_stride, int stride,
+                                    int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
+                                        const int img_height,
+                                        const int img_width) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
+  boxes[box_idx + 1] =
+      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
+  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                           ? boxes[box_idx + 2]
+                           : static_cast<T>(img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                           ? boxes[box_idx + 3]
+                           : static_cast<T>(img_height - 1);
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
+                                      const int label_idx, const int score_idx,
+                                      const int class_num, const T conf,
+                                      const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
+  }
+}
+
+template <typename T>
+class YoloBoxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* imgsize = ctx.Input<Tensor>("ImgSize");
+    auto* boxes = ctx.Output<Tensor>("Boxes");
+    auto* scores = ctx.Output<Tensor>("Scores");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int box_num = boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    Tensor anchors_;
+    auto anchors_data =
+        anchors_.mutable_data<int>({an_num * 2}, ctx.GetPlace());
+    std::copy(anchors.begin(), anchors.end(), anchors_data);
+
+    const T* input_data = input->data<T>();
+    const int* imgsize_data = imgsize->data<int>();
+    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
+    memset(boxes_data, 0, boxes->numel() * sizeof(T));
+    T* scores_data =
+        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
+    memset(scores_data, 0, scores->numel() * sizeof(T));
+
+    T box[4];
+    for (int i = 0; i < n; i++) {
+      int img_height = imgsize_data[2 * i];
+      int img_width = imgsize_data[2 * i + 1];
+
+      for (int j = 0; j < an_num; j++) {
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            int obj_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4);
+            T conf = sigmoid<T>(input_data[obj_idx]);
+            if (conf < conf_thresh) {
+              continue;
+            }
+
+            int box_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0);
+            GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, input_size,
+                          box_idx, stride, img_height, img_width);
+            box_idx = (i * box_num + j * stride + k * w + l) * 4;
+            CalcDetectionBox<T>(boxes_data, box, box_idx, img_height,
+                                img_width);
+
+            int label_idx =
+                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5);
+            int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
+            CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
+                              class_num, conf, stride);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
index 28ebdcb03e..5ee35e0458 100644
--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -56,8 +56,8 @@ class FakeInitOp : public framework::OperatorBase {
 
 class FakeInitOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  // No-op: the empty body leaves every variable type untouched.
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
index da0185b8c4..1b0b4dd316 100644
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
@@ -114,11 +114,11 @@ class MergeIdsOp : public framework::OperatorWithKernel {
 
 class MergeIdsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(input_var->GetType());
+  // Gives every "Out" variable the variable type of the first "Ids" input.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, input_type);
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
index f61d387fbe..191ca1efe8 100644
--- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
@@ -71,11 +73,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
 
 class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(input_var->GetType());
+  // Gives every "Out" variable the variable type of the first "Ids" input.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, input_type);
     }
   }
 };
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index eb4617a935..242f5390b8 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
       "The input tensor Input's rank of FCOp should be larger than "
       "in_num_col_dims.");
 
-  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
-  PADDLE_ENFORCE_EQ(
-      in_mat_dims[1], w_dims[0],
-      "Fully Connected input and weigth size do not match. %s, %s");
-
   std::vector<int64_t> output_dims;
-  output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
-  for (int i = 0; i < in_num_col_dims; ++i) {
-    output_dims.push_back(in_dims[i]);
-  }
-  output_dims.push_back(w_dims[1]);
+  FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
 
   ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
   ctx->ShareLoD("Input", "Out");
@@ -128,6 +119,9 @@ void FCOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                "Skip calling InferShape() function in the runtime.")
+      .SetDefault(true);
   AddComment(R"DOC(
   Fully Connected Operator.
 
@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel<T> {
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
     auto w_dims = w->dims();
+
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
+
     auto out_dims = output->dims();
-    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
+    int M = framework::product(out_dims) / w_dims[1];
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index e1b780fc0c..b82a63cd83 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -48,5 +48,29 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override;
 };
 
+// Computes the output shape of the FC operator and appends it to `out_dims`.
+//
+// `in_dims` is flattened to 2-D at `in_num_col_dims`; the second dimension of
+// that matrix must equal `w_dims[0]`, otherwise PADDLE_ENFORCE_EQ fails. The
+// result is the first `in_num_col_dims` entries of `in_dims` followed by
+// `w_dims[1]`. Entries are appended, so `out_dims` should be passed in empty.
+inline void FCOutputSize(const framework::DDim& in_dims,
+                         const framework::DDim& w_dims,
+                         std::vector<int64_t>& out_dims,  // NOLINT
+                         int in_num_col_dims) {
+  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
+  // The message carries two %s placeholders, so both shapes are passed along.
+  PADDLE_ENFORCE_EQ(
+      in_mat_dims[1], w_dims[0],
+      "Fully Connected input and weight size do not match. %s, %s",
+      in_mat_dims, w_dims);
+
+  out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
+  for (int i = 0; i < in_num_col_dims; ++i) {
+    out_dims.push_back(in_dims[i]);
+  }
+  out_dims.push_back(w_dims[1]);
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index c86430524e..cf2f4776cf 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -39,12 +39,12 @@ class FillConstantOp : public framework::OperatorWithKernel {
 
 class FillConstantOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
+  // Sets the data type of the first "Out" variable from the "dtype" attribute.
+  void operator()(framework::InferVarTypeContext* ctx) const override {
     auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
-    auto& out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetDataType(data_type);
+        boost::get<int>(ctx->GetAttr("dtype")));
+    auto& out_var_name = ctx->Output("Out").front();
+    ctx->SetDataType(out_var_name, data_type);
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
index a0026427e2..9cc94ab88d 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -88,7 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
     AddComment(R"DOC(
 FusedEmbeddingSeqPool Operator.
@@ -137,22 +138,23 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 class FusedEmbeddingSeqPoolOpGradVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+  // Types the GradVarName("W") output as SELECTED_ROWS when the "is_sparse"
+  // attribute is true and as LOD_TENSOR otherwise, then copies the data type
+  // of the first "W" input to it.
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "fused_embedding_seq_pool_grad op "
               << framework::GradVarName("W") << " is set to SelectedRows";
-      block->Var(out_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "fused_embedding_seq_pool_grad op "
               << framework::GradVarName("W") << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
index a4ae19d9c1..c0893359af 100644
--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
@@ -81,15 +81,14 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
 class GetTensorFromSelectedRowsOpVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const final {
-    auto out_var_name = op_desc.Output("Out").front();
-    auto in_var_name = op_desc.Input("X").front();
-
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
-    out_var.SetType(framework::proto::VarType::LOD_TENSOR);
-    out_var.SetDataType(in_var.GetDataType());
+  // Declares the first "Out" variable as a LOD_TENSOR that carries the data
+  // type of the first "X" input.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output("Out").front();
+    auto in_var_name = ctx->Input("X").front();
+
+    ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
   }
 };
 
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index f6395fb32f..82222d0a7e 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -54,7 +54,8 @@ $$Out = scale * X$$
 )DOC");
     AddAttr<int>("num_hash", "").SetDefault(1);
     AddAttr<int>("mod_by", "").SetDefault(100000);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
   }
 };
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 6ca6f0bc04..d0e1057c43 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -197,38 +197,36 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 class HierarchicalSigmoidGradOpGradVarTypeInference
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto bias_grad_var_name_vec =
-        op_desc.Output(framework::GradVarName("Bias"));
+  // Types the GradVarName("W") output as SELECTED_ROWS when the "is_sparse"
+  // attribute is true and as LOD_TENSOR otherwise, types the
+  // GradVarName("Bias") output as LOD_TENSOR when one is present, and copies
+  // the data type of the first "W" input to the GradVarName("W") output.
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias"));
     std::string bias_grad_var_name;
     bool hasBias = false;
     if (bias_grad_var_name_vec.size()) {
       hasBias = true;
-      bias_grad_var_name =
-          op_desc.Output(framework::GradVarName("Bias")).front();
+      bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front();
     }
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
                << " is set to SelectedRows";
-      block->Var(w_grad_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
                << " is set to LoDTensor";
-      block->Var(w_grad_var_name)
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
     if (hasBias) {
       VLOG(30) << "hierarchical_sigmoid_grad op "
                << framework::GradVarName("Bias") << " is set to LoDTensor";
-      block->Var(bias_grad_var_name)
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 166952fe23..0a43ac0c52 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -64,11 +64,10 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
 
 class LoDRankTableInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o : op_desc.Output("Out")) {
-      block->FindRecursiveOrCreateVar(o).SetType(
-          framework::proto::VarType::LOD_RANK_TABLE);
+  // Marks every "Out" variable as a LOD_RANK_TABLE.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o : ctx->Output("Out")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 9b91cf5260..61e3427370 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -201,10 +201,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
 
 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  // Marks every "Out" variable as a LOD_TENSOR_ARRAY.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
     }
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 0029932bc0..8d1ebe6b1c 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -147,22 +147,23 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 
 class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+  // Types the GradVarName("W") output as SELECTED_ROWS when the "is_sparse"
+  // attribute is true and as LOD_TENSOR otherwise, then copies the data type
+  // of the first "W" input to it.
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 14ca3e8073..8d96ae7e42 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
                                                  &dst_memory_p);
         } else {
+          need_s8_to_u8 = fuse_relu;
           platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
                                                 &dst_memory_p);
         }
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 3a926a716f..69c0486eb6 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
 
@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
 
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
+
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
     auto dst_memory = mem.dst(output_data);
@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
 
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const T* input_data = input->data<T>();
+
+    const Tensor* w = ctx.Input<Tensor>("W");
+    const T* w_data = w->data<T>();
+
     if (input_grad) {
+      input_grad->Resize(input->dims());
       input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
     }
     if (w_grad) {
+      w_grad->Resize(w->dims());
       w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
     }
 
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const T* input_data = input->data<T>();
-
-    const Tensor* w = ctx.Input<Tensor>("W");
-    const T* w_data = w->data<T>();
-
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     const T* out_grad_data = out_grad->data<T>();
 
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index e41bfb80df..4debc7ca5e 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -73,6 +73,33 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 };
 
+// Kernel registered below for the uint8_t and int8_t variants of transpose2.
+// Compute performs no reordering itself: it calls ShareDataWith on "X" and
+// tags "Out" with the MKLDNN layout and the format of "X".
+template <typename T>
+class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> axis_int8 = {0, 2, 3, 1};
+    // Unless `axis` holds exactly one element it must equal {0, 2, 3, 1}.
+    if (axis.size() != 1) {
+      PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size());
+      for (size_t i = 0; i < axis.size(); i++) {
+        PADDLE_ENFORCE_EQ(axis[i], axis_int8[i],
+                          "Current INT8 MKLDNN Transpose kernel only surpport "
+                          "axis with [0, 2, 3, 1] due to MKL-DNN kernel "
+                          "implementation.");
+      }
+    }
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    output->ShareDataWith(*input);
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(input->format());
+  }
+};
+
 template <typename T>
 class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -140,7 +167,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>);
+                   ops::TransposeMKLDNNOpKernel<float>,
+                   ops::TransposeINT8MKLDNNOpKernel<uint8_t>,
+                   ops::TransposeINT8MKLDNNOpKernel<int8_t>);
+
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNOpKernel<float>);
 
diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc
index 0018139cb0..6a0ae0dede 100644
--- a/paddle/fluid/operators/nccl/nccl_op.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cc
@@ -60,12 +60,10 @@ class NCCLInitOp : public framework::OperatorBase {
 
 class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Communicator").front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+  // Marks the first "Communicator" output variable as RAW.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output("Communicator").front();
+    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
   }
 };
 
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 256da34912..fa7cc58c08 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -237,23 +237,24 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 
 class NCEOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
+  // Types the GradVarName("Weight") output as SELECTED_ROWS when the
+  // "is_sparse" attribute is true and as LOD_TENSOR otherwise, then copies
+  // the data type of the first "Input" input to it.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front();
 
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "nce_op_grad op " << weight_grad << " and "
               << " is set to SelectedRows";
-      block->Var(weight_grad)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "nce_op_grad op " << weight_grad << " and "
               << " is set to LoDTensor";
-      block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
+    ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0]));
   }
 };
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
index f941f917c8..479c95ba08 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -37,8 +37,8 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 
 class NgraphEngineInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  // No-op: the empty body leaves every variable type untouched.
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 09255f60e6..6262ef0c2d 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
 #include <Eigen/Dense>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -311,17 +312,17 @@ struct SparseAdamFunctor<T, CPUAdam> {
     T beta1_pow = *beta1_pow_;
     T beta2_pow = *beta2_pow_;
     lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-    size_t row_count = numel / row_numel_;
+    int64_t row_count = static_cast<int64_t>(numel / row_numel_);
 
-    for (size_t i = 0U, j = 0U; i != row_count; ++i) {
+    for (int64_t i = 0, j = 0; i != row_count; ++i) {
       if (i == *(rows_ + j)) {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
           T g = grad_[j * row_numel_ + k];
           adam_update(i * row_numel_ + k, g);
         }
         ++j;
       } else {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
           T mom1 = moment1_[i * row_numel_ + k];
           T mom2 = moment2_[i * row_numel_ + k];
           T p = param_[i * row_numel_ + k];
@@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
         }
       }
 
-      framework::SelectedRows cpu_grad_merge;
+      framework::SelectedRows tmp_grad_merge;
       const framework::SelectedRows* grad_merge_ptr;
       if (is_strict_sorted) {
         grad_merge_ptr = &grad;
       } else {
         // merge duplicated rows if any.
         // The rows of grad_merge have been sorted inside MergeAdd functor
-        framework::SelectedRows* grad_merge_var;
         scatter::MergeAdd<DeviceContext, T> merge_func;
-        if (platform::is_cpu_place(ctx.GetPlace())) {
-          grad_merge_var = &cpu_grad_merge;
-        } else {
-          // FIXME(qiao): GPU also need to fix this
-          grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
-                               .Var()
-                               ->GetMutable<framework::SelectedRows>();
-        }
         merge_func(ctx.template device_context<DeviceContext>(), grad,
-                   grad_merge_var, true);
-        grad_merge_ptr = grad_merge_var;
+                   &tmp_grad_merge, true);
+        grad_merge_ptr = &tmp_grad_merge;
       }
 
       auto& grad_merge = *grad_merge_ptr;
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
-      const int64_t* rows = nullptr;
-// When compiled without CUDA, the CUDAData() interface should not be
-// provided.
-#if defined(PADDLE_WITH_CUDA)
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = grad_merge.rows().data();
-#if defined(PADDLE_WITH_CUDA)
-      }
-#endif
+      const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       if (platform::is_cpu_place(ctx.GetPlace())) {
@@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
           }
         }
 #ifndef _WIN32
-        else if (FLAGS_inner_op_parallelism > 1 &&
+        else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
                  min_row_size_to_use_multithread > 0 &&
                  param.dims()[0] > min_row_size_to_use_multithread) {
           VLOG(3) << "use multi thread, inner_op_parallelism="
@@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
           for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
             int64_t start = i * line_in_each_thread;
             int64_t end = (i + 1) * line_in_each_thread;
-            if (start >= param_row_count) {
+            if (start >= static_cast<int64_t>(param_row_count)) {
               break;
             }
-            if (end > param_row_count) {
-              end = param_row_count;
+            if (end > static_cast<int64_t>(param_row_count)) {
+              end = static_cast<int64_t>(param_row_count);
             }
             fs.push_back(
                 framework::Async([&functor, &row_id_to_grad_row_offset,
@@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
           }
           for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
         }
-#endif  // !_WIN32
-        else {
+#endif          // !_WIN32
+        else {  // NOLINT
           functor(param.numel());
         }
       } else if (platform::is_gpu_place(ctx.GetPlace())) {
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
index 574a03680b..126b665dd4 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
 weight using a local learning rate:
 
 $$
-local\_lr = \eta  * 
+local\_lr = \eta  *
     \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
-velocity = mu * velocity + 
+velocity = mu * velocity +
     local\_lr * (grad + \beta * param) \\
 param = param - velocity. \\
 $$
@@ -72,8 +72,8 @@ use L2 regularizers in case of using LARS.
 
 class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  // No-op: the empty body leaves every variable type untouched.
+  void operator()(framework::InferVarTypeContext* ctx) const override {}
 };
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc
index cde238c076..7cf218c20f 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op.cc
@@ -21,18 +21,14 @@ using Tensor = framework::Tensor;
 
 class MomentumOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto input_var = op_desc.Input("Param")[0];
-    for (auto& out_var : op_desc.Output("ParamOut")) {
-      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
-          framework::proto::VarType::SELECTED_ROWS) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::SELECTED_ROWS);
-      } else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& input_var = ctx->Input("Param")[0];
+    for (auto& out_var : ctx->Output("ParamOut")) {
+      if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) {
+        ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
+      } else if (ctx->GetType(input_var) ==
                  framework::proto::VarType::LOD_TENSOR) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::LOD_TENSOR);
+        ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR);
       } else {
         PADDLE_THROW(
             "Only support LodTensor and SelectedRows, Unexpected Input Type.");
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 3ed1bff5ff..29a2ae6755 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
@@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel<T> {
         VLOG(3) << "Grad SelectedRows contains no data!";
         return;
       }
-      auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
-                              .Var()
-                              ->GetMutable<framework::SelectedRows>();
+
+      framework::SelectedRows tmp_merged_grad;
+      framework::SelectedRows* merged_grad = &tmp_merged_grad;
       math::scatter::MergeAdd<DeviceContext, T> merge_func;
       merge_func(ctx.template device_context<DeviceContext>(), *grad,
                  merged_grad);
 
-      const int64_t* rows = nullptr;
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
+      const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
       int64_t row_numel =
           merged_grad->value().numel() / merged_grad->rows().size();
       platform::ForRange<DeviceContext> for_range(
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
index 389c84d246..4550052b2d 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel<T> {
       }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto &grad = grad_var->Get<framework::SelectedRows>();
-      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
-                              .Var()
-                              ->GetMutable<framework::SelectedRows>();
-
+      framework::SelectedRows tmp_merged_grad;
+      framework::SelectedRows *merged_grad = &tmp_merged_grad;
       math::scatter::MergeAdd<DeviceContext, T> merge_func;
       merge_func(dev_ctx, grad, merged_grad);
 
       platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-      const int64_t *rows;
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
+      const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
+
       auto &merged_tensor = merged_grad->value();
       int64_t row_count = merged_grad->rows().size();
       int64_t row_numel = merged_tensor.numel() / row_count;
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index 690381a67f..34e99a14ff 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel {
 
 class SGDOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto input_var_n = op_desc.Input("Param")[0];
-    auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &input_var_n = ctx->Input("Param")[0];
+    auto in_var_type = ctx->GetType(input_var_n);
     PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
                        in_var_type == framework::proto::VarType::LOD_TENSOR,
                    "The input Var's type should be LoDtensor or SelectedRows,"
                    " but the received var(%s)'s type is %s",
                    input_var_n, in_var_type);
 
-    for (auto &out_var_n : op_desc.Output("ParamOut")) {
-      auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
-      if (out_var.GetType() != in_var_type) {
-        out_var.SetType(in_var_type);
+    for (auto &out_var_n : ctx->Output("ParamOut")) {
+      if (ctx->GetType(out_var_n) != in_var_type) {
+        ctx->SetType(out_var_n, in_var_type);
       }
     }
   }
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 0a0ece162c..7963c27a01 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pool_op.h"
+#include <unordered_map>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
@@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 53eff2de3e..5300e80747 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -14,8 +14,11 @@
 
 #include "paddle/fluid/operators/py_func_op.h"
 
+#include <memory>
 #include <set>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable,
   }
 }
 
-class PyFuncOpVarTypInference : public framework::VarTypeInference {
+class PyFuncOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op,
-                  framework::BlockDesc *block) const override {
-    auto &outs = op.Outputs();
-    bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty());
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty());
 
-    auto &ins = op.Inputs();
-    bool has_in = (ins.count("X") > 0 && !ins.at("X").empty());
+    bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty());
 
     /**
      * X or Out can be empty, so that py_func can be more flexible
@@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
      */
     PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
 
-    PADDLE_ENFORCE_GE(boost::get<int>(op.GetAttr(kForwardPythonCallableId)), 0,
-                      "Function id cannot be less than 0");
+    PADDLE_ENFORCE_GE(boost::get<int>(ctx->GetAttr(kForwardPythonCallableId)),
+                      0, "Function id cannot be less than 0");
 
     if (!has_out) return;
 
@@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
      * the corresponding forward variable
      */
     const std::string kGradVarSuffix = framework::kGradVarSuffix;
-    auto &out_var_names = outs.at("Out");
+    auto &out_var_names = ctx->Output("Out");
     for (auto &out_var_name : out_var_names) {
       if (out_var_name == framework::kEmptyVarName ||
           out_var_name.size() < kGradVarSuffix.size()) {
@@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
       size_t len = out_var_name.size() - kGradVarSuffix.size();
       if (out_var_name.substr(len) == kGradVarSuffix) {
         auto fwd_var_name = out_var_name.substr(0, len);
-        auto *out_var_desc = block->FindVarRecursive(out_var_name);
-        auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name);
-        PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found",
-                                out_var_name);
-        PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found",
-                                fwd_var_name);
+        PADDLE_ENFORCE(ctx->HasVar(out_var_name),
+                       "Backward variable %s not found", out_var_name);
+        PADDLE_ENFORCE(ctx->HasVar(fwd_var_name),
+                       "Forward variable %s not found", fwd_var_name);
         VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
                  << fwd_var_name << ")";
-        out_var_desc->SetShape(fwd_var_desc->GetShape());
-        out_var_desc->SetDataType(fwd_var_desc->GetDataType());
-        out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel());
-        out_var_desc->SetType(fwd_var_desc->GetType());
+
+        ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name));
+        ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name));
+        ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name));
+        ctx->SetType(out_var_name, ctx->GetType(fwd_var_name));
       }
     }
   }
@@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
-                  ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference,
+                  ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference,
                   ops::PyFuncOpGradDescMaker);
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
index 85394b336f..fdc7b0f6a0 100644
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
     AddComment(R"DOC(
       CreateCustomReader Operator
 
-      A custom reader can be used for input data preprocessing. 
-      A custom reader holds its own sub-block, which will be executed in CPU 
-      in its 'ReadNext()' function. Users can configurate their own 
-      preprocessing pipelines by inserting operators into custom reader's 
+      A custom reader can be used for input data preprocessing.
+      A custom reader holds its own sub-block, which will be executed on CPU
+      in its 'ReadNext()' function. Users can configure their own
+      preprocessing pipelines by inserting operators into custom reader's
       sub-block.
     )DOC");
   }
@@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase {
 
 class CustomReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
-    PADDLE_ENFORCE_NOT_NULL(out_reader);
-    out_reader->SetType(framework::proto::VarType::READER);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& out_var_name = ctx->Output("Out")[0];
+    PADDLE_ENFORCE(ctx->HasVar(out_var_name));
+    ctx->SetType(out_var_name, framework::proto::VarType::READER);
 
     auto sink_var_names =
-        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+        boost::get<std::vector<std::string>>(ctx->GetAttr("sink_var_names"));
     const auto* sub_block =
-        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+        boost::get<framework::BlockDesc*>(ctx->GetAttr("sub_block"));
     std::vector<framework::proto::VarType::Type> res_data_types;
     for (const std::string& var_name : sink_var_names) {
       framework::VarDesc* var = sub_block->FindVar(var_name);
       PADDLE_ENFORCE_NOT_NULL(var);
       res_data_types.emplace_back(var->GetDataType());
     }
-    out_reader->SetDataTypes(res_data_types);
+    ctx->SetDataTypes(out_var_name, res_data_types);
   }
 };
 
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 846b2ed77e..33a69ad5fe 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase {
 
 class ReadInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    bool infer_out = boost::get<bool>(ctx->GetAttr("infer_out"));
     if (infer_out) {
-      std::string reader_name = op_desc.Input("Reader")[0];
-      std::vector<std::string> out_names = op_desc.Output("Out");
-      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-      auto dtypes = reader->GetDataTypes();
+      std::string reader_name = ctx->Input("Reader")[0];
+      std::vector<std::string> out_names = ctx->Output("Out");
+      auto dtypes = ctx->GetDataTypes(reader_name);
       PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
       for (size_t i = 0; i < dtypes.size(); ++i) {
-        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
-        out.SetType(framework::proto::VarType::LOD_TENSOR);
-        out.SetDataType(dtypes[i]);
+        ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR);
+        ctx->SetDataType(out_names[i], dtypes[i]);
       }
     }
   }
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 3921eedf94..64a1f6b687 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
   }
 }
 
-void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
-                                        framework::BlockDesc* block) const {
-  std::string reader_name = op_desc.Output("Out")[0];
-  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-  reader->SetType(framework::proto::VarType::READER);
+void FileReaderInferVarType::operator()(
+    framework::InferVarTypeContext* ctx) const {
+  std::string reader_name = ctx->Output("Out")[0];
+  ctx->SetType(reader_name, framework::proto::VarType::READER);
 }
 
 void DecoratedReaderInferShape::operator()(
@@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()(
 }
 
 void DecoratedReaderInferVarType::operator()(
-    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
-  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
-  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
-  std::string out_reader_name = op_desc.Output("Out")[0];
-  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
-  out_reader->SetType(framework::proto::VarType::READER);
-  out_reader->SetDataTypes(in_reader->GetDataTypes());
+    framework::InferVarTypeContext* ctx) const {
+  const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0];
+  const std::string& out_reader_name = ctx->Output("Out")[0];
+  ctx->SetType(out_reader_name, framework::proto::VarType::READER);
+  ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name));
 }
 
 void DecoratedReaderMakerBase::Make() {
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index 25c3e7d77b..795a580605 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -14,7 +14,9 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
@@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase {
 
 class FileReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override;
+  void operator()(framework::InferVarTypeContext* ctx) const override;
 };
 
 // general infershape for decorated reader
@@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase {
 // general var type inference for decorated reader
 class DecoratedReaderInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override;
+  void operator()(framework::InferVarTypeContext* ctx) const override;
 };
 
 class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index fcc598f4f1..b02c098099 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -159,12 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
 
 class SaveOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front();
+    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
   }
 };
 
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 4ea77ed30d..4e4a015e18 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
 
+#include <memory>
 #include <string>
 
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$
 
 class ScaleOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto &in_var_name = op_desc.Input("X").front();
-    auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
-
-    auto out_var_name = op_desc.Output("Out").front();
-    auto *out_var = block->FindVarRecursive(out_var_name);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &in_var_name = ctx->Input("X").front();
+    auto out_var_name = ctx->Output("Out").front();
 
     if (in_var_name != out_var_name) {
-      out_var->SetType(in_var.GetType());
-      out_var->SetDataType(in_var.GetDataType());
+      ctx->SetType(out_var_name, ctx->GetType(in_var_name));
+      ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
     }
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index f357c9c08d..cc4eedbf4d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -30,13 +30,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
         "Output(X) of SequenceEnumerate operator should not be null.");
 
     const auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        x_dims.size(), 2,
-        "Input(X) of SequenceEnumerate operator's rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1,
-                      "Input(X) of SequenceEnumerate operator's 2nd "
-                      "dimension should be 1.");
-
     const auto win_size = ctx->Attrs().Get<int>("win_size");
     ctx->SetOutputDim("Out", {x_dims[0], win_size});
     ctx->ShareLoD("X", "Out");
@@ -59,7 +52,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
         });
     AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
         .SetDefault(0);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
     AddComment(R"DOC(
 Sequence Enumerate Operator.
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
index 18da69993b..6a1eb6e625 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
@@ -27,30 +27,47 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
     int win_size = context.Attr<int>("win_size");
-    int pad_value = context.Attr<int>("pad_value");
+    auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
 
     auto in_dims = in->dims();
-    auto in_lod = in->lod();
-
+    auto lod0 = in->lod()[0];
     PADDLE_ENFORCE_EQ(
-        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        static_cast<uint64_t>(in_dims[0]), lod0.back(),
         "The actual input data's size mismatched with LoD information.");
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), 2UL,
+        "Input(X) of SequenceEnumerate operator's rank should be 2.");
+    PADDLE_ENFORCE_EQ(in_dims[1], 1,
+                      "Input(X) of SequenceEnumerate operator's 2nd "
+                      "dimension should be 1.");
 
     // Generate enumerate sequence set
-    auto lod0 = in_lod[0];
     auto in_data = in->data<T>();
     out->Resize({in_dims[0], win_size});
+    out->set_lod(in->lod());
     auto out_data = out->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < lod0.size() - 1; ++i) {
-      for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
-        for (int word_idx = 0; word_idx < win_size; ++word_idx) {
-          size_t word_pos = idx + word_idx;
-          out_data[win_size * idx + word_idx] =
-              word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
+      int start = lod0[i];
+      int end = lod0[i + 1];
+      int copy_size = win_size < end - start + 1 ? win_size : end - start + 1;
+      int mid = end + 1 - copy_size;
+      int pad_num = win_size - copy_size;
+      copy_size *= sizeof(T);
+      for (int idx = start; idx < mid; ++idx) {
+        std::memcpy(out_data, in_data + idx, copy_size);
+        out_data += win_size;
+      }
+      for (int idx = mid; idx < end; ++idx) {
+        copy_size -= sizeof(T);
+        pad_num++;
+        std::memcpy(out_data, in_data + idx, copy_size);
+        T* pdata = out_data + copy_size / sizeof(T);
+        for (int i = 0; i < pad_num; ++i) {
+          pdata[i] = pad_value;
         }
+        out_data += win_size;
       }
     }
-    out->set_lod(in->lod());
   }
 };
 
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
index 1af57b89a3..24a564f9ef 100644
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
                         paddle::platform::float16* d_in) {
   int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
   if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
     int coords[D] = {0};
     for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx % out_dims[i];
-      out_idx /= out_dims[i];
+      coords[i] = out_idx_tmp % out_dims[i];
+      out_idx_tmp /= out_dims[i];
       coords[i] += offsets[i];
     }
 
     int64_t in_idx = 0;
-    for (int i = 0; i < D - 1; ++i) {
-      in_idx += coords[i] * in_dims[i + 1];
+    for (int i = 0; i < D; ++i) {
+      in_idx = in_idx * in_dims[i] + coords[i];
     }
-    in_idx += coords[D - 1];
 
     d_in[in_idx] = d_out[out_idx];
   }
@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
     set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
 
     int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
     auto stream = ctx.cuda_device_context().stream();
 
     auto out_shape = framework::vectorize2int(out_dims);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 52b8dcc681..89aaac4cbe 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+    framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
+                          context.device_context(), logit_grad);
     T* logit_grad_data = logit_grad->data<T>();
 
     const int batch_size = logit_grad->dims()[0];
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index 0e7b1463d1..88dfebc0cf 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/split_selected_rows_op.h"
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
@@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
 
 class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
     }
   }
 };
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index e389c6a65e..ecfb4e8956 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
   }
 };
 
+// TODO(paddle-dev): Should use OpKernel.
 class SqueezeOp : public framework::OperatorBase {
  public:
   using OperatorBase::OperatorBase;
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 7abfbbd3cb..1391148ccf 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/sum_op.h"
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -159,24 +160,20 @@ the LoD information with the first input.
 
 class SumOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto& inputs = op_desc.Input("X");
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& inputs = ctx->Input("X");
     auto var_type = framework::proto::VarType::SELECTED_ROWS;
-    for (auto& name : op_desc.Input("X")) {
-      VLOG(10) << name << " "
-               << block->FindRecursiveOrCreateVar(name).GetType();
+    for (auto& name : ctx->Input("X")) {
+      VLOG(10) << name << " " << ctx->GetType(name);
     }
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string& name) {
-          return block->FindRecursiveOrCreateVar(name).GetType() ==
-                 framework::proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [ctx](const std::string& name) {
+          return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR;
         });
 
-    auto is_tensor_array = [block](const std::string& name) {
-      return block->FindRecursiveOrCreateVar(name).GetType() ==
-             framework::proto::VarType::LOD_TENSOR_ARRAY;
+    auto is_tensor_array = [ctx](const std::string& name) {
+      return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY;
     };
 
     bool any_input_is_tensor_array =
@@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
       if (!all_inputs_are_tensor_array) {
         std::ostringstream os;
         for (auto& each : inputs) {
-          os << "    " << each << " type is "
-             << block->FindRecursiveOrCreateVar(each).GetType() << "\n";
+          os << "    " << each << " type is " << ctx->GetType(each) << "\n";
         }
         PADDLE_ENFORCE(all_inputs_are_tensor_array,
                        "Not all inputs are tensor array:\n%s", os.str());
@@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
       var_type = framework::proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    out_var.SetType(var_type);
-    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
-    out_var.SetDataType(in_var.GetDataType());
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, var_type);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front()));
   }
 };
 
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 58a74ec2c1..2b83c42f20 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
 class LoDTensorArray2TensorGradInferVarType
     : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output(framework::GradVarName("X"))) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
     }
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index a8c86de9f9..6cf3e65e00 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 
 class TensorRTEngineInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index e3132ae76f..bb6a1c5b16 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max].
 
 class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Out").front();
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto out_var_name = ctx->Output("Out").front();
     auto var_data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
+        boost::get<int>(ctx->GetAttr("dtype")));
 
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) {
-      out_var.SetType(framework::proto::VarType::LOD_TENSOR);
+    if (ctx->GetType(out_var_name) !=
+        framework::proto::VarType::SELECTED_ROWS) {
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    out_var.SetDataType(var_data_type);
+    ctx->SetDataType(out_var_name, var_data_type);
   }
 };
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index ada9a19736..d54a3e8670 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_stream_.reset();
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+#if !defined(_WIN32)
   PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+#endif
 }
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 3f7ce3d944..1eb8d9691a 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -265,11 +265,13 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
+#if !defined(_WIN32)
   /*! \brief  Return nccl communicators. */
   ncclComm_t nccl_comm() const { return nccl_comm_; }
 
   /*! \brief  Set nccl communicators. */
   void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
+#endif
 
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
@@ -295,12 +297,14 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
 
+#if !defined(_WIN32)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
   // both within and across nodes.
   // But, this collectives is used for collectives over multiple GPUs within
   // nodes.
   ncclComm_t nccl_comm_{nullptr};
+#endif
 
   int compute_capability_;
   int runtime_version_;
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 400a6d7bfa..ca858f6024 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/gpu_info.h"
-
 #include <algorithm>
 #include <cstdlib>
 #include <string>
@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
 DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "Allocate a trunk of gpu memory that is this fraction of the "
               "total gpu memory size. Future memory usage will be allocated "
@@ -38,6 +39,23 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
+DEFINE_uint64(gpu_init_memory_in_mb, 0ul,
+              "Allocate a trunk of gpu memory whose size in MB is specified by "
+              "the flag. Future memory usage will be allocated from the "
+              "trunk. If the trunk doesn't have enough gpu memory, additional "
+              "trunks of the gpu memory will be requested from gpu with size "
+              "specified by FLAGS_gpu_reallocate_memory_in_mb until the gpu "
+              "has no memory left for the additional trunk. Note: if you set "
+              "this flag, the memory size set by "
+              "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
+              "flag. If you don't set this flag, PaddlePaddle will use "
+              "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+
+DEFINE_uint64(gpu_reallocate_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
+
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -180,13 +198,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+
+size_t GpuInitAllocSize() {
+  if (FLAGS_gpu_init_memory_in_mb > 0ul) {
+    // Initial memory will be allocated by FLAGS_gpu_init_memory_in_mb
+    return static_cast<size_t>(FLAGS_gpu_init_memory_in_mb << 20);
+  }
+
+  // FLAGS_gpu_init_memory_in_mb is 0, initial memory will be allocated by
+  // fraction
   size_t total = 0;
   size_t available = 0;
 
   GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
 
-  // Reserve the rest for page tables, etc.
-  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuReallocSize() {
+  if (FLAGS_gpu_reallocate_memory_in_mb > 0ul) {
+    // Additional memory will be allocated by FLAGS_gpu_reallocate_memory_in_mb
+    return static_cast<size_t>(FLAGS_gpu_reallocate_memory_in_mb << 20);
+  }
+
+  // FLAGS_gpu_reallocate_memory_in_mb is 0, additional memory will be allocated
+  // by fraction
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
 }
 
 size_t GpuMinChunkSize() {
@@ -201,16 +249,13 @@ size_t GpuMaxChunkSize() {
   GpuMemoryUsage(&available, &total);
   VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
            << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(0.05 * total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
       std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
                total - reserving);
 
-  // Reserving the rest memory for page tables, etc.
-
-  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
-                                          (total - reserving));
+  size_t allocating = GpuMaxAllocSize();
 
   PADDLE_ENFORCE_LE(allocating, available,
                     "Insufficient GPU memory to allocation.");
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 1e1ab2503f..d4be7ac97b 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
+//! Get the initial allocation size of current GPU device.
+size_t GpuInitAllocSize();
+
+//! Get the re-allocation size of current GPU device.
+size_t GpuReallocSize();
+
 //! Get the minimum chunk size for GPU buddy allocator.
 size_t GpuMinChunkSize();
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 4ac5b83c56..f1385f5718 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
   feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-  tracer analysis_predictor)
+  tracer analysis_predictor imperative_profiler)
 
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 6bbda69297..e9ed4e1644 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -38,20 +38,22 @@ void BindTracer(pybind11::module* m) {
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
               const imperative::VarBasePtrMap& inputs,
-              const imperative::VarBasePtrMap& outputs,
+              imperative::VarBasePtrMap* outputs,
               framework::AttributeMap attrs_map,
               const platform::CPUPlace expected_place,
               const bool stop_gradient = false) {
+             pybind11::gil_scoped_release release;
              return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                                stop_gradient);
            })
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
               const imperative::VarBasePtrMap& inputs,
-              const imperative::VarBasePtrMap& outputs,
+              imperative::VarBasePtrMap* outputs,
               framework::AttributeMap attrs_map,
               const platform::CUDAPlace expected_place,
               const bool stop_gradient = false) {
+             pybind11::gil_scoped_release release;
              return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                                stop_gradient);
            })
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 236afc77f7..11e9725aea 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -242,6 +242,10 @@ void BindAnalysisConfig(py::module *m) {
       .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
       .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
       .def("model_from_memory", &AnalysisConfig::model_from_memory)
+      .def("runtime_context_cache_enabled",
+           &AnalysisConfig::runtime_context_cache_enabled)
+      .def("switch_runtime_context_cache",
+           &AnalysisConfig::SwitchRuntimeContextCache, py::arg("x") = true)
       .def("pass_builder", &AnalysisConfig::pass_builder,
            py::return_value_policy::reference);
 }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5a753d0a78..691b437ab0 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/profiler.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) {
   m.def("print_mem_usage",
         []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
 
+  m.def("start_imperative_gperf_profiler",
+        []() { imperative::StartProfile(); });
+
+  m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
+
   py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
       .def(
           py::init<const std::string &, paddle::framework::proto::VarType::Type,
@@ -194,7 +200,7 @@ PYBIND11_MODULE(core, m) {
       .def_property("name", &imperative::VarBase::Name,
                     &imperative::VarBase::SetName)
       .def_property_readonly("shape", &imperative::VarBase::Shape)
-      .def_property_readonly("dtype", &imperative::VarBase::DType)
+      .def_property_readonly("dtype", &imperative::VarBase::DataType)
       .def_property("persistable", &imperative::VarBase::IsPersistable,
                     &imperative::VarBase::SetPersistable)
       .def_property("stop_gradient", &imperative::VarBase::IsStopGradient,
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index e91fa92924..cce153eb18 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("gpu_init_memory_in_mb");
+  envs.push_back("gpu_reallocate_memory_in_mb");
   envs.push_back("allocator_strategy");
 #elif __clang__
   envs.push_back("use_mkldnn");
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index dd35c6deaf..9d22b62e7e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -132,7 +132,8 @@ def __bootstrap__():
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
         'inner_op_parallelism', 'enable_parallel_graph',
-        'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize'
+        'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
+        'tracer_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -161,7 +162,8 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'fraction_of_gpu_memory_to_use', 'gpu_init_memory_in_mb',
+            'gpu_reallocate_memory_in_mb', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
             'sync_nccl_allreduce', 'limit_of_tmp_allocation',
diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
index 20e6328d81..a127f5b11b 100644
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
@@ -18,6 +18,7 @@ import os
 import time
 import logging
 
+import paddle
 from paddle.fluid import core
 from paddle.fluid import io
 from paddle.fluid import Program
@@ -84,8 +85,9 @@ def convert_dist_to_sparse_program(program):
     when we train model with distributed lookup table but want to do the local inference, we can use
     this function to convert the train program with distributed lookup table to sparse lookup table.
 
-    :param program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
-    :return:
+    Args:
+        program(Program): the program must be the trainer program, which is obtained from the distribute transpiler.
+    Returns:
         program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
     """
     if not program._distributed_lookup_table:
@@ -128,68 +130,92 @@ def convert_dist_to_sparse_program(program):
     return program
 
 
-def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
-    def _is_checkpoint_var(exclude_fluid_vars=None):
-        """
-        the checkpoint will not save or load all the variables.
-        var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-        : param var(Variable)
-        """
-
-        if exclude_fluid_vars is None:
-            exclude_fluid_vars = []
-
-        def is_valid(var):
-            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                    var.desc.type() == core.VarDesc.VarType.RAW:
-                return False
-            # @GRAD are named for gradient variables, checkpoint will not save it.
-            if "@GRAD" in var.name:
-                return False
-            # .trainer_ are named for distribute train variables, checkpoint will not save it.
-            if ".trainer_" in var.name:
-                return False
-
-            # .block is named for distribute train variables, checkpoint will not save it.
-            if ".block" in var.name:
-                return False
-
-            if "tmp_" in var.name:
-                return False
-
-            if var.name in exclude_fluid_vars:
-                return False
-
-            return var.persistable
-
-        return is_valid
-
-    io.load_vars(
-        executor,
-        dirname=dirname,
-        main_program=program,
-        predicate=_is_checkpoint_var(lookup_table_vars),
-        filename=None)
-
-
 def load_persistables_for_increment(dirname, executor, program,
                                     lookup_table_var, lookup_table_var_path):
     """
     WARNING: this function will only be used for distributed training with distributed lookup table.
     for increment trainning, the pserver will not only load dense variables,
-    but also load the suitable lookup table var. Because of slice lookup table
-    var with HASH, we must load the correct slice var.
+    but also load the suitable lookup table var. Because of sliced lookup table
+    var with HASH, we must load the correct sliced var.
+
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        program(Program): The parameter server program, which will run on Pserver.
+        lookup_table_var: the distributed lookup tables var name.
+        lookup_table_var_path: the distributed lookup tables var location.
+
+    Returns:
+        None
+    """
 
+    def _load_persistable_vars(executor, dirname, need_load_vars):
+        """Load need_load_vars from dirname by running a temporary Program."""
+        # `reduce` is not a builtin on Python 3, so import it explicitly.
+        from functools import reduce
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        need_delete_vars = []
+
+        for param in need_load_vars:
+            origin_var = param.origin
+            slice_var = param.slice
+            is_slice = param.is_slice
+            offset = param.offset
+            if is_slice:
+                origin = load_block.create_var(
+                    name="{}.load".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+                slice = load_block.create_var(
+                    name=slice_var.name,
+                    type=slice_var.type,
+                    shape=slice_var.shape,
+                    dtype=slice_var.dtype,
+                    persistable=True)
+
+                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:], 1)
+                start = int(offset / dim1_flatten)
+                end = int(offset / dim1_flatten + slice.shape[0])
+                load_block.append_op(
+                    type="slice",
+                    inputs={'Input': origin},
+                    outputs={'Out': slice},
+                    attrs={'axes': [0],
+                           'starts': [start],
+                           'ends': [end]})
+                need_delete_vars.append(origin)
+            else:
+                origin = load_block.create_var(
+                    name="{}".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
 
-    :param dirname(str): The directory path
-    :param executor(Executor): The executor to run for loading inference model.
-    :param program(Program): The parameter server program, which will run on Pserver.
-    :param lookup_table_var: the distributed lookup tables var name.
-    :param lookup_table_var_path: the the distributed lookup tables var location.
-    :return: None
-    """
+        load_block.append_op(
+            type='delete_var',
+            inputs={'X': need_delete_vars}, )
+
+        executor.run(load_prog)
 
     def __load_lookup_table_vars(executor, main_program, lookup_table_var,
                                  lookup_table_var_path):
@@ -217,7 +243,9 @@ def load_persistables_for_increment(dirname, executor, program,
                  "Distributed Lookup Table Vars from {}, time = {}".format(
                      dirname, time.ctime()))
 
-    _load_persistable_vars(executor, dirname, program, [lookup_table_var])
+    need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep(
+        program._ps_endpoint)
+    _load_persistable_vars(executor, dirname, need_load_vars)
     __load_lookup_table_vars(executor, program, lookup_table_var,
                              lookup_table_var_path)
 
@@ -233,15 +261,62 @@ def load_persistables_for_inference(dirname, executor, program,
     Inference with distributed lookup table is a little funky, this function will load distributed
     lookup table vars into sparse var, can be used in local inference mode.
 
-    :param dirname(str): The directory path
-    :param executor(Executor): The executor to run for loading inference model.
-    :param program(Program): The parameter server program, which will run on Pserver.
-    :param lookup_table_var_name: the distributed lookup tables var name.
-    :return: None
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        program(Program): The parameter server program, which will run on Pserver.
+        lookup_table_var_name: the distributed lookup tables var name.
+    Returns:
+        None
     """
 
-    def __load_lookup_table_vars(executor, dirname, main_program,
-                                 lookup_table_vars):
+    def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
+        def _is_checkpoint_var(exclude_fluid_vars=None):
+            """
+            the checkpoint will not save or load all the variables.
+            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+            : param var(Variable)
+            """
+
+            if exclude_fluid_vars is None:
+                exclude_fluid_vars = []
+
+            def is_valid(var):
+                if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                        var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                        var.desc.type() == core.VarDesc.VarType.RAW:
+                    return False
+                # @GRAD are named for gradient variables, checkpoint will not save it.
+                if "@GRAD" in var.name:
+                    return False
+                # .trainer_ are named for distribute train variables, checkpoint will not save it.
+                if ".trainer_" in var.name:
+                    return False
+
+                # .block is named for distribute train variables, checkpoint will not save it.
+                if ".block" in var.name:
+                    return False
+
+                if "tmp_" in var.name:
+                    return False
+
+                if var.name in exclude_fluid_vars:
+                    return False
+
+                return var.persistable
+
+            return is_valid
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var(lookup_table_vars),
+            filename=None)
+
+    def _load_lookup_table_vars(executor, dirname, main_program,
+                                lookup_table_vars):
         if not os.path.isdir(dirname):
             raise ValueError("There is no directory named '%s'", dirname)
 
@@ -313,11 +388,96 @@ def load_persistables_for_inference(dirname, executor, program,
                      dirname, time.ctime()))
 
     _load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
-    __load_lookup_table_vars(executor, dirname, program,
-                             [lookup_table_var_name])
+    _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name])
 
     _logger.info("Finish Load Sparse Program With "
                  "Distributed Lookup Table Vars from {}, time = {}".format(
                      dirname, time.ctime()))
 
     return program
+
+
+def get_inference_model(main_program, feeded_var_names, target_vars):
+    """
+    Prune a clone of `main_program` into an inference program for models that
+    use a distributed lookup table, then add feed ops for `feeded_var_names`
+    and fetch ops for `target_vars` to it.
+
+    Args:
+        main_program(Program|None): The original program, which will be pruned to
+                                    build the inference model. If set to None,
+                                    the default main program will be used.
+        feeded_var_names(list[str]): Names of variables that need to be fed data
+                                     during inference.
+        target_vars(list[Variable]): Variables from which we can get inference
+                                     results.
+    Returns:
+        program(Program): the pruned program with feed and fetch ops added.
+
+    Note:
+        No type validation of the arguments is performed here.
+    """
+
+    def prepend_feed_ops(inference_program,
+                         feed_target_names,
+                         feed_holder_name='feed'):
+        if len(feed_target_names) == 0:
+            return
+
+        global_block = inference_program.global_block()
+
+        feed_var = global_block.create_var(
+            name=feed_holder_name,
+            type=core.VarDesc.VarType.FEED_MINIBATCH,
+            persistable=True)
+
+        for i, name in enumerate(feed_target_names):
+            out = global_block.var(name)
+            global_block._prepend_op(
+                type='feed',
+                inputs={'X': [feed_var]},
+                outputs={'Out': [out]},
+                attrs={'col': i})
+
+    def append_fetch_ops(inference_program,
+                         fetch_target_names,
+                         fetch_holder_name='fetch'):
+        global_block = inference_program.global_block()
+        fetch_var = global_block.create_var(
+            name=fetch_holder_name,
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=True)
+
+        for i, name in enumerate(fetch_target_names):
+            global_block.append_op(
+                type='fetch',
+                inputs={'X': [name]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i})
+
+    if main_program is None:
+        main_program = paddle.fluid.default_main_program()
+    # Everything below operates on a clone of the program.
+    main_program = main_program.clone()
+    global_block = main_program.global_block()
+
+    need_to_remove_op_index = []
+    for i, op in enumerate(global_block.ops):
+        op.desc.set_is_target(False)
+        if op.type == "feed" or op.type == "fetch":
+            need_to_remove_op_index.append(i)
+
+    # Remove from the back so the indices still to be removed stay valid.
+    for index in need_to_remove_op_index[::-1]:
+        global_block._remove_op(index)
+
+    main_program.desc.flush()
+
+    main_program = main_program._prune(targets=target_vars)
+    main_program = main_program._inference_optimize(prune_read_op=True)
+
+    fetch_var_names = [v.name for v in target_vars]
+
+    prepend_feed_ops(main_program, feeded_var_names)
+    append_fetch_ops(main_program, fetch_var_names)
+
+    return main_program
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index a24e1d1300..3dac41ce43 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -268,8 +268,8 @@ class DataFeeder(object):
         Args:
             reader(function): the reader is the function which can generate data.
             multi_devices(bool): whether to use multiple devices or not.
-            num_places(int): if the multi_devices is True, you can specify the number
-                of GPU to use, if 'num_places' is None, the function will use all the
+            num_places(int): if multi_devices is True, you can specify the number
+                of GPU to use, if num_places is None, the function will use all the
                 GPU of the current machine. Default None.
             drop_last(bool): whether to drop the last batch if the
                 size of the last batch is less than batch_size. Default True.
@@ -278,7 +278,7 @@ class DataFeeder(object):
             dict: the result of conversion.
 
         Raises:
-            ValueError: If drop_last is False and the data batch which cannot fit for devices.
+            ValueError: If drop_last is False and the data batch cannot fit for devices.
         """
 
         def __reader_creator__():
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index cc3c0dd689..03aa9917f3 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -470,13 +470,21 @@ class Executor(object):
             program(Program|CompiledProgram): the program that need to run,
                 if not provided, then default_main_program (not compiled) will be used.
             feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
-            fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
-            feed_var_name(str): the name for the input variable of feed Operator.
-            fetch_var_name(str): the name for the output variable of fetch Operator.
-            scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
+            fetch_list(list): a list of variable or variable names that user 
+                wants to get, this method will return them according to this list.
+            feed_var_name(str): the name for the input variable of 
+                feed Operator.
+            fetch_var_name(str): the name for the output variable of 
+                fetch Operator.
+            scope(Scope): the scope used to run this program, you can switch 
+                it to different scope. default is global_scope
             return_numpy(bool): if convert the fetched tensor to numpy
-            use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.
-
+            use_program_cache(bool): whether to use the cached program 
+                settings across batches. Setting it to true would be faster
+                only when (1) the program is not compiled with data parallel,
+                and (2) program, feed variable names and fetch_list variable
+                names are not changed compared to the last step.
+                
         Returns:
 
             list(numpy.array): fetch result according to fetch_list.
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 8988c55096..556ce71ee5 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -430,6 +430,11 @@ class Variable(object):
         Returns:
             str: The debug string.
         """
+        if _in_imperative_mode():
+            # TODO(panyx0718): add more imperative debug info.
+            return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
+                                                     self.shape)
+
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
         protostr = self.desc.serialize_to_string()
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
index 034a11e0a6..7f31ca1b9b 100644
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -26,8 +26,12 @@ from .nn import *
 from . import tracer
 from .tracer import *
 
+from . import profiler
+from .profiler import *
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
 __all__ += nn.__all__
 __all__ += tracer.__all__
+__all__ += profiler.__all__
diff --git a/python/paddle/fluid/imperative/profiler.py b/python/paddle/fluid/imperative/profiler.py
new file mode 100644
index 0000000000..04c865500b
--- /dev/null
+++ b/python/paddle/fluid/imperative/profiler.py
@@ -0,0 +1,30 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from .. import core
+
+__all__ = [
+    'start_gperf_profiler',
+    'stop_gperf_profiler',
+]
+
+
+def start_gperf_profiler():
+    core.start_imperative_gperf_profiler()
+
+
+def stop_gperf_profiler():
+    core.stop_imperative_gperf_profiler()
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 9183bfd43b..0a1ddbc1db 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -49,6 +49,7 @@ __all__ = [
     'box_coder',
     'polygon_box_transform',
     'yolov3_loss',
+    'yolo_box',
     'box_clip',
     'multiclass_nms',
     'distribute_fpn_proposals',
@@ -628,6 +629,83 @@ def yolov3_loss(x,
     return loss
 
 
+@templatedoc(op_type="yolo_box")
+def yolo_box(x,
+             img_size,
+             anchors,
+             class_num,
+             conf_thresh,
+             downsample_ratio,
+             name=None):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        img_size (Variable): ${img_size_comment}
+        anchors (list|tuple): ${anchors_comment}
+        class_num (int): ${class_num_comment}
+        conf_thresh (float): ${conf_thresh_comment}
+        downsample_ratio (int): ${downsample_ratio_comment}
+        name (string): the name of yolo box layer. Default None.
+
+    Returns:
+        tuple(Variable, Variable): A 3-D tensor with shape [N, M, 4], the
+        coordinates of boxes, and a 3-D tensor with shape
+        [N, M, :attr:`class_num`], the classification scores of boxes.
+
+    Raises:
+        TypeError: Input x of yolo_box must be Variable
+        TypeError: Input img_size of yolo_box must be Variable
+        TypeError: Attr anchors of yolo_box must be list or tuple
+        TypeError: Attr class_num of yolo_box must be an integer
+        TypeError: Attr conf_thresh of yolo_box must be a float number
+
+    Examples:
+
+    .. code-block:: python
+
+        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+        img_size = fluid.layers.data(name='img_size', shape=[2], dtype='int32')
+        anchors = [10, 13, 16, 30, 33, 23]
+        boxes, scores = fluid.layers.yolo_box(
+            x=x, img_size=img_size, anchors=anchors, class_num=80,
+            conf_thresh=0.01, downsample_ratio=32)
+    """
+    helper = LayerHelper('yolo_box', **locals())
+
+    if not isinstance(x, Variable):
+        raise TypeError("Input x of yolo_box must be Variable")
+    if not isinstance(img_size, Variable):
+        raise TypeError("Input img_size of yolo_box must be Variable")
+    if not isinstance(anchors, (list, tuple)):
+        raise TypeError("Attr anchors of yolo_box must be list or tuple")
+    if not isinstance(class_num, int):
+        raise TypeError("Attr class_num of yolo_box must be an integer")
+    if not isinstance(conf_thresh, float):
+        raise TypeError("Attr conf_thresh of yolo_box must be a float number")
+
+    # Both outputs are created with the same dtype as the input x.
+    boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
+    scores = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    attrs = {
+        "anchors": anchors,
+        "class_num": class_num,
+        "conf_thresh": conf_thresh,
+        "downsample_ratio": downsample_ratio,
+    }
+
+    helper.append_op(
+        type='yolo_box',
+        inputs={"X": x,
+                "ImgSize": img_size},
+        outputs={'Boxes': boxes,
+                 'Scores': scores},
+        attrs=attrs)
+    return boxes, scores
+
+
 @templatedoc()
 def detection_map(detect_res,
                   label,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ea028b0566..dbe495b75c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -23,7 +23,8 @@ import os
 import inspect
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder
+from ..framework import Variable, OpProtoHolder, _in_imperative_mode
+from ..imperative import base
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
 from .tensor import concat, assign
@@ -205,16 +206,23 @@ def fc(input,
     **Fully Connected Layer**
 
     This function creates a fully connected layer in the network. It can take
-    multiple tensors as its inputs. It creates a variable called weights for
-    each input tensor, which represents a fully connected weight matrix from
-    each input unit to each output unit. The fully connected layer multiplies
-    each input tensor with its coresponding weight to produce an output Tensor.
-    If multiple input tensors are given, the results of multiple multiplications
-    will be sumed up. If bias_attr is not None, a bias variable will be created
-    and added to the output. Finally, if activation is not None, it will be applied
-    to the output as well.
+    one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [M, `size`],
+    where M is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+    is not None, a bias variable will be created and added to the output.
+    Finally, if activation is not None, it will be applied to the output as well.
+
+    When the input is a single tensor:
 
-    This process can be formulated as follows:
+    .. math::
+
+        Out = Act({XW + b})
+
+    When the input is a list of multiple tensors:
 
     .. math::
 
@@ -222,13 +230,31 @@ def fc(input,
 
     In the above equation:
 
-    * :math:`N`: Number of the input.
-    * :math:`X_i`: The input tensor.
-    * :math:`W`: The weights created by this layer.
+    * :math:`N`: Number of input tensors. N equals len(input) if input is a list of Variable.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The i-th weight matrix corresponding to the i-th input tensor.
     * :math:`b`: The bias parameter created by this layer (if needed).
     * :math:`Act`: The activation function.
     * :math:`Out`: The output tensor.
 
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            data_1.data = [[[0.1, 0.2],
+                           [0.3, 0.4]]]
+            data_1.shape = (1, 2, 2) # 1 is batch_size
+
+            data_2.data = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3)
+
+            out = fluid.layers.fc(input=[data_1, data_2], size=2)
+
+        Then:
+            out.data = [[0.18669507, 0.1893476]]
+            out.shape = (1, 2)
+
     Args:
         input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
             the input tensor(s) is at least 2.
@@ -260,8 +286,14 @@ def fc(input,
     Examples:
         .. code-block:: python
 
+          # when input is a single tensor
           data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+
+          # when input is a list of multiple tensors
+          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
+          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
+          fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
     """
 
     helper = LayerHelper("fc", **locals())
@@ -4864,7 +4896,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         if transpose_y:
             y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
         if x_shape[-1] != y_shape[-2]:
-            raise ValueError("Invalid inputs for matmul.")
+            raise ValueError("Invalid inputs for matmul. x: %s, y: %s\n" %
+                             (x_shape, y_shape))
 
         if len(y_shape) > 2 and len(x_shape) > 2:
             for i, dim_x in enumerate(x_shape[:-2]):
@@ -6367,6 +6400,8 @@ def squeeze(input, axes, name=None):
             x = layers.data(name='x', shape=[5, 1, 10])
             y = layers.sequeeze(input=x, axes=[1])
     """
+    assert not _in_imperative_mode(), (
+        "squeeze layer is not supported in imperative mode yet.")
     helper = LayerHelper("squeeze", **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -9104,6 +9139,10 @@ def _elementwise_op(helper):
     op_type = helper.layer_type
     x = helper.kwargs.get('x', None)
     y = helper.kwargs.get('y', None)
+    if _in_imperative_mode():
+        x = base.to_variable(x)
+        y = base.to_variable(y)
+
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     assert y is not None, 'y cannot be None in {}'.format(op_type)
     axis = helper.kwargs.get('axis', -1)
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index b756c532ca..7d1b869cf5 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -489,6 +489,16 @@ class TestYoloDetection(unittest.TestCase):
 
             self.assertIsNotNone(loss)
 
+    def test_yolo_box(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
+            img_size = layers.data(name='img_size', shape=[2], dtype='int32')
+            boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10,
+                                            0.01, 32)
+            self.assertIsNotNone(boxes)
+            self.assertIsNotNone(scores)
+
 
 class TestBoxClip(unittest.TestCase):
     def test_box_clip(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
new file mode 100644
index 0000000000..a8127bcc78
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+from mkldnn_op_test import format_reorder
+
+
+class TestTransposeOp(OpTest):
+    def setUp(self):
+        self.init_op_type()
+        self.initTestCase()
+        self.initInputData()
+        self.use_mkldnn = True
+        self.axis = (0, 2, 3, 1)
+
+        self.inputs = {
+            'X': format_reorder(self.input_data, self.shape)
+        }  #transform data format to 'NHWC' for INT8 transpose specially.
+
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+        }
+
+        self.outputs = {
+            'XShape': np.random.random(self.shape).astype('int8'),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }
+
+    def init_op_type(self):
+        self.op_type = "transpose2"
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+
+    def initInputData(self):
+        self.input_data = (
+            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+
+
+class TestINT8Case(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 6, 8)
+
+    def initInputData(self):
+        self.input_data = (
+            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+
+
+class TestUINT8Case(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (1, 3, 5, 7)
+
+    def initInputData(self):
+        # Override the hook that setUp() calls so uint8 data is really used.
+        self.input_data = np.random.randint(0, 100, self.shape).astype('uint8')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
new file mode 100644
index 0000000000..2086fab5c8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
+
+
+def gen_data():
+    pass
+
+
+class GraphConv(fluid.imperative.Layer):
+    def __init__(self, name_scope, in_features, out_features):
+        super(GraphConv, self).__init__(name_scope)
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self.weight = self.create_parameter(
+            attr=None,
+            dtype='float32',
+            shape=[self._in_features, self._out_features])
+        self.bias = self.create_parameter(
+            attr=None, dtype='float32', shape=[self._out_features])
+
+    def forward(self, features, adj):
+        support = fluid.layers.matmul(features, self.weight)
+        # TODO(panyx0718): sparse matmul?
+        return fluid.layers.matmul(adj, support) + self.bias
+
+
+class GCN(fluid.imperative.Layer):
+    def __init__(self, name_scope, num_hidden):
+        super(GCN, self).__init__(name_scope)
+        self.gc = GraphConv(self.full_name(), num_hidden, 32)
+        self.gc2 = GraphConv(self.full_name(), 32, 10)
+
+    def forward(self, x, adj):
+        x = fluid.layers.relu(self.gc(x, adj))
+        return self.gc2(x, adj)
+
+
+class TestImperativeGNN(unittest.TestCase):
+    def test_gnn_float32(self):
+        seed = 90
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        main = fluid.Program()
+        main.random_seed = seed
+
+        scope = fluid.core.Scope()
+        with new_program_scope(main=main, startup=startup, scope=scope):
+            features = fluid.layers.data(
+                name='features',
+                shape=[1, 100, 50],
+                dtype='float32',
+                append_batch_size=False)
+            # Use selected rows when it's supported.
+            adj = fluid.layers.data(
+                name='adj',
+                shape=[1, 100, 100],
+                dtype='float32',
+                append_batch_size=False)
+            labels = fluid.layers.data(
+                name='labels',
+                shape=[100, 1],
+                dtype='int64',
+                append_batch_size=False)
+
+            model = GCN('test_gcn', 50)
+            logits = model(features, adj)
+            logits = fluid.layers.reshape(logits, logits.shape[1:])
+            # In other example, it's nll with log_softmax. However, paddle's
+            # log_loss only supports binary classification now.
+            loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
+            loss = fluid.layers.reduce_sum(loss)
+
+            adam = AdamOptimizer(learning_rate=1e-3)
+            adam.minimize(loss)
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            exe.run(startup)
+            static_loss = exe.run(feed={
+                'features': np.zeros(
+                    [1, 100, 50], dtype=np.float32),
+                'adj': np.zeros(
+                    [1, 100, 100], dtype=np.float32),
+                'labels': np.zeros(
+                    [100, 1], dtype=np.int64)
+            },
+                                  fetch_list=[loss])[0]
+
+            static_weight = np.array(
+                scope.find_var(model.gc.weight.name).get_tensor())
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            features = np.zeros([1, 100, 50], dtype=np.float32)
+            # Use selected rows when it's supported.
+            adj = np.zeros([1, 100, 100], dtype=np.float32)
+            labels = np.zeros([100, 1], dtype=np.int64)
+
+            model = GCN('test_gcn', 50)
+            logits = model(to_variable(features), to_variable(adj))
+            logits = fluid.layers.reshape(logits, logits.shape[1:])
+            # In other example, it's nll with log_softmax. However, paddle's
+            # log_loss only supports binary classification now.
+            loss = fluid.layers.softmax_with_cross_entropy(logits,
+                                                           to_variable(labels))
+            loss = fluid.layers.reduce_sum(loss)
+            adam = AdamOptimizer(learning_rate=1e-3)
+            adam.minimize(loss)
+            self.assertEqual(static_loss, loss._numpy())
+            self.assertTrue(
+                np.allclose(static_weight, model.gc.weight._numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 5b186ae038..885ee170e8 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -84,6 +84,27 @@ class TestLayer(LayerTest):
 
         self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
 
+    def test_matmul(self):
+        with self.static_graph():
+            t = layers.data(name='t', shape=[3, 3], dtype='float32')
+            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
+            ret = layers.matmul(t, t2)
+            static_ret = self.get_static_graph_result(
+                feed={
+                    't': np.ones(
+                        [3, 3], dtype='float32'),
+                    't2': np.ones(
+                        [3, 3], dtype='float32')
+                },
+                fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            t = np.ones([3, 3], dtype='float32')
+            t2 = np.ones([3, 3], dtype='float32')
+            dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
+
+        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+
     def test_conv2d(self):
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
@@ -153,6 +174,60 @@ class TestLayer(LayerTest):
             self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
             self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
 
+    def test_elementwise_math(self):
+        n = np.ones([3, 3], dtype='float32')
+        n2 = np.ones([3, 3], dtype='float32') * 1.1
+        n3 = np.ones([3, 3], dtype='float32') * 2
+        n4 = np.ones([3, 3], dtype='float32') * 3
+        n5 = np.ones([3, 3], dtype='float32') * 4
+        n6 = np.ones([3, 3], dtype='float32') * 5
+
+        with self.static_graph():
+            t = layers.data(name='t', shape=[3, 3], dtype='float32')
+            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
+            t3 = layers.data(name='t3', shape=[3, 3], dtype='float32')
+            t4 = layers.data(name='t4', shape=[3, 3], dtype='float32')
+            t5 = layers.data(name='t5', shape=[3, 3], dtype='float32')
+            t6 = layers.data(name='t6', shape=[3, 3], dtype='float32')
+
+            ret = layers.elementwise_add(t, t2)
+            ret = layers.elementwise_pow(ret, t3)
+            ret = layers.elementwise_div(ret, t4)
+            ret = layers.elementwise_sub(ret, t5)
+            ret = layers.elementwise_mul(ret, t6)
+
+            static_ret = self.get_static_graph_result(
+                feed={
+                    't': n,
+                    't2': n2,
+                    't3': n3,
+                    't4': n4,
+                    't5': n5,
+                    't6': n6
+                },
+                fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            ret = layers.elementwise_add(n, n2)
+            ret = layers.elementwise_pow(ret, n3)
+            ret = layers.elementwise_div(ret, n4)
+            ret = layers.elementwise_sub(ret, n5)
+            dy_ret = layers.elementwise_mul(ret, n6)
+        self.assertTrue(
+            np.allclose(static_ret, dy_ret._numpy()),
+            '%s vs %s' % (static_ret, dy_ret._numpy()))
+
+    def test_elementwise_minmax(self):
+        n = np.ones([3, 3], dtype='float32')
+        n2 = np.ones([3, 3], dtype='float32') * 2
+
+        with self.dynamic_graph():
+            min_ret = layers.elementwise_min(n, n2)
+            max_ret = layers.elementwise_max(n, n2)
+
+        self.assertTrue(np.allclose(n, min_ret._numpy()))
+        self.assertTrue(np.allclose(n2, max_ret._numpy()))
+
 
 class TestBook(unittest.TestCase):
     def test_fit_a_line(self):
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 5fdabbabed..aefd8cb6d3 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -87,5 +87,31 @@ class TestFP16(TestSliceOp):
                 place, ['Input'], 'Out', max_relative_error=0.006)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFP16_2(TestSliceOp):
+    def config(self):
+        self.dtype = "float16"
+        self.input = np.random.random([3, 4, 5]).astype(self.dtype)
+        self.starts = [0]
+        self.ends = [1]
+        self.axes = [1]
+        self.out = self.input[:, 0:1, :]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Out',
+                max_relative_error=0.006,
+                numeric_grad_delta=0.5)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
new file mode 100644
index 0000000000..416e6ea9f4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -0,0 +1,117 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+from paddle.fluid import core
+
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def YoloBox(x, img_size, attrs):
+    n, c, h, w = x.shape
+    anchors = attrs['anchors']
+    an_num = int(len(anchors) // 2)
+    class_num = attrs['class_num']
+    conf_thresh = attrs['conf_thresh']
+    downsample = attrs['downsample']
+    input_size = downsample * h
+
+    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+
+    pred_box = x[:, :, :, :, :4].copy()
+    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
+    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
+    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
+    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
+
+    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
+    anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1))
+    anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1))
+    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
+    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
+
+    pred_conf = sigmoid(x[:, :, :, :, 4:5])
+    pred_conf[pred_conf < conf_thresh] = 0.
+    pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf
+    pred_box = pred_box * (pred_conf > 0.).astype('float32')
+
+    pred_box = pred_box.reshape((n, -1, 4))
+    pred_box[:, :, :2], pred_box[:, :, 2:4] = \
+        pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \
+        pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0
+    pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis]
+    pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis]
+    pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis]
+    pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis]
+
+    for i in range(len(pred_box)):
+        pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf)
+        pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf)
+        pred_box[i, :, 2] = np.clip(pred_box[i, :, 2], -np.inf,
+                                    img_size[i, 1] - 1)
+        pred_box[i, :, 3] = np.clip(pred_box[i, :, 3], -np.inf,
+                                    img_size[i, 0] - 1)
+
+    return pred_box, pred_score.reshape((n, -1, class_num))
+
+
+class TestYoloBoxOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'yolo_box'
+        x = np.random.random(self.x_shape).astype('float32')
+        img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32')
+
+        self.attrs = {
+            "anchors": self.anchors,
+            "class_num": self.class_num,
+            "conf_thresh": self.conf_thresh,
+            "downsample": self.downsample,
+        }
+
+        self.inputs = {
+            'X': x,
+            'ImgSize': img_size,
+        }
+        boxes, scores = YoloBox(x, img_size, self.attrs)
+        self.outputs = {
+            "Boxes": boxes,
+            "Scores": scores,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        an_num = int(len(self.anchors) // 2)
+        self.batch_size = 32
+        self.class_num = 2
+        self.conf_thresh = 0.5
+        self.downsample = 32
+        self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13)
+        self.imgsize_shape = (self.batch_size, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
index 678026cf95..b55a6298f6 100644
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
@@ -38,9 +38,8 @@ items. It can be any function with no parameter that creates a iterable
 Element produced from the iterable should be a **single** entry of data,
 **not** a mini batch. That entry of data could be a single item, or a tuple of
 items.
-Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
-/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
-array of float32, int, list of int)
+Each item should be of a supported type (e.g., a numpy array or a list/tuple
+of float or int).
 
 An example implementation for single item data reader creator:
 
@@ -62,8 +61,6 @@ An example implementation for multiple item data reader creator:
                 yield numpy.random.uniform(-1, 1, size=width*height), label
     return reader
 
-
-TODO(yuyang18): Should we add whole design doc here?
 """
 
 import paddle.reader.decorator
diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py
index c861020225..353aca92f4 100644
--- a/python/paddle/reader/creator.py
+++ b/python/paddle/reader/creator.py
@@ -44,8 +44,11 @@ def text_file(path):
     Creates a data reader that outputs text line by line from given text file.
     Trailing new line ('\\\\n') of each line will be removed.
 
-    :path: path of the text file.
-    :returns: data reader of text file
+    Args:
+        path (str): path of the text file.
+    
+    Returns: 
+        callable: data reader of text file.
     """
 
     def reader():
@@ -59,10 +62,15 @@ def text_file(path):
 
 def recordio(paths, buf_size=100):
     """
-    Creates a data reader from given RecordIO file paths separated by ",",
-        glob pattern is supported.
-    :path: path of recordio files, can be a string or a string list.
-    :returns: data reader of recordio files.
+    Creates a data reader from given RecordIO file paths separated 
+    by ",", glob pattern is supported.
+
+    Args:
+        paths (str|list(str)): path of recordio files.
+        buf_size (int): prefetched buffer size. 
+
+    Returns:
+        callable: data reader of recordio files.
     """
 
     import recordio as rec
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index b2ef9f7580..685d08b9e0 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -242,20 +242,18 @@ class XmapEndSignal():
 
 def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     """
-    Use multiprocess to map samples from reader by a mapper defined by user.
-    And this function contains a buffered decorator.
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param reader: the data reader to read from
-    :type reader: callable
-    :param process_num: process number to handle original sample
-    :type process_num: int
-    :param buffer_size: max buffer size
-    :type buffer_size: int
-    :param order: keep the order of reader
-    :type order: bool
-    :return: the decarated reader
-    :rtype: callable
+    Use multiple threads to map samples from reader with a user-defined mapper.
+
+    Args:
+        mapper (callable): a function to map the data from reader.
+        reader (callable): a data reader which yields the data. 
+        process_num (int): thread number to handle original sample.
+        buffer_size (int): size of the queue to read data in. 
+        order (bool): whether to keep the data order from original reader. 
+            Default False.
+
+    Returns:
+        callable: a decorated reader with data mapping. 
     """
     end = XmapEndSignal()
 
@@ -477,7 +475,7 @@ class PipeReader:
         """
         :param cut_lines: cut buffer to lines
         :type cut_lines: bool
-        :param line_break: line break of the file, like \n or \r
+        :param line_break: line break of the file, like '\\\\n' or '\\\\r'
         :type line_break: string
 
         :return: one line or a buffer of bytes
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
index 1b0059a8c6..3be94a42d5 100644
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -153,3 +153,9 @@ done
 
 # Restore LD_LIBRARY_PATH
 LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
+
+# Because of this ar issue: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html
+# we build and install a newer ar here, configured with 64-bit archive support.
+wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz
+tar xzf binutils-2.27.tar.gz && cd binutils-2.27
+./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install

From 090d25f724f34e92e74c95a6a0c95d17759b9eeb Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Fri, 22 Mar 2019 12:22:29 +0800
Subject: [PATCH 02/11] test=develop;fix docker build failed

---
 tools/manylinux1/Dockerfile.x64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index c2fd743f62..c37a9a92e6 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -52,7 +52,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 
-RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]

From 77a08750e954cd7e995cd76e59b3a8ea8efbd20e Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Fri, 22 Mar 2019 05:08:59 +0000
Subject: [PATCH 03/11] add var name in optimizer; test=develop

---
 paddle/fluid/API.spec            | 11 +++++++++++
 python/paddle/fluid/optimizer.py | 11 ++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9b111e09e0..598604ca7a 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -432,48 +432,59 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo
 paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
 paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index d501d02bd4..8918886a80 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -70,6 +70,10 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._opti_name_list = []
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
 
     def _create_global_learning_rate(self):
         lr = self._global_learning_rate()
@@ -166,8 +170,13 @@ class Optimizer(object):
         if shape == None:
             shape = param.shape
         assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_" + name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
         var = self.helper.create_global_variable(
-            name=unique_name.generate(name),
+            name=var_name,
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,

From 55a7b98126f8ae08916d9b3740f5fb310b851094 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 22 Mar 2019 17:25:40 +0800
Subject: [PATCH 04/11] Add DeepCF model

test=develop
---
 .../tests/unittests/test_imperative_deepcf.py | 196 ++++++++++++++++++
 .../tests/unittests/test_imperative_gan.py    |   2 +-
 2 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_deepcf.py

diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
new file mode 100644
index 0000000000..af80ca6ce7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
+
+NUM_USERS = 100
+NUM_ITEMS = 1000
+
+BATCH_SIZE = 32
+NUM_BATCHES = 2
+
+
+class MLP(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+
+        self._user_layers = []
+        self._item_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._user_layers.append(
+                self.add_sublayer(
+                    'user_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+            self._item_layers.append(
+                self.add_sublayer(
+                    'item_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+
+        for ul, il in zip(self._user_layers, self._item_layers):
+            users = ul(users)
+            items = il(items)
+        return fluid.layers.elementwise_mul(users, items)
+
+
+class DMF(fluid.imperative.Layer):
+    """Project users/items with FC layers, concatenate, and run the match FCs."""
+    def __init__(self, name_scope):
+        super(DMF, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._match_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._match_layers.append(
+                self.add_sublayer(
+                    'match_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+        match_vec = fluid.layers.concat(
+            [users, items], axis=len(users.shape) - 1)
+        for layer in self._match_layers:
+            match_vec = layer(match_vec)
+        return match_vec
+
+
+class DeepCF(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(DeepCF, self).__init__(name_scope)
+
+        self._user_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_USERS, 256])
+        self._item_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_ITEMS, 256])
+
+        self._mlp = MLP(self.full_name())
+        self._dmf = DMF(self.full_name())
+        self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')
+
+    def forward(self, users, items):
+        users_emb = self._user_emb(users)
+        items_emb = self._item_emb(items)
+
+        mlp_predictive = self._mlp(users_emb, items_emb)
+        dmf_predictive = self._dmf(users_emb, items_emb)
+        predictive = fluid.layers.concat(
+            [mlp_predictive, dmf_predictive],
+            axis=len(mlp_predictive.shape) - 1)
+        prediction = self._match_fc(predictive)
+        return prediction
+
+
+def get_data():
+    user_ids = []
+    item_ids = []
+    labels = []
+    for uid in range(NUM_USERS):
+        for iid in range(NUM_ITEMS):
+            # 10% positive
+            label = float(random.randint(1, 10) == 1)
+            user_ids.append(uid)
+            item_ids.append(iid)
+            labels.append(label)
+    indices = np.arange(NUM_USERS * NUM_ITEMS)
+    np.random.shuffle(indices)
+    users_np = np.array(user_ids, dtype=np.int64)[indices]
+    items_np = np.array(item_ids, dtype=np.int64)[indices]
+    labels_np = np.array(labels, dtype=np.float32)[indices]
+    return np.expand_dims(users_np, -1), \
+           np.expand_dims(items_np, -1), \
+           np.expand_dims(labels_np, -1)
+
+
+class TestImperativeDeepCF(unittest.TestCase):
+    def test_deepcf_float32(self):
+        seed = 90
+        users_np, items_np, labels_np = get_data()
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        main = fluid.Program()
+        main.random_seed = seed
+
+        scope = fluid.core.Scope()
+        with new_program_scope(main=main, startup=startup, scope=scope):
+            users = fluid.layers.data('users', [1], dtype='int64')
+            items = fluid.layers.data('items', [1], dtype='int64')
+            labels = fluid.layers.data('labels', [1], dtype='float32')
+
+            deepcf = DeepCF('deepcf')
+            prediction = deepcf(users, items)
+            loss = fluid.layers.reduce_sum(
+                fluid.layers.log_loss(prediction, labels))
+            adam = fluid.optimizer.AdamOptimizer(0.01)
+            adam.minimize(loss)
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            exe.run(startup)
+            for start in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                static_loss = exe.run(
+                    main,
+                    feed={
+                        users.name: users_np[start:start + BATCH_SIZE],
+                        items.name: items_np[start:start + BATCH_SIZE],
+                        labels.name: labels_np[start:start + BATCH_SIZE]
+                    },
+                    fetch_list=[loss])[0]
+                sys.stderr.write('static loss %s\n' % static_loss)
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            deepcf = DeepCF('deepcf')
+            for start in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                prediction = deepcf(
+                    to_variable(users_np[start:start + BATCH_SIZE]),
+                    to_variable(items_np[start:start + BATCH_SIZE]))
+                loss = fluid.layers.reduce_sum(
+                    fluid.layers.log_loss(prediction,
+                                          to_variable(labels_np[start:start +
+                                                                BATCH_SIZE])))
+                loss._backward()
+                adam = fluid.optimizer.AdamOptimizer(0.01)
+                adam.minimize(loss)
+                deepcf.clear_gradients()
+                dy_loss = loss._numpy()
+
+        self.assertEqual(static_loss, dy_loss)  # last-batch losses
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index a80202d6dd..6024fb5f81 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
         return self._fc3(x)
 
 
-class TestImperativeMnist(unittest.TestCase):
+class TestImperativeGAN(unittest.TestCase):
     def test_gan_float32(self):
         seed = 90
 

From d11d0e18c27783183867cd2ab6d98adbc44b6df1 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Fri, 22 Mar 2019 10:12:21 +0000
Subject: [PATCH 05/11] remove test_dist_transplier; test=develop

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b61ef706ba..ecfbfe3b54 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -105,7 +105,6 @@ if(WITH_DISTRIBUTE)
         # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
-    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)

From 6b971e1f190faf31e06855bc9207bf31a1610671 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Fri, 22 Mar 2019 10:16:55 +0000
Subject: [PATCH 06/11] remove test_dist_transplier; test=develop

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ecfbfe3b54..3c6b9daca6 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -105,6 +105,7 @@ if(WITH_DISTRIBUTE)
         # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
+    # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)

From 5d6737b5cbf3669ac0df06660e2826452dc9eb53 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 22 Mar 2019 20:30:10 +0800
Subject: [PATCH 07/11] Fix bug in affine_channel API (#16373)

---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 45e7a67711..e2c8be613f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9768,7 +9768,7 @@ def affine_channel(x,
                 'Bias': bias},
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
-    return helper.append_activation(pre_activation)
+    return helper.append_activation(out)
 
 
 def similarity_focus(input, axis, indexes, name=None):

From 18779b5b8fb8007d10dc24f38dbbc611848f81e1 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Sat, 23 Mar 2019 07:39:28 +0800
Subject: [PATCH 08/11] [Operator] Add range op. (#15431)

* Add range op.
test=develop

* Add more unittests.
test=develop

* Fix API.spec
test=develop

* Fix API.spec
test=develop

* Fix API.spec
test=develop
---
 paddle/fluid/API.spec                         |  1 +
 paddle/fluid/operators/range_op.cc            | 69 ++++++++++++++++++
 paddle/fluid/operators/range_op.cu            | 67 +++++++++++++++++
 paddle/fluid/operators/range_op.h             | 56 +++++++++++++++
 python/paddle/fluid/layers/tensor.py          | 71 +++++++++++++++++--
 .../fluid/tests/unittests/test_layers.py      |  8 +++
 .../fluid/tests/unittests/test_range.py       | 70 ++++++++++++++++++
 7 files changed, 338 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/range_op.cc
 create mode 100644 paddle/fluid/operators/range_op.cu
 create mode 100644 paddle/fluid/operators/range_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_range.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 598604ca7a..6c61a3d63d 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -255,6 +255,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=
 paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
+paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
new file mode 100644
index 0000000000..ee8c68fd00
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/range_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RangeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->HasInput("Start")) {
+      auto s_dims = ctx->GetInputDim("Start");
+      PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
+                     "The shape of Input(Start) should be [1].");
+    }
+    if (ctx->HasInput("End")) {
+      auto e_dims = ctx->GetInputDim("End");
+      PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
+                     "The shape of Input(End) should be [1].");
+    }
+    if (ctx->HasInput("Step")) {
+      auto step_dims = ctx->GetInputDim("Step");
+      PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
+                     "The shape of Input(Step) should be [1].");
+    }
+    ctx->SetOutputDim("Out", {-1});
+  }
+};
+
+class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Start",
+             "Start of interval. The interval includes this value. It is a "
+             "tensor with shape=[1].");
+    AddInput("End",
+             "End of interval. The interval does not include this value, "
+             "except in some cases where step is not an integer and floating "
+             "point round-off affects the length of out. It is a tensor with "
+             "shape=[1].");
+    AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, end) (in other words, the interval including start but excluding end). Similar to the arange function of numpy.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
+REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
+                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
+                       ops::CPURangeKernel<int64_t>);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
new file mode 100644
index 0000000000..e2c03716d5
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cu
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/range_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                                  \
+  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += static_cast<int64_t>(blockDim.x) * gridDim.x)
+// 64-bit loop index: the stride step cannot wrap when size is very large.
+template <typename T>
+__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+class CUDARangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* start_t = context.Input<framework::Tensor>("Start");
+    auto* end_t = context.Input<framework::Tensor>("End");
+    auto* step_t = context.Input<framework::Tensor>("Step");
+    auto* out = context.Output<framework::Tensor>("Out");
+    // Blocking copies: each scalar is read on the host right after its copy.
+    framework::Tensor n;
+    framework::TensorCopySync(*start_t, platform::CPUPlace(), &n);
+    T start = n.data<T>()[0];
+    framework::TensorCopySync(*end_t, platform::CPUPlace(), &n);
+    T end = n.data<T>()[0];
+    framework::TensorCopySync(*step_t, platform::CPUPlace(), &n);
+    T step = n.data<T>()[0];
+    // The output length depends on the scalar values, so size it here.
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    auto stream = context.cuda_device_context().stream();
+    int block = 512;
+    int grid = (size + block - 1) / block;
+    RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
+                        ops::CUDARangeKernel<int64_t>,
+                        ops::CUDARangeKernel<float>,
+                        ops::CUDARangeKernel<double>);
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
new file mode 100644
index 0000000000..fce58b45c9
--- /dev/null
+++ b/paddle/fluid/operators/range_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+void GetSize(T start, T end, T step, int64_t* size) {
+  PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
+                 "The step of range op should not be 0.");
+  PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
+                 "The step should be greater than 0 while start < end. And the "
+                 "step should be less than 0 while start > end.");
+  *size = std::is_integral<T>::value
+              ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
+              : std::ceil(std::abs((end - start) / step));
+}
+
+template <typename T>
+class CPURangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T end = context.Input<framework::Tensor>("End")->data<T>()[0];
+    T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    // Derive each element from its index, as the CUDA RangeKernel does, so
+    // that floating-point rounding error does not accumulate per iteration.
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = start + step * static_cast<T>(i);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cb97398698..a18e5b6a9c 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
-    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
-    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
-    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
+    'create_tensor',
+    'create_parameter',
+    'create_global_var',
+    'cast',
+    'tensor_array_to_tensor',
+    'concat',
+    'sums',
+    'assign',
+    'fill_constant_batch_size_like',
+    'fill_constant',
+    'argmin',
+    'argmax',
+    'argsort',
+    'ones',
+    'zeros',
+    'reverse',
+    'has_inf',
+    'has_nan',
+    'isfinite',
+    'range',
 ]
 
 
@@ -764,3 +780,50 @@ def isfinite(x):
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
     return out
+
+
+def range(start, end, step, dtype):
+    """
+    Return evenly spaced values within a given interval.
+
+    Values are generated within the half-open interval [start, stop) (in other words,
+    the interval including start but excluding stop).
+
+    args:
+        start(int|float|Variable): Start of interval. The interval includes this value.
+        end(int|float|Variable): End of interval. The interval does not include this
+                                 value, except in some cases where step is not an integer
+                                 and floating point round-off affects the length of out. 
+        step(int|float|Variable): Spacing between values. For any output out, this is the
+                                  distance between two adjacent values, out[i+1] - out[i].
+                                  The default step size is 1.
+        dtype(string): 'float32'|'int32'|..., the data type of the output tensor.
+
+    returns:
+        Evenly spaced values within a given interval.
+
+    examples:
+
+        .. code-block:: python
+
+             data = fluid.layers.range(0, 10, 2, 'int32')
+
+    """
+    helper = LayerHelper("range", **locals())
+    # Non-Variable arguments become shape-[1] constants of `dtype`.
+    if not isinstance(start, Variable):
+        start = fill_constant([1], dtype, start)
+    if not isinstance(end, Variable):
+        end = fill_constant([1], dtype, end)
+    if not isinstance(step, Variable):
+        step = fill_constant([1], dtype, step)
+    # Output dtype follows `start`; Variable inputs are not cast to `dtype`.
+    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+
+    helper.append_op(
+        type='range',
+        inputs={'Start': start,
+                'End': end,
+                'Step': step},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 885ee170e8..1672c3600f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1240,6 +1240,14 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_range(self):
+        program = Program()
+        with program_guard(program):
+            layers.range(0, 10, 2, 'int32')
+            layers.range(0.1, 10.0, 0.2, 'float32')
+
+        print(str(program))
+
     def test_spectral_norm(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py
new file mode 100644
index 0000000000..f129ae78cb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_range.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRangeOp(OpTest):
+    def setUp(self):
+        self.op_type = "range"
+        self.init_config()
+        self.inputs = {
+            'Start': np.array([self.case[0]]).astype(self.dtype),
+            'End': np.array([self.case[1]]).astype(self.dtype),
+            'Step': np.array([self.case[2]]).astype(self.dtype)
+        }
+        # Expected output: numpy.arange over the same case, cast to dtype.
+        self.outputs = {
+            'Out': np.arange(self.case[0], self.case[1],
+                             self.case[2]).astype(self.dtype)
+        }
+
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 1, 0.2)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFloatRangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 5, 1)
+
+
+class TestInt32RangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (0, 5, 2)
+
+
+class TestInt32RangeOpCase1(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (10, 1, -2)
+
+
+class TestInt32RangeOpCase2(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (-1, -10, -2)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 2e5831f0dc4d1c63d738001bdb7f51838488a7d3 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Sat, 23 Mar 2019 11:54:56 +0800
Subject: [PATCH 09/11] [slim] Refine framework of slim and add filter pruning
 strategy (#16226)

* First pr of paddle slim.
1. Add framework of paddle slim
2. Add filter pruning strategy
test=develop

* Rename unitest to tests.
test=develop

* Add prettytable into requirements.
test=develop

* Change in_nodes and out_nodes to ordered dict.
test=develop

* Remove distillation.
test=develop

* Fix API.spec
test=develop

* Fix unitest.
test=develop

* Fix unitest in windows.
test=develop

* Fix unitest in windows.
test=develop

* Fix unitest.
test=develop

* Hide some functions.
test=develop

* Fix python import in python3.5
test=develop

* Fix compress pass.
test=develop

* Fix unitest of test_dist_ctr.
test=develop

* Enhance flops.

* use os.path.join

* Fix pickle for python3
Fix log and comments.
test=develop

* 1. Remove feed_reader in compress pass
2. Fix cache reader
3. Rename CompressPass to Compressor
4. Add comments for distiller optimizer
5. Remove unused pruner currently
6. Add some comments.
7. Change API.spec
test=develop

* Fix pruning in python3.
test=develop

* Fix unitest in python3.
test=develop

* Fix format in python3.
test=develop
---
 paddle/fluid/API.spec                         |  20 +-
 python/paddle/fluid/contrib/slim/__init__.py  |  11 +-
 .../fluid/contrib/slim/core/__init__.py       |   8 +-
 .../fluid/contrib/slim/core/compress_pass.py  | 129 ---
 .../fluid/contrib/slim/core/compressor.py     | 481 +++++++++
 .../paddle/fluid/contrib/slim/core/config.py  |  36 +-
 .../fluid/contrib/slim/core/pass_builder.py   |  39 -
 .../fluid/contrib/slim/core/strategy.py       |   6 +-
 .../slim/demo/filter_prune/config.yaml        |  28 -
 .../contrib/slim/demo/filter_prune/demo.py    |  69 --
 .../fluid/contrib/slim/graph/__init__.py      |   9 +-
 .../fluid/contrib/slim/graph/executor.py      |  67 +-
 .../paddle/fluid/contrib/slim/graph/graph.py  |  49 -
 .../fluid/contrib/slim/graph/graph_pass.py    |  42 -
 .../fluid/contrib/slim/graph/graph_wrapper.py | 500 ++++++++++
 .../contrib/slim/prune/prune_strategy.py      | 935 +++++++++++++++++-
 .../paddle/fluid/contrib/slim/prune/pruner.py | 104 +-
 .../contrib/slim/tests/configs/config.yaml    |  29 -
 .../slim/tests/configs/filter_pruning.yaml    |  34 +
 .../contrib/slim/tests/configs/pruners.yaml   |  12 -
 .../contrib/slim/tests/configs/pruners_0.yaml |  12 -
 .../slim/tests/filter_pruning/__init__.py     |  13 +
 .../slim/tests/filter_pruning/compress.yaml   |  34 +
 .../slim/tests/filter_pruning/mobilenet.py    | 210 ++++
 .../fluid/contrib/slim/tests/test_factory.py  |  28 +-
 .../contrib/slim/tests/test_filter_pruning.py |  89 ++
 .../contrib/slim/tests/test_graph_wrapper.py  | 140 +++
 python/requirements.txt                       |   1 +
 28 files changed, 2545 insertions(+), 590 deletions(-)
 delete mode 100644 python/paddle/fluid/contrib/slim/core/compress_pass.py
 create mode 100644 python/paddle/fluid/contrib/slim/core/compressor.py
 delete mode 100644 python/paddle/fluid/contrib/slim/core/pass_builder.py
 delete mode 100644 python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
 delete mode 100644 python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
 delete mode 100644 python/paddle/fluid/contrib/slim/graph/graph.py
 delete mode 100644 python/paddle/fluid/contrib/slim/graph/graph_pass.py
 create mode 100644 python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
 delete mode 100644 python/paddle/fluid/contrib/slim/tests/configs/config.yaml
 create mode 100644 python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
 delete mode 100644 python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
 delete mode 100644 python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
 create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py
 create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
 create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py
 create mode 100644 python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
 create mode 100644 python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6c61a3d63d..70a4d7b40b 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -377,23 +377,9 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args',
 paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
 paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
-paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0'))
-paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6'))
-paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
-paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
+paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80'))
+paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
+paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
 paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
 paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
 paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py
index 22dbf7c8b6..4a71fab6d0 100644
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ b/python/paddle/fluid/contrib/slim/__init__.py
@@ -13,13 +13,4 @@
 # limitations under the License.
 
 from .core import *
-from .graph import *
-from .prune import *
-__all__ = [
-    'build_compressor',
-    'CompressPass',
-    'ImitationGraph',
-    'SensitivePruneStrategy',
-    'MagnitudePruner',
-    'RatioPruner',
-]
+__all__ = ['Compressor', ]
diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py
index 7826d5830a..831bd70ecc 100644
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
@@ -14,11 +14,9 @@
 
 from . import config
 from .config import *
-from . import compress_pass
-from .compress_pass import *
+from . import compressor
+from .compressor import *
 from . import strategy
 from .strategy import *
-from . import pass_builder
-from .pass_builder import *
 
-__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
+__all__ = config.__all__ + compressor.__all__ + strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py
deleted file mode 100644
index c4c348b878..0000000000
--- a/python/paddle/fluid/contrib/slim/core/compress_pass.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ....core import CPUPlace
-from ..graph import get_executor
-
-__all__ = ['Context', 'CompressPass']
-
-
-class Context(object):
-    """
-    The context in the process of compression.
-    Args:
-        exe: The executor used to execute graph.
-        graph: The graph to be compressed.
-        scope: The scope used to execute graph.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self, exe, graph, scope, program_exe=None):
-        # The total number of epoches to be trained.
-        self.epoch = 0
-        # Current epoch
-        self.epoch_id = 0
-        # Current batch
-        self.batch_id = 0
-        self.exe = exe
-        self.graph = graph
-        self.scope = scope
-        self.program_exe = program_exe
-
-
-class CompressPass(object):
-    """
-    The pass used to compress model.
-    Args:
-        place: The device used in compression.
-        data_reader: The data_reader used to run graph.
-        data_feeder: The data_feeder used to run graph.
-        scope: The scope used to run graph.
-        metrics: The metrics for evaluating model.
-        epoch: The total epoches of trainning in compression.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self,
-                 place=None,
-                 data_reader=None,
-                 data_feeder=None,
-                 scope=None,
-                 metrics=None,
-                 epoch=None,
-                 program_exe=None):
-        self.strategies = []
-        self.place = CPUPlace() if place is None else place
-        self.data_reader = data_reader
-        self.data_feeder = data_feeder
-        self.scope = scope
-        self.metrics = metrics
-        self.epoch = epoch
-        self.program_exe = program_exe
-
-    def add_strategy(self, strategy):
-        """
-        Add a strategy to current compress pass.
-        Args:
-            strategy: The strategy to be added into current compress pass.
-        """
-        self.strategies.append(strategy)
-        self.epoch = max(strategy.end_epoch, self.epoch)
-
-    def apply(self, graph):
-        """
-        Compress a model.
-        Args:
-            graph: The target graph to be compressed.
-        """
-        self.executor = get_executor(graph, self.place)
-        context = Context(
-            self.executor, graph, self.scope, program_exe=self.program_exe)
-
-        for strategy in self.strategies:
-            strategy.on_compress_begin(context)
-
-        for epoch in range(self.epoch):
-
-            for strategy in self.strategies:
-                strategy.on_epoch_begin(context)
-
-            for data in self.data_reader():
-
-                for strategy in self.strategies:
-                    strategy.on_batch_begin(context)
-                fetches = None
-                if self.metrics:
-                    fetches = self.metrics.values()
-                feed = None
-                if self.data_feeder:
-                    feed = self.data_feeder.feed(data)
-                results = self.executor.run(graph,
-                                            fetches=fetches,
-                                            scope=self.scope,
-                                            feed=feed)
-                if results:
-                    print("results: {}".format(
-                        zip(self.metrics.keys(), results)))
-                for strategy in self.strategies:
-                    strategy.on_batch_end(context)
-                context.batch_id += 1
-
-            for strategy in self.strategies:
-                strategy.on_epoch_end(context)
-            context.epoch_id += 1
-
-        for strategy in self.strategies:
-            strategy.on_compress_end(context)
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py
new file mode 100644
index 0000000000..832ade497c
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/compressor.py
@@ -0,0 +1,481 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core import CPUPlace
+from .... import compiler
+from .... import io
+from .... import profiler
+from .... import scope_guard
+from ....data_feeder import DataFeeder
+from ..graph import *
+from .config import ConfigFactory
+import numpy as np
+from collections import Iterable
+import time
+import os
+import logging
+import sys
+import pickle
+import functools
+
+__all__ = ['Context', 'Compressor']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+def cached_reader(reader, sampled_rate, cache_path, cached_id):
+    """
+    Wrap reader into a reader that samples partial data and caches them into local file system.
+    Args:
+        reader: Iterative data source; it is called without arguments to get the batches.
+        sampled_rate(float): The probability of keeping each batch; the first batch is always kept.
+        cache_path(str): The root path to cache the sampled data; a sub-directory named by cached_id is used.
+        cached_id(int): The id of dataset sampled; it names the cache sub-directory and seeds numpy's global RNG.
+    """
+    np.random.seed(cached_id)
+    cache_path = os.path.join(cache_path, str(cached_id))
+    _logger.debug('read data from: {}'.format(cache_path))
+
+    def s_reader():
+        if os.path.isdir(cache_path):
+            with open(os.path.join(cache_path, "list")) as list_file:
+                for file_name in list_file:
+                    yield np.load(os.path.join(cache_path, file_name.strip()))
+        else:
+            os.makedirs(cache_path)
+            batch = 0
+            with open(os.path.join(cache_path, "list"), 'w') as list_file:
+                for data in reader():
+                    if batch == 0 or (np.random.uniform() < sampled_rate):
+                        name = 'batch' + str(batch)
+                        np.save(os.path.join(cache_path, name), data)
+                        list_file.write(name + '.npy\n')
+                        batch += 1
+                        yield data
+
+    return s_reader
+
+
+class Context(object):
+    """
+    The context in the process of compression.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_graph=None,
+                 train_reader=None,
+                 eval_graph=None,
+                 eval_reader=None,
+                 teacher_graphs=None,
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place: The device place where the compression job running.
+            scope: The scope used in compression job.
+            train_graph: The graph with loss as output node.
+            eval_graph: The graph used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            teacher_graphs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies.
+        """
+        # The total number of epochs to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+
+        self.k_v = {}
+
+        self.place = place
+        self.scope = scope
+        self.train_graph = train_graph
+        self.train_reader = train_reader
+        self.eval_graph = eval_graph
+        self.eval_reader = eval_reader
+        self.executor = None
+        self.teacher_graphs = teacher_graphs
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.optimize_graph = None
+        self.cache_path = './eval_cache'
+        self.eval_results = {}
+
+    def to_file(self, file_name):
+        """
+        Save the context into file.
+        """
+        data = {}
+        data['epoch_id'] = self.epoch_id
+        data['eval_results'] = self.eval_results
+        with open(file_name, 'wb') as context_file:
+            pickle.dump(data, context_file)
+
+    def from_file(self, file_name):
+        """
+        Load the context (epoch_id and eval_results) from a file written by to_file.
+        """
+        with open(file_name, 'rb') as context_file:  # pickle data is binary
+            if sys.version_info < (3, 0):
+                data = pickle.load(context_file)
+            else:
+                data = pickle.load(context_file, encoding='bytes')
+            self.epoch_id = data['epoch_id']
+            self.eval_results = data['eval_results']
+
+    def eval_converged(self, metric_name, delta=0.001):
+        """
+        Check whether the training has been converged.
+        Args:
+            metric_name(str): The metric used to check convergence.
+            delta(float): '|metric[k] - metric[k-1]| < delta * |metric[k-1]|'
+                          means that the training has been converged.
+        Returns:
+            bool: True means the training has been converged.
+        """
+        # TODO(wanghaoshuang@baidu.com): enhance this method.
+        if (metric_name not in self.eval_results
+            ) or len(self.eval_results[metric_name]) < 2:
+            return False
+        results = self.eval_results[metric_name][-2:]
+        _logger.info('Latest evaluations: {}'.format(results))
+        return abs(results[1] - results[0]) < delta * abs(results[0])
+
+    def run_eval_graph(self, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current model in context.
+        Args:
+            sampled_rate(float): The sampled rate used to sample partial data
+            for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same
+                            cached_id use the same sampled dataset. default: 0.
+        """
+        _logger.info('Running evaluation')
+        assert self.eval_graph is not None
+        assert self.eval_reader is not None
+        eval_graph = self.eval_graph.clone(for_test=True)
+
+        executor = SlimGraphExecutor(self.place)
+        results = []
+        batch_id = 0
+        s_time = time.time()
+        reader = self.eval_reader
+        if sampled_rate:
+            reader = cached_reader(reader, sampled_rate, self.cache_path,
+                                   cached_id)
+        for data in reader():
+            result = executor.run(eval_graph, self.scope, data=data)
+            result = [np.mean(r) for r in result]
+            results.append(result)
+            if batch_id % 20 == 0:
+                _logger.info("batch-{}; {}={}".format(
+                    batch_id, eval_graph.out_nodes.keys(), result))
+            batch_id += 1
+        result = np.mean(np.array(results), axis=0)
+        _logger.info("Final eval result: {}={}".format(
+            eval_graph.out_nodes.keys(), result))
+        if not isinstance(result, Iterable):
+            result = [result]
+        _logger.info('Finish evaluation')
+        return result, eval_graph.out_nodes.keys()
+
+    def put(self, key, value):
+        self.k_v[key] = value
+
+    def get(self, key):
+        return self.k_v.get(key)
+
+
+class Compressor(object):
+    """
+    The pass used to compress model.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_program,
+                 train_reader=None,
+                 train_feed_list=None,
+                 train_fetch_list=None,
+                 eval_program=None,
+                 eval_reader=None,
+                 eval_feed_list=None,
+                 eval_fetch_list=None,
+                 teacher_programs=[],
+                 checkpoint_path='./checkpoints',
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place(fluid.Place): The device place where the compression job running.
+            scope(fluid.core.Scope): The scope used to run graph.
+            train_program(Program): The main program to be compressed. It must have loss op.
+            train_reader: The data reader used for training.
+            train_feed_list(list<tuple>): A list of (key, value) tuples to indicate the input variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            train_fetch_list(dict): A dict to indicate the output variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_program(Program): The program used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            eval_feed_list(list<tuple>): A list of (key, value) tuples to indicate the input variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            teacher_programs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy,
+                                 this optimizer is used to minimize the combined loss of student-net and
+                                 teacher-net while train_optimizer is used to minimize loss of
+                                 student-net in fine-tune stage. 
+
+        """
+        assert isinstance(
+            train_feed_list, list
+        ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        assert isinstance(
+            eval_feed_list, list
+        ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        self.strategies = []
+        self.epoch = 0
+        self.place = CPUPlace() if place is None else place
+        self.scope = scope
+        self.train_graph = GraphWrapper(
+            train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list)
+        self.eval_graph = GraphWrapper(
+            eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list)
+        self.train_reader = train_reader
+        self.eval_reader = eval_reader
+        self.teacher_graphs = []
+        for teacher in teacher_programs:
+            self.teacher_graphs.append(ImitationGraph(teacher, scope=scope))
+
+        self.checkpoint = None
+        self.checkpoint_path = checkpoint_path
+        self.eval_epoch = 1
+
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.init_model = None
+
+    def _add_strategy(self, strategy):
+        """
+        Add a strategy to current compress pass.
+        Args:
+            strategy: The strategy to be added into current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+
+    def config(self, config_file):
+        """
+        Configure the compress pass from file with yaml format.
+        Args:
+            config_file(str): The config file in local file system.
+        """
+        factory = ConfigFactory(config_file)
+        self.epoch = factory.compressor['epoch']
+        for strategy in factory.compressor['strategies']:
+            self._add_strategy(strategy)
+        if 'checkpoint_path' in factory.compressor:
+            self.checkpoint_path = factory.compressor['checkpoint_path']
+
+        if 'init_model' in factory.compressor:
+            self.init_model = factory.compressor['init_model']
+
+    def _init_model(self, context):
+        """
+        Load model that has been compressed. 
+        """
+        if self.init_model and os.path.exists(self.init_model):
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.train_graph.load_persistables(self.init_model, exe)
+            flops = context.eval_graph.flops()
+            conv_flops = context.eval_graph.flops(only_conv=True)
+            context.eval_graph.update_param_shape(context.scope)
+            context.eval_graph.update_groups_of_conv()
+            _logger.info("conv flops: -{}".format(1 - float(
+                context.eval_graph.flops(only_conv=True)) / conv_flops))
+            _logger.info("total flops: -{}".format(1 - float(
+                context.eval_graph.flops()) / flops))
+            context.train_graph.update_param_shape(context.scope)
+            context.train_graph.update_groups_of_conv()
+            context.train_graph.infer_shape()
+            _logger.info("Init model from: {}".format(self.init_model))
+
+    def _load_checkpoint(self, context):
+        """
+        Restore context, strategies and params from the latest checkpoint; returns (context, strategies).
+        """
+        _logger.debug('_load_checkpoint')
+        strategies = self.strategies
+        if self.checkpoint_path:
+            if not os.path.exists(self.checkpoint_path):
+                _logger.warning("Checkpints path doesn't exist: [{}]".format(
+                    self.checkpoint_path))
+                return context, strategies
+            checkpoints = [  # sub-directories named by epoch id only
+                ck for ck in os.listdir(self.checkpoint_path) if ck.isdigit()
+                and os.path.isdir(os.path.join(self.checkpoint_path, ck))
+            ]
+            _logger.debug('self.checkpoint_path: {}'.format(
+                self.checkpoint_path))
+            _logger.info('checkpoints: {}'.format(checkpoints))
+            if len(checkpoints) > 0:
+                latest = max([int(ck) for ck in checkpoints])
+                latest_ck_path = os.path.join(self.checkpoint_path, str(latest))
+
+                model_path = os.path.join(latest_ck_path, 'model')
+                context_path = os.path.join(latest_ck_path, 'context')
+                strategy_path = os.path.join(latest_ck_path, 'strategies')
+                if os.path.exists(context_path):
+                    context.from_file(context_path)
+                    context.epoch_id += 1
+                if os.path.exists(strategy_path):
+                    with open(strategy_path, 'rb') as strategy_file:
+                        if sys.version_info < (3, 0):
+                            strategies = pickle.load(strategy_file)
+                        else:
+                            strategies = pickle.load(
+                                strategy_file, encoding='bytes')
+
+                if os.path.exists(model_path):
+                    exe = SlimGraphExecutor(context.place)
+                    with scope_guard(context.scope):
+                        context.optimize_graph.load_persistables(model_path,
+                                                                 exe)
+                    context.optimize_graph.update_param_shape(context.scope)
+                    context.optimize_graph.update_groups_of_conv()
+                    context.eval_graph.update_param_shape(context.scope)
+                    context.eval_graph.update_groups_of_conv()
+                    _logger.info("Loaded params from: {}".format(model_path))
+        return context, strategies
+
+    def _save_checkpoint(self, context):
+        """
+        Save checkpoints to file.
+        """
+        if context.epoch_id % 1 == 0 and self.checkpoint_path:
+            checkpoint_path = os.path.join(self.checkpoint_path,
+                                           str(context.epoch_id))
+            model_path = os.path.join(checkpoint_path, 'model')
+            context_path = os.path.join(checkpoint_path, 'context')
+            strategy_path = os.path.join(checkpoint_path, 'strategies')
+            if not os.path.isdir(model_path):
+                os.makedirs(model_path)
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.optimize_graph.save_persistables(model_path, exe)
+            context.to_file(context_path)
+            with open(strategy_path, 'wb') as strategy_file:
+                pickle.dump(self.strategies, strategy_file)
+            _logger.info('Saved checkpoint to: {}'.format(checkpoint_path))
+
+    def _train_one_epoch(self, context):
+        """
+        Train one epoch.
+        """
+
+        executor = SlimGraphExecutor(self.place)
+
+        if context.optimize_graph.compiled_graph is None:
+            context.optimize_graph.compiled_graph = compiler.CompiledProgram(
+                context.optimize_graph.program).with_data_parallel(
+                    loss_name=context.optimize_graph.out_nodes['loss'])
+
+        for data in context.train_reader():
+            for strategy in self.strategies:
+                strategy.on_batch_begin(context)
+            results = executor.run(context.optimize_graph,
+                                   context.scope,
+                                   data=data)
+            results = [float(np.mean(result)) for result in results]
+            if context.batch_id % 20 == 0:
+                _logger.info("epoch:{}; batch_id:{}; {} = {}".format(
+                    context.epoch_id, context.batch_id,
+                    context.optimize_graph.out_nodes.keys(
+                    ), [round(r, 3) for r in results]))
+            for strategy in self.strategies:
+                strategy.on_batch_end(context)
+            context.batch_id += 1
+        context.batch_id = 0
+
+    def _eval(self, context):
+        """
+        Running evaluation.
+        """
+        results, names = context.run_eval_graph()
+        for name, result in zip(names, results):
+            if name not in context.eval_results:
+                context.eval_results[name] = []
+            context.eval_results[name].append(result)
+
+    def run(self):
+        """
+        Execute compression pass.
+        """
+        context = Context(
+            place=self.place,
+            scope=self.scope,
+            train_graph=self.train_graph,
+            train_reader=self.train_reader,
+            eval_graph=self.eval_graph,
+            eval_reader=self.eval_reader,
+            teacher_graphs=self.teacher_graphs,
+            train_optimizer=self.train_optimizer,
+            distiller_optimizer=self.distiller_optimizer)
+        self.context = context
+        if self.teacher_graphs:
+            context.put('teachers', self.teacher_graphs)
+        self._init_model(context)
+        if not context.optimize_graph:
+            if context.train_optimizer:
+                context.train_optimizer._name = 'train_opt'
+                context.optimize_graph = context.train_graph.get_optimize_graph(
+                    context.train_optimizer, context.place, context.scope)
+            else:
+                context.optimize_graph = context.train_graph
+
+        context, self.strategies = self._load_checkpoint(context)
+
+        for strategy in self.strategies:
+            strategy.on_compression_begin(context)
+        start = context.epoch_id
+        self._eval(context)
+        for epoch in range(start, self.epoch):
+            context.epoch_id = epoch
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+            self._train_one_epoch(context)
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            if self.eval_epoch and epoch % self.eval_epoch == 0:
+                self._eval(context)
+            self._save_checkpoint(context)
+        for strategy in self.strategies:
+            strategy.on_compression_end(context)
+        return context.eval_graph
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
index 811c457003..12df9fcd1b 100644
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ b/python/paddle/fluid/contrib/slim/core/config.py
@@ -17,7 +17,7 @@ import funcsigs
 import yaml
 from collections import OrderedDict
 from ..prune import *
-from .compress_pass import *
+from ..quantization import *
 from .strategy import *
 
 __all__ = ['ConfigFactory']
@@ -29,15 +29,10 @@ class ConfigFactory(object):
     def __init__(self, config):
         """Init a factory from configure file."""
         self.instances = {}
+        self.compressor = {}
         self.version = None
         self._parse_config(config)
 
-    def get_compress_pass(self):
-        """
-        Get compress pass from factory.
-        """
-        return self.instance('compress_pass')
-
     def instance(self, name):
         """
         Get instance from factory.
@@ -59,8 +54,16 @@ class ConfigFactory(object):
             args = {}
             for key in keys:
                 value = attrs[key]
+                if isinstance(value, str) and value.lower() == 'none':
+                    value = None
                 if isinstance(value, str) and value in self.instances:
                     value = self.instances[value]
+                if isinstance(value, list):
+                    for i in range(len(value)):
+                        if isinstance(value[i],
+                                      str) and value[i] in self.instances:
+                            value[i] = self.instances[value[i]]
+
                 args[key] = value
             self.instances[name] = class_(**args)
         return self.instances.get(name)
@@ -76,16 +79,23 @@ class ConfigFactory(object):
                     assert self.version == int(key_values['version'])
 
                 # parse pruners
-                if key == 'pruners' or key == 'strategies':
+                if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies':
                     instances = key_values[key]
                     for name in instances:
                         self._new_instance(name, instances[name])
 
-                if key == 'compress_pass':
-                    compress_pass = self._new_instance(key, key_values[key])
-                    for name in key_values[key]['strategies']:
-                        strategy = self.instance(name)
-                        compress_pass.add_strategy(strategy)
+                if key == 'compressor':
+                    self.compressor['strategies'] = []
+                    self.compressor['epoch'] = key_values[key]['epoch']
+                    # 'init_model' and 'checkpoint_path' are both optional:
+                    # copy them only when the config file provides them.
+                    for opt in ('init_model', 'checkpoint_path'):
+                        if opt in key_values[key]:
+                            self.compressor[opt] = key_values[key][opt]
+                    if 'strategies' in key_values[key]:
+                        for name in key_values[key]['strategies']:
+                            strategy = self.instance(name)
+                            self.compressor['strategies'].append(strategy)
 
                 if key == 'include':
                     for config_file in key_values[key]:
diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py
deleted file mode 100644
index fc1ddc94e0..0000000000
--- a/python/paddle/fluid/contrib/slim/core/pass_builder.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .compress_pass import CompressPass
-from .config import ConfigFactory
-
-__all__ = ['build_compressor']
-
-
-def build_compressor(place=None,
-                     data_reader=None,
-                     data_feeder=None,
-                     scope=None,
-                     metrics=None,
-                     epoch=None,
-                     config=None):
-    if config is not None:
-        factory = ConfigFactory(config)
-        comp_pass = factory.get_compress_pass()
-    else:
-        comp_pass = CompressPass()
-    comp_pass.place = place
-    comp_pass.data_reader = data_reader
-    comp_pass.data_feeder = data_feeder
-    comp_pass.scope = scope
-    comp_pass.metrics = metrics
-    comp_pass.epoch = epoch
-    return comp_pass
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
index 74d98e98b0..28bf24f4e3 100644
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
@@ -20,7 +20,7 @@ class Strategy(object):
     Base class for all strategies.
     """
 
-    def __init__(self, start_epoch=0, end_epoch=10):
+    def __init__(self, start_epoch=0, end_epoch=0):
         """
         Args:
             start_epoch: The first epoch to apply the strategy.
@@ -29,7 +29,7 @@ class Strategy(object):
         self.start_epoch = start_epoch
         self.end_epoch = end_epoch
 
-    def on_compress_begin(self, context):
+    def on_compression_begin(self, context):
         pass
 
     def on_epoch_begin(self, context):
@@ -44,5 +44,5 @@ class Strategy(object):
     def on_batch_end(self, context):
         pass
 
-    def on_compress_end(self, context):
+    def on_compression_end(self, context):
         pass
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
deleted file mode 100644
index ea888fa2c7..0000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
deleted file mode 100644
index 21c59c0c9d..0000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle
-import os
-import sys
-from paddle.fluid.contrib.slim import CompressPass
-from paddle.fluid.contrib.slim import build_compressor
-from paddle.fluid.contrib.slim import ImitationGraph
-
-
-class LinearModel(object):
-    def __init__(slef):
-        pass
-
-    def train(self):
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        startup_program.random_seed = 10
-        with fluid.program_guard(train_program, startup_program):
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            predict = fluid.layers.fc(input=x, size=1, act=None)
-            cost = fluid.layers.square_error_cost(input=predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-            eval_program = train_program.clone()
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_cost)
-
-        train_reader = paddle.batch(
-            paddle.dataset.uci_housing.train(), batch_size=1)
-        eval_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=1)
-        place = fluid.CPUPlace()
-        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        train_metrics = {"loss": avg_cost.name}
-        eval_metrics = {"loss": avg_cost.name}
-
-        graph = ImitationGraph(train_program)
-        config = './config.yaml'
-        comp_pass = build_compressor(
-            place,
-            data_reader=train_reader,
-            data_feeder=train_feeder,
-            scope=fluid.global_scope(),
-            metrics=train_metrics,
-            epoch=1,
-            config=config)
-        comp_pass.apply(graph)
-
-
-if __name__ == "__main__":
-    model = LinearModel()
-    model.train()
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
index d65472d193..c5d1c4dbdf 100644
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
@@ -14,10 +14,7 @@
 
 from . import executor
 from .executor import *
-from . import graph
-from .graph import *
-from . import graph_pass
-from .graph_pass import *
+from . import graph_wrapper
+from .graph_wrapper import *
 __all__ = executor.__all__
-__all__ += graph.__all__
-__all__ += graph_pass.__all__
+__all__ += graph_wrapper.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
index c02c3af820..70438a90eb 100644
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -12,51 +12,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import abc
-from abc import abstractmethod
+from ....compiler import CompiledProgram
+from ....data_feeder import DataFeeder
 from .... import executor
-from .graph import IRGraph, ImitationGraph
+from .graph_wrapper import GraphWrapper
 
-__all__ = ['get_executor']
+__all__ = ['SlimGraphExecutor']
 
 
-class GraphExecutor(object):
-    __metaclass__ = abc.ABCMeta
+class SlimGraphExecutor(object):
+    """
+    Wrapper of executor used to run GraphWrapper.
+    """
 
     def __init__(self, place):
-        self.place = place
-
-    @abstractmethod
-    def run(self, graph, feches=None, feed=None):
-        pass
-
-
-class IRGraphExecutor(GraphExecutor):
-    def run(self, grah, fetches, feed=None):
-        pass
-
-
-class ImitationGraphExecutor(GraphExecutor):
-    def __init__(self, place):
-        super(ImitationGraphExecutor, self).__init__(place)
         self.exe = executor.Executor(place)
+        self.place = place
 
-    def run(self, graph, scope=None, fetches=None, feed=None):
-        assert isinstance(graph, ImitationGraph)
-        fetch_list = None
-        if fetches:
-            fetch_list = [
-                graph.program.global_block().var(name) for name in fetches
-            ]
-        results = self.exe.run(graph.program,
+    def run(self, graph, scope, data=None):
+        """
+        Run a graph, optionally feeding it one batch of data.
+        Args:
+            graph(GraphWrapper): The graph to be executed.
+            scope(fluid.core.Scope): The scope to be used.
+            data(list<tuple>): A batch of data. Each tuple in this list is a sample.
+                               Items are fed to graph.in_nodes; None (default) feeds nothing.
+        Returns:
+            results(list): A list of result with the same order indicated by graph.out_nodes.
+        """
+        assert isinstance(graph, GraphWrapper)
+        feed = None  # must be bound even when no batch is given (data is None)
+        if data is not None:
+            feeder = DataFeeder(
+                feed_list=list(graph.in_nodes.values()),
+                place=self.place,
+                program=graph.program)
+            feed = feeder.feed(data)
+        fetch_list = list(graph.out_nodes.values())  # a real list, not a py3 dict view
+        program = graph.compiled_graph if graph.compiled_graph else graph.program
+        results = self.exe.run(program,
                                scope=scope,
                                fetch_list=fetch_list,
                                feed=feed)
         return results
-
-
-def get_executor(graph, place):
-    if isinstance(graph, ImitationGraph):
-        return ImitationGraphExecutor(place)
-    if isinstance(graph, IRGraph):
-        return IRGraphExecutor(place)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
deleted file mode 100644
index f38d978341..0000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import os
-import subprocess
-from ....framework import Program
-from ....framework import Block
-from .... import core
-
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
-
-
-class Graph(object):
-    """
-    Base class for all graph.
-    """
-
-    def __init__(self):
-        pass
-
-    def all_parameters(self):
-        """
-        Return all the parameters in current graph.
-        """
-        pass
-
-
-class ImitationGraph(Graph):
-    def __init__(self, program=None):
-        super(ImitationGraph, self).__init__()
-        self.program = Program() if program is None else program
-
-    def all_parameters(self):
-        return self.program.global_block().all_parameters()
-
-
-class IRGraph(Graph):
-    pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
deleted file mode 100644
index 1db6c4f110..0000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['GraphPass', 'PruneParameterPass']
-
-
-class GraphPass(object):
-    """
-    Base class for all graph pass.
-    """
-
-    def __init__(self):
-        pass
-
-    def apply(self, graph):
-        pass
-
-
-class PruneParameterPass(GraphPass):
-    """
-    Generate a graph for pruning parameters from target graph.
-    """
-
-    def __init__(self, pruned_params, thresholds):
-        super(PruneParameterPass, self).__init__()
-        self.pruned_params = pruned_params
-        self.thresholds = thresholds
-        self.default_threshold = thresholds['*']
-
-    def apply(self, graph):
-        pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
new file mode 100644
index 0000000000..8694be7827
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
@@ -0,0 +1,500 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from .... import io
+from .... import compiler
+from ....framework import Program
+from ....framework import program_guard
+from ....framework import Parameter
+from ....framework import Variable
+from ....executor import Executor
+import copy
+from collections import Iterable
+from ....io import save_inference_model, load_inference_model, save_persistables
+import numpy as np
+import pickle
+import os
+
+__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper']
+
+OPTIMIZER_OPS = [
+    'momentum',
+    'lars_momentum',
+    'adagrad',
+    'adam',
+    'adamax',
+    'decayed_adagrad',
+    'adadelta',
+    'rmsprop',
+]
+
+
+class VarWrapper(object):
+    def __init__(self, var, graph):
+        assert isinstance(var, Variable)
+        assert isinstance(graph, GraphWrapper)
+        self._var = var
+        self._graph = graph
+
+    def __eq__(self, v):
+        """
+        Compare two wrappers by variable name, so that `var in list_of_vars` works.
+        """
+        return self._var.name == v._var.name
+
+    def name(self):
+        """
+        Get the name of the variable.
+        """
+        return self._var.name
+
+    def shape(self):
+        """
+        Get the shape of the variable.
+        """
+        return self._var.shape
+
+    def set_shape(self, shape):
+        """
+        Set the shape of the variable (written to the underlying variable desc).
+        """
+        self._var.desc.set_shape(shape)
+
+    def inputs(self):
+        """
+        Get all the operators that list this variable among their inputs.
+        Returns:
+            list<OpWrapper>: Operators. NOTE(review): the name suggests producers; confirm intent.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_inputs():
+                ops.append(op)
+        return ops
+
+    def outputs(self):
+        """
+        Get all the operators that list this variable among their outputs.
+        Returns:
+            list<OpWrapper>: Operators. NOTE(review): the name suggests consumers; confirm intent.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_outputs():
+                ops.append(op)
+        return ops
+
+
+class OpWrapper(object):
+    def __init__(self, op, graph):
+        assert isinstance(graph, GraphWrapper)
+        self._op = op
+        self._graph = graph
+
+    def __eq__(self, op):
+        """
+        Compare two wrappers by operator idx, so that `op in list_of_ops` works.
+        """
+        return self.idx() == op.idx()
+
+    def all_inputs(self):
+        """
+        Get all the input variables of this operator, wrapped as VarWrapper.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.input_arg_names
+        ]
+
+    def all_outputs(self):
+        """
+        Get all the output variables of this operator, wrapped as VarWrapper.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.output_arg_names
+        ]
+
+    def idx(self):
+        """
+        Get the id of this operator.
+        """
+        return self._op.idx
+
+    def type(self):
+        """
+        Get the type of this operator.
+        """
+        return self._op.type
+
+    def is_bwd_op(self):
+        """
+        Whether this operator is backward op (its type ends with '_grad').
+        """
+        return self.type().endswith('_grad')
+
+    def is_opt_op(self):
+        """
+        Whether this operator is optimizer op (its type is listed in OPTIMIZER_OPS).
+        """
+        return self.type() in OPTIMIZER_OPS
+
+    def inputs(self, name):
+        """
+        Get all the variables bound to the input slot `name`.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.input(name)]
+
+    def outputs(self, name):
+        """
+        Get all the variables bound to the output slot `name`.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.output(name)]
+
+    def set_attr(self, key, value):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            key(str): the attribute name.
+            value(bool|int|str|float|list): the value of the attribute.
+        """
+        self._op._set_attr(key, value)
+
+    def attr(self, name):
+        """
+        Get the attribute by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
+            can be any valid attribute type.
+        """
+        return self._op.attr(name)
+
+
+class GraphWrapper(object):
+    """
+    It is a wrapper of a framework.Program with some special functions
+    for paddle slim framework.
+    """
+
+    def __init__(self, program=None, in_nodes=[], out_nodes=[]):
+        """
+        Args:
+            program(framework.Program): The program to wrap. A new empty Program is used if None.
+            in_nodes(dict): A dict to indicate the input nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+            out_nodes(dict): A dict to indicate the output nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+        """
+        super(GraphWrapper, self).__init__()
+        self.program = Program() if program is None else program
+        self.compiled_graph = None
+        self.in_nodes = OrderedDict(in_nodes)  # copies; the shared default is never mutated
+        self.out_nodes = OrderedDict(out_nodes)
+        self._attrs = OrderedDict()
+
+    def all_parameters(self):
+        """
+        Get all the parameters in this graph.
+        Returns:
+            list<VarWrapper>: A list of VarWrapper instances.
+        """
+        params = []
+        for block in self.program.blocks:
+            for param in block.all_parameters():
+                params.append(VarWrapper(param, self))
+        return params
+
+    def is_parameter(self, var):
+        """
+        Whether the given variable is parameter.
+        Args:
+            var(VarWrapper): The given variable.
+        """
+        return isinstance(var._var, Parameter)
+
+    def is_persistable(self, var):
+        """
+        Whether the given variable is persistable.
+        Args:
+            var(VarWrapper): The given variable.
+        """
+        return var._var.persistable
+
+    def compile(self, for_parallel=True, for_test=False):
+        """
+        Compile the program in this wrapper to framework.CompiledProgram for next running.
+        This function must be called if the program is modified.
+        Args:
+            for_parallel(bool): Whether the program to run in data parallel way. default: True.
+            for_test(bool): Whether compiling for test. If False, out_nodes must contain 'loss'.
+        """
+        target = self.program
+        if for_test:
+            loss = None
+        else:
+            loss = self.out_nodes['loss']
+        if for_parallel:
+            # disable memory optimize for stable training
+            build_strategy = compiler.BuildStrategy()
+            build_strategy.enable_inplace = False
+            build_strategy.memory_optimize = False
+            self.compiled_graph = compiler.CompiledProgram(
+                target).with_data_parallel(
+                    loss_name=loss, build_strategy=build_strategy)
+        else:
+            self.compiled_graph = compiler.CompiledProgram(target)
+
+    def ops(self):
+        """
+        Return all operators of every block in the program as a list of OpWrapper.
+        """
+        ops = []
+        for block in self.program.blocks:
+            for op in block.ops:
+                ops.append(OpWrapper(op, self))
+        return ops
+
+    def vars(self):
+        """
+        Get all the variables.
+        """
+        return [VarWrapper(var, self) for var in self.program.list_vars()]
+
+    def var(self, name):
+        """
+        Get the variable by variable name (looked up in the global block).
+        """
+        return VarWrapper(self.program.global_block().var(name), self)
+
+    def clone(self, for_test=False):
+        """
+        Clone a new graph from current graph.
+        Returns:
+            (GraphWrapper): The wrapper of a new graph.
+        """
+        return GraphWrapper(
+            self.program.clone(for_test),
+            copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes))
+
+    def merge(self, graph):
+        """
+        Merge a graph into current graph.
+        Args:
+            graph(GraphWrapper): The graph to be merged into current graph.
+        """
+        for var in graph.program.list_vars():
+            self.program.global_block()._clone_variable(var)
+            # TODO: parameters should be cloned
+        for op in graph.ops():
+            op = op._op
+            inputs = {}
+            outputs = {}
+            attrs = {}
+            for input_name in op.input_names:
+                inputs[input_name] = [
+                    self.var(in_var_name)._var
+                    for in_var_name in op.input(input_name)
+                ]
+            for output_name in op.output_names:
+                outputs[output_name] = [
+                    self.var(out_var_name)._var
+                    for out_var_name in op.output(output_name)
+                ]
+            for attr_name in op.attr_names:
+                attrs[attr_name] = op.attr(attr_name)
+            self.program.global_block().append_op(
+                type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+    def program(self):
+        """
+        NOTE: shadowed on instances by the `program` attribute set in __init__; use that.
+        """
+        return self.program
+
+    def pre_ops(self, op):
+        """
+        Get all the previous operators of target operator.
+        Args:
+            op(OpWrapper): Target operator.
+        Returns:
+            list<OpWrapper>: Operators; one entry per shared variable, so repeats are possible.
+        """
+        ops = []
+        for p in self.ops():
+            for in_var in op.all_inputs():
+                if in_var in p.all_outputs():
+                    ops.append(p)
+        return ops
+
+    def next_ops(self, op):
+        """
+        Get all the next operators of target operator.
+        Args:
+            op(OpWrapper): Target operator.
+        Returns:
+            list<OpWrapper>: Operators; one entry per shared variable, so repeats are possible.
+        """
+        ops = []
+        for p in self.ops():
+            for out_var in op.all_outputs():
+                if out_var in p.all_inputs():
+                    ops.append(p)
+        return ops
+
+    def get_param_by_op(self, op):
+        """
+        Get the parameters used by target operator (asserts that at least one exists).
+        """
+        assert isinstance(op, OpWrapper)
+        params = []
+        for var in op.all_inputs():
+            if isinstance(var._var, Parameter):
+                params.append(var)
+        assert len(params) > 0
+        return params
+
+    def numel_params(self):
+        """
+        Get the number of elements in all parameters.
+        """
+        ret = 0
+        for param in self.all_parameters():
+            ret += np.product(param.shape())
+        return ret
+
+    def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]):
+        """
+        Get a new graph for training by appending some backward operators and optimization operators.
+        Args:
+            optimizer: The optimizer used to generate training graph.
+            place: The place to run the graph.
+            scope: The scope used to run the graph. Some new variable will be added into this scope.
+            no_grad_var_names(list<str>): Names of variables that should be ignored while computing gradients. default: [].
+        Returns:
+            (GraphWrapper): The wrapper of new graph with backward ops and optimization ops.
+        """
+        graph = self.clone()
+        startup_program = Program()
+        with program_guard(
+                main_program=graph.program, startup_program=startup_program):
+            target_name = None  # stays None if out_nodes has neither 'loss' nor 'cost'
+            if 'loss' in graph.out_nodes:
+                target_name = graph.out_nodes['loss']
+            elif 'cost' in graph.out_nodes:
+                target_name = graph.out_nodes['cost']
+            target = graph.var(target_name)._var
+            optimizer.minimize(target, no_grad_set=no_grad_var_names)
+
+        exe = Executor(place)
+        exe.run(program=startup_program, scope=scope)
+        return graph
+
+    def flops(self, only_conv=False):
+        """
+        Get the flops of current graph.
+        Args:
+            only_conv: Only calculating the conv layers. default: False.
+        Returns:
+            int: The flops of current graph.
+        """
+        flops = 0
+        for op in self.ops():
+            if op.type() in ['conv2d', 'depthwise_conv2d']:
+                filter_shape = op.inputs("Filter")[0].shape()
+                input_shape = op.inputs("Input")[0].shape()
+                output_shape = op.outputs("Output")[0].shape()
+                c_out, c_in, k_h, k_w = filter_shape
+                _, _, h_out, w_out = output_shape
+                groups = op.attr("groups")
+                kernel_ops = k_h * k_w * (c_in / groups)
+                if len(op.inputs("Bias")) > 0:
+                    with_bias = 1
+                else:
+                    with_bias = 0
+                flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias)
+            elif op.type() == 'pool2d' and not only_conv:
+                input_shape = op.inputs("X")[0].shape()
+                output_shape = op.outputs("Out")[0].shape()
+                _, c_out, h_out, w_out = output_shape
+                k_size = op.attr("ksize")
+                flops += h_out * w_out * c_out * (k_size[0]**2)
+
+            elif op.type() == 'mul' and not only_conv:
+                x_shape = list(op.inputs("X")[0].shape())
+                y_shape = op.inputs("Y")[0].shape()
+                if x_shape[0] == -1:
+                    x_shape[0] = 1
+                flops += 2 * x_shape[0] * x_shape[1] * y_shape[1]
+
+            elif op.type() in ['relu', 'sigmoid', 'batch_norm'
+                               ] and not only_conv:
+                input_shape = list(op.inputs("X")[0].shape())
+                if input_shape[0] == -1:
+                    input_shape[0] = 1
+                flops += np.product(input_shape)
+
+        return flops
+
+    def save_persistables(self, path, exe):
+        """
+        Save all the persistable variables into file.
+        Args:
+            path(str): The path to save the persistables.
+            exe: A wrapper whose `exe` attribute is the executor used to save the persistables.
+        """
+        io.save_persistables(exe.exe, path, main_program=self.program)
+
+    def load_persistables(self, path, exe):
+        """
+        Load the persistable variables from file (only those that have a file under path).
+        Args:
+            path(str): The path to load the persistables.
+            exe: A wrapper whose `exe` attribute is the executor used to load the persistables.
+        """
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(path, var.name))
+
+        io.load_vars(
+            exe.exe, path, main_program=self.program, predicate=if_exist)
+
+    def update_param_shape(self, scope):
+        """
+        Update the shape of parameters in the graph according to tensors in scope.
+        It is used after loading pruned parameters from file.
+        """
+        for param in self.all_parameters():
+            tensor_shape = np.array(scope.find_var(param.name()).get_tensor(
+            )).shape
+            param.set_shape(tensor_shape)
+
+    def infer_shape(self):
+        """
+        Re-run shape inference on every operator except 'conditional_block'.
+        It is used after loading pruned parameters from file.
+        """
+        for op in self.ops():
+            if op.type() != 'conditional_block':
+                op._op.desc.infer_shape(op._op.block.desc)
+
+    def update_groups_of_conv(self):
+        for op in self.ops():
+            if op.type() == 'depthwise_conv2d':
+                op.set_attr('groups', op.inputs('Filter')[0].shape()[0])  # groups := Filter dim 0
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
index 34c5107daa..7a25c3a61e 100644
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -13,54 +13,919 @@
 # limitations under the License.
 
 from ..core.strategy import Strategy
-from ....framework import Program, program_guard
+from ..graph import VarWrapper, OpWrapper, GraphWrapper
+from ....framework import Program, program_guard, Parameter
 from .... import layers
+import prettytable as pt
 import numpy as np
+from scipy.optimize import leastsq
+import copy
+import re
+import os
+import pickle
+import logging
+import sys
 
-__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy']
 
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class PruneStrategy(Strategy):
+    """
+    The base class of all pruning strategies.
+    """
 
-class SensitivePruneStrategy(Strategy):
     def __init__(self,
                  pruner=None,
                  start_epoch=0,
-                 end_epoch=10,
-                 delta_rate=0.20,
-                 acc_loss_threshold=0.2,
-                 sensitivities=None):
-        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model.
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned.
+        """
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
         self.pruner = pruner
-        self.delta_rate = delta_rate
-        self.acc_loss_threshold = acc_loss_threshold
-        self.sensitivities = sensitivities
+        self.target_ratio = target_ratio
+        self.metric_name = metric_name
+        self.pruned_params = pruned_params
+        self.pruned_list = []  # indexed by axis in the _prune_* helpers; presumably refilled before use - TODO confirm
+        self.backup = {}  # param name -> copy of the tensor taken before a lazy prune
+        self.param_shape_backup = {}  # param name -> shape recorded before its first prune
 
+    def _eval_graph(self, context, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current model in context and return the mean of self.metric_name.
+        Args:
+            context(slim.core.Context): The context storing all information used to evaluate the current model.
+            sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
+        """
+        results, names = context.run_eval_graph(sampled_rate, cached_id)
+        metric = np.mean(results[list(names).index(self.metric_name)])  # pick the result by metric name
+        return metric
 
-class PruneStrategy(Strategy):
+    def _prune_filters_by_ratio(self,
+                                scope,
+                                params,
+                                ratio,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning filters by given ratio.
+        Args:
+            scope(fluid.core.Scope): The scope used to pruning filters.
+            params(list<VarWrapper>): Filter parameters; pruned indices are computed from params[0].
+            ratio(float): The ratio to be pruned.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means zeroing the pruned elements; False means cutting them down.
+            only_graph(bool): True means only the graph shapes change, not tensors in scope.
+        Returns:
+            The indices from pruner.cal_pruned_idx, or None if params[0] was already pruned on axis 0.
+        """
+        if params[0].name() in self.pruned_list[0]:
+            return
+        param_t = scope.find_var(params[0].name()).get_tensor()
+        pruned_idx = self.pruner.cal_pruned_idx(
+            params[0].name(), np.array(param_t), ratio, axis=0)
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[0] = pruned_param.shape[0]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()), str(0), str(ori_shape), str(param.shape())))
+            self.pruned_list[0].append(param.name())
+        return pruned_idx
+
+    def _prune_parameter_by_idx(self,
+                                scope,
+                                params,
+                                pruned_idx,
+                                pruned_axis,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning parameters in given axis; does nothing if params[0] was already pruned on that axis.
+        Args:
+            scope(fluid.core.Scope): The scope storing parameters to be pruned.
+            params(list<VarWrapper>): The parameters to be pruned.
+            pruned_idx(list): The index of elements to be pruned.
+            pruned_axis(int): The pruning axis.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        if params[0].name() in self.pruned_list[pruned_axis]:
+            return
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()),
+                str(pruned_axis), str(ori_shape), str(param.shape())))
+            self.pruned_list[pruned_axis].append(param.name())
+
+    def _forward_search_related_op(self, graph, param):
+        """
+        Forward search operators that will be affected by pruning of param.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            param(VarWrapper): The current pruned parameter.
+        Returns:
+            list<OpWrapper>: Operators in the order they were first visited.
+        """
+        assert isinstance(param, VarWrapper)
+        # Every operator starts unvisited; keys are operator ids.
+        visited = {op.idx(): False for op in graph.ops()}
+        # Seed the depth-first search with the forward operators that consume param.
+        stack = [
+            op for op in graph.ops()
+            if (not op.is_bwd_op()) and (param in op.all_inputs())
+        ]
+        visit_path = []
+        while stack:
+            top_op = stack[-1]
+            if not visited[top_op.idx()]:
+                visit_path.append(top_op)
+                visited[top_op.idx()] = True
+            # The search does not expand past a "mul", nor past a "conv2d"
+            # that does not take param as one of its inputs.
+            is_boundary = (top_op.type() == "mul") or (
+                top_op.type() == "conv2d" and param not in top_op.all_inputs())
+            next_ops = None
+            if not is_boundary:
+                next_ops = self._get_next_unvisited_op(graph, visited, top_op)
+            if next_ops is None:
+                stack.pop()
+            else:
+                stack += next_ops
+        return visit_path
+
+    def _get_next_unvisited_op(self, graph, visited, top_op):
+        """
+        Collect the forward successors of an operator that are still unvisited.
+        Args:
+            graph(GraphWrapper): The graph used to search.
+            visited(dict): Maps an operator's idx() to its visited flag.
+            top_op(OpWrapper): The operator whose successors are inspected.
+        Returns:
+            list<OpWrapper>: The matching operators, or None if there are none.
+        """
+        assert isinstance(top_op, OpWrapper)
+        candidates = [
+            op for op in graph.next_ops(top_op)
+            if (not visited[op.idx()]) and (not op.is_bwd_op())
+        ]
+        return candidates if candidates else None
+
+    def _get_accumulator(self, graph, param):
+        """
+        Find the persistable variables that optimizer ops write for a parameter.
+        Args:
+            graph(GraphWrapper): The graph used to search.
+            param(VarWrapper): The given parameter.
+        Returns:
+            list<VarWrapper>: The accumulators; param itself is excluded.
+        """
+        assert isinstance(param, VarWrapper)
+        # Only optimizer ops among param.outputs() are inspected; each of their
+        # persistable outputs whose name differs from param's name is kept.
+        return [
+            out_var
+            for op in param.outputs() if op.is_opt_op()
+            for out_var in op.all_outputs()
+            if graph.is_persistable(out_var) and out_var.name() != param.name()
+        ]
+
+    def _forward_pruning_ralated_params(self,
+                                        graph,
+                                        scope,
+                                        param,
+                                        place,
+                                        ratio=None,
+                                        pruned_idxs=None,
+                                        lazy=False,
+                                        only_graph=False):
+        """
+        Pruning all the parameters affected by the pruning of given parameter.
+        The given parameter itself is pruned first (by ratio, or by
+        pruned_idxs on axis 0 when ratio is None); nothing is done if its
+        name is already recorded in self.pruned_list[0].
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing parameters to be pruned.
+            param(VarWrapper): The given parameter.
+            place(fluid.Place): The device place of filter parameters.
+            ratio(float): The target ratio to be pruned. If it is None,
+                          pruned_idxs is used instead.
+            pruned_idxs(list): The index of elements to be pruned. It is
+                               required, and only read, when ratio is None.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in scope.
+        Returns: None
+        """
+        assert isinstance(
+            graph,
+            GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
+        assert isinstance(
+            param, VarWrapper), "param must be instance of slim.core.VarWrapper"
+
+        # skip parameters that were already pruned on axis 0
+        if param.name() in self.pruned_list[0]:
+            return
+        related_ops = self._forward_search_related_op(graph, param)
+
+        if ratio is None:
+            assert pruned_idxs is not None
+            self._prune_parameter_by_idx(
+                scope, [param] + self._get_accumulator(graph, param),
+                pruned_idxs,
+                pruned_axis=0,
+                place=place,
+                lazy=lazy,
+                only_graph=only_graph)
+
+        else:
+            pruned_idxs = self._prune_filters_by_ratio(
+                scope, [param] + self._get_accumulator(graph, param),
+                ratio,
+                place,
+                lazy=lazy,
+                only_graph=only_graph)
+        # corrected_idxs holds the indexes as downstream operators see them;
+        # the "mul" and "concat" branches below rewrite it.
+        corrected_idxs = pruned_idxs[:]
+
+        for idx, op in enumerate(related_ops):
+            if op.type() == "conv2d" and (param not in op.all_inputs()):
+                # a later conv2d that does not own param: prune it on axis 1
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=1,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "depthwise_conv2d":
+                # depthwise filters are pruned on axis 0 with the same indexes
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "elementwise_add":
+                # pruning bias
+                # NOTE(review): this branch passes pruned_idxs rather than
+                # corrected_idxs -- confirm that is intended after a concat.
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        bias_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [bias_param] + self._get_accumulator(
+                                graph, bias_param),
+                            pruned_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "mul":  # pruning fc layer
+                fc_input = None
+                fc_param = None
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        fc_param = in_var
+                    else:
+                        fc_input = in_var
+
+                # Index i expands to the feature_map_size consecutive indexes
+                # starting at i * feature_map_size on axis 0 of the fc weight.
+                # NOTE(review): assumes fc_input.shape() has 4 entries -- confirm.
+                fc_idxs = []
+                feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
+                range_idx = np.array(range(feature_map_size))
+                for i in corrected_idxs:
+                    fc_idxs += list(range_idx + i * feature_map_size)
+                corrected_idxs = fc_idxs
+                self._prune_parameter_by_idx(
+                    scope, [fc_param] + self._get_accumulator(graph, fc_param),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+
+            elif op.type() == "concat":
+                # Shift the indexes by the axis-1 sizes of the concat inputs
+                # that precede the pruned branch.
+                # NOTE(review): concat_idx keeps a stale value (or is unbound)
+                # if no output of related_ops[idx - 1] feeds this concat, and
+                # idx - 1 wraps to the last op when idx is 0 -- confirm.
+                concat_inputs = op.all_inputs()
+                last_op = related_ops[idx - 1]
+                for out_var in last_op.all_outputs():
+                    if out_var in concat_inputs:
+                        concat_idx = concat_inputs.index(out_var)
+                offset = 0
+                for ci in range(concat_idx):
+                    offset += concat_inputs[ci].shape()[1]
+                corrected_idxs = [x + offset for x in pruned_idxs]
+            elif op.type() == "batch_norm":
+                bn_inputs = op.all_inputs()
+                # Pruned in the order mean, variance, alpha, beta, which are
+                # read from input positions 2, 3, 0 and 1.
+                for bn_var in (bn_inputs[2], bn_inputs[3], bn_inputs[0],
+                               bn_inputs[1]):
+                    self._prune_parameter_by_idx(
+                        scope, [bn_var] + self._get_accumulator(graph, bn_var),
+                        corrected_idxs,
+                        pruned_axis=0,
+                        place=place,
+                        lazy=lazy,
+                        only_graph=only_graph)
+
+    def _prune_parameters(self,
+                          graph,
+                          scope,
+                          params,
+                          ratios,
+                          place,
+                          lazy=False,
+                          only_graph=False):
+        """
+        Prune each named parameter by its ratio, together with related params.
+        Resets self.pruned_list before pruning. For every conv2d op consuming
+        a pruned parameter, the parameters of its brother ops are pruned too.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing parameters to be pruned.
+            params(list<str>): A list of parameter names to be pruned.
+            ratios(list<float>): The pruning ratio for each entry of params.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in scope.
+        """
+        table_rule = (
+            '|----------------------------------------+----+------------------------------+------------------------------|'
+        )
+        _logger.debug('\n################################')
+        _logger.debug('#       pruning parameters       #')
+        _logger.debug('################################\n')
+        _logger.debug(table_rule)
+        _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis',
+                                                            'from', 'to'))
+        assert len(params) == len(ratios)
+        # start a fresh record: one list of pruned names per axis (0 and 1)
+        self.pruned_list = [[], []]
+        for name, ratio in zip(params, ratios):
+            assert isinstance(name, str) or isinstance(name, unicode)
+            param = graph.var(name)
+            self._forward_pruning_ralated_params(
+                graph,
+                scope,
+                param,
+                place,
+                ratio=ratio,
+                lazy=lazy,
+                only_graph=only_graph)
+            # brothers of each conv2d consumer are pruned by the same ratio
+            for op in param.outputs():
+                if op.type() != 'conv2d':
+                    continue
+                for brother in self._search_brother_ops(graph, op):
+                    for p in graph.get_param_by_op(brother):
+                        self._forward_pruning_ralated_params(
+                            graph,
+                            scope,
+                            p,
+                            place,
+                            ratio=ratio,
+                            lazy=lazy,
+                            only_graph=only_graph)
+        _logger.debug(table_rule)
+
+    def _search_brother_ops(self, graph, op_node):
+        """
+        Search the conv2d/fc operators sharing a downstream op with op_node.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            op_node(OpWrapper): The start node for searching.
+        Returns:
+            list<OpWrapper>: The brother operators (op_node itself is excluded).
+        """
+        visited = {op_node.idx()}
+        stack, brothers = [], []
+        for op in graph.next_ops(op_node):
+            if (op.type() != 'conv2d') and (op.type() != 'fc') and (
+                    not op.is_bwd_op()):
+                stack.append(op)
+                visited.add(op.idx())
+        while len(stack) > 0:
+            top_op = stack.pop()
+            for parent in graph.pre_ops(top_op):
+                if parent.idx() not in visited and (not parent.is_bwd_op()):
+                    # conv2d/fc parents are brothers; others are searched on
+                    if ((parent.type() == 'conv2d') or (parent.type() == 'fc')):
+                        brothers.append(parent)
+                    else:
+                        stack.append(parent)
+                    visited.add(parent.idx())
+
+            for child in graph.next_ops(top_op):
+                if (child.type() != 'conv2d') and (child.type() != 'fc') and (
+                        child.idx() not in visited) and (
+                            not child.is_bwd_op()):
+                    stack.append(child)
+                    visited.add(child.idx())
+        return brothers
+
+    def _prune_graph(self, graph, target_graph):
+        """
+        Copy the shape of every parameter in target_graph onto graph.
+        Args:
+            graph(GraphWrapper): The graph to be pruned.
+            target_graph(GraphWrapper): The reference graph.
+        Return: None
+        """
+        rule = (
+            '|----+----------------------------------------+------------------------------+------------------------------|'
+        )
+        row_format = '|{:^4}|{:^40}|{:^30}|{:^30}|'
+        # the same rule line is logged before, between and after the rows
+        _logger.debug(rule)
+        _logger.debug(
+            row_format.format('id', 'parammeter', 'from', 'to'))
+        # rows are numbered from 1 in the order target_graph lists parameters
+        for row_id, target_param in enumerate(target_graph.all_parameters(), 1):
+            name = target_param.name()
+            var = graph.var(name)
+            shape_before = var.shape()
+            var.set_shape(target_param.shape())
+            _logger.debug(rule)
+            _logger.debug(
+                row_format.format(
+                    str(row_id),
+                    str(name), str(shape_before), str(target_param.shape())))
+        _logger.debug(rule)
+
+
+class UniformPruneStrategy(PruneStrategy):
     """
-    The strategy that pruning weights by threshold or ratio iteratively.
+    The uniform pruning strategy. The parameters will be pruned by uniform ratio.
     """
 
     def __init__(self,
-                 pruner,
-                 mini_batch_pruning_frequency=1,
+                 pruner=None,
                  start_epoch=0,
-                 end_epoch=10):
-        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
-        self.pruner = pruner
-        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
-
-    def _triger(self, context):
-        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
-                self.start_epoch <= context.epoch_id < self.end_epoch)
-
-    def on_batch_end(self, context):
-        if self._triger(context):
-            prune_program = Program()
-            with program_guard(prune_program):
-                for param in context.graph.all_parameters():
-                    prune_program.global_block().clone_variable(param)
-                    p = prune_program.global_block().var(param.name)
-                    zeros_mask = self.pruner.prune(p)
-                    pruned_param = p * zeros_mask
-                    layers.assign(input=pruned_param, output=param)
-            context.program_exe.run(prune_program, scope=context.scope)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """Forward every argument to the parent class constructor.
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters. default: None
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
+            metric_name(str): The metric used to evaluate the model. default: None
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'
+        """
+        super(UniformPruneStrategy, self).__init__(pruner, start_epoch,
+                                                   end_epoch, target_ratio,
+                                                   metric_name, pruned_params)
+
+    def _get_best_ratios(self, context):
+        """
+        Bisect one shared ratio until the pruned flops approach target_ratio.
+        """
+        _logger.info('_get_best_ratios')
+        pruned_params = [
+            param.name() for param in context.eval_graph.all_parameters()
+            if re.match(self.pruned_params, param.name())
+        ]
+
+        eval_graph = context.eval_graph
+        flops = eval_graph.flops()
+        model_size = eval_graph.numel_params()
+        # NOTE(review): the loop ends only on the 1e-2 tolerance or low >= high;
+        # confirm it terminates when that tolerance cannot be reached.
+        low, high = 0., 1.
+        while low < high:
+            ratio = (high + low) / 2
+            _logger.debug(
+                '-----------Try pruning ratio: {:.2f}-----------'.format(ratio))
+            ratios = [ratio] * len(pruned_params)
+            # trial run: only_graph=True, so only the graph is modified
+            self._prune_parameters(
+                eval_graph,
+                context.scope,
+                pruned_params,
+                ratios,
+                context.place,
+                only_graph=True)
+
+            pruned_flops = 1 - (float(eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(eval_graph.numel_params()) / model_size)
+            _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops))
+            _logger.debug('Pruned model size: {:.2f}'.format(pruned_size))
+            # undo the trial by restoring every shape recorded in the backup
+            for name, shape in self.param_shape_backup.items():
+                eval_graph.var(name).set_shape(shape)
+            self.param_shape_backup = {}
+
+            if abs(pruned_flops - self.target_ratio) < 1e-2:
+                break
+            if pruned_flops > self.target_ratio:
+                high = ratio
+            else:
+                low = ratio
+        _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
+        return pruned_params, ratios
+
+    def on_epoch_begin(self, context):
+        """Prune the optimize and eval graphs once, when start_epoch begins."""
+        if context.epoch_id == self.start_epoch:
+            params, ratios = self._get_best_ratios(context)
+
+            self._prune_parameters(context.optimize_graph, context.scope,
+                                   params, ratios, context.place)
+
+            # read before _prune_graph changes the eval graph; used in the report
+            model_size = context.eval_graph.numel_params()
+            flops = context.eval_graph.flops()
+            _logger.debug('\n################################')
+            _logger.debug('#          pruning eval graph    #')
+            _logger.debug('################################\n')
+            self._prune_graph(context.eval_graph, context.optimize_graph)
+            context.optimize_graph.update_groups_of_conv()
+            context.eval_graph.update_groups_of_conv()
+
+            _logger.info(
+                '------------------finish pruning--------------------------------'
+            )
+            _logger.info('Pruned size: {:.2f}'.format(1 - (float(
+                context.eval_graph.numel_params()) / model_size)))
+            _logger.info('Pruned flops: {:.2f}'.format(1 - (float(
+                context.eval_graph.flops()) / flops)))
+            _logger.info(
+                '------------------UniformPruneStrategy.on_epoch_begin finish--------------------------------'
+            )
+
+
+class SensitivePruneStrategy(PruneStrategy):
+    """
+    Sensitive pruning strategy. Different pruned ratio was applied on each layer.
+    """
+
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=0,
+                 delta_rate=0.20,
+                 target_ratio=0.5,
+                 metric_name='top1_acc',
+                 pruned_params='conv.*_weights',
+                 sensitivities_file='./sensitivities.data',
+                 sensitivities=None,
+                 num_steps=1,
+                 eval_rate=None):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0.
+            delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2
+            target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
+            metric_name(str): The metric used to evaluate the model; one of keys in out_nodes of graph wrapper. default: 'top1_acc'
+            pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'.
+            sensitivities_file(str): The sensitivities file. default: './sensitivities.data'
+            sensitivities(dict): The user-defined sensitivities. default: None, meaning a new empty dict.
+            num_steps(int): The number of pruning steps. default: 1.
+            eval_rate(float): The rate of sampled data used to calculate sensitivities. None means using all the data. default: None.
+        """
+        super(SensitivePruneStrategy, self).__init__(pruner, start_epoch,
+                                                     end_epoch, target_ratio,
+                                                     metric_name, pruned_params)
+        self.delta_rate = delta_rate
+        self.pruned_list = []
+        # a fresh dict per instance; a default dict literal would be shared
+        self.sensitivities = {} if sensitivities is None else sensitivities
+        self.sensitivities_file = sensitivities_file
+        self.backup = {}
+        self.param_shape_backup = {}
+        self.num_steps = num_steps
+        self.eval_rate = eval_rate
+        # per-step ratio such that num_steps steps compound to target_ratio
+        self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps)
+
+    def _save_sensitivities(self, sensitivities, sensitivities_file):
+        """
+        Pickle the sensitivities and write them to sensitivities_file.
+        """
+        with open(sensitivities_file, 'wb') as out_file:
+            out_file.write(pickle.dumps(sensitivities))
+
+    def _load_sensitivities(self, sensitivities_file):
+        """
+        Unpickle sensitivities from file (empty dict if the file is absent).
+        """
+        sensitivities = {}
+        if sensitivities_file and os.path.exists(sensitivities_file):
+            # NOTE(review): pickle can run code from the file; load trusted files only
+            load_kwargs = {}
+            if not (sys.version_info < (3, 0)):
+                load_kwargs['encoding'] = 'bytes'
+            with open(sensitivities_file, 'rb') as in_file:
+                sensitivities = pickle.load(in_file, **load_kwargs)
+
+        # pruned percents are rounded to two digits before they are used
+        for record in sensitivities.values():
+            record['pruned_percent'] = [round(p, 2) for p in record['pruned_percent']]
+        self._format_sensitivities(sensitivities)
+        return sensitivities
+
+    def _format_sensitivities(self, sensitivities):
+        """
+        Log a debug table of losses; rows with a mismatched loss count are skipped.
+        """
+        ratio_columns = [
+            str(round(i, 2))
+            for i in np.arange(self.delta_rate, 1, self.delta_rate)
+        ]
+        tb = pt.PrettyTable()
+        tb.field_names = ["parameter", "size"] + ratio_columns
+        for param, record in sensitivities.items():
+            if len(record['loss']) == (len(tb.field_names) - 2):
+                tb.add_row([param, record['size']] +
+                           [round(loss, 2) for loss in record['loss']])
+        _logger.debug('\n################################')
+        _logger.debug('#      sensitivities table     #')
+        _logger.debug('################################\n')
+        _logger.debug(tb)
+
+    def _compute_sensitivities(self, context):
+        """
+        Measure the metric loss caused by pruning each parameter at each ratio.
+        Results already present in the sensitivities file are not recomputed.
+        Returns:
+            dict: parameter name -> record with 'pruned_percent', 'loss', 'size'.
+        """
+        _logger.info("calling _compute_sensitivities.")
+        self.param_shape_backup = {}
+        self.backup = {}
+        cached_id = np.random.randint(1000)
+        if self.start_epoch == context.epoch_id:
+            sensitivities_file = self.sensitivities_file
+        else:
+            sensitivities_file = self.sensitivities_file + ".epoch" + str(
+                context.epoch_id)
+        sensitivities = self._load_sensitivities(sensitivities_file)
+
+        for param in context.eval_graph.all_parameters():
+            if not re.match(self.pruned_params, param.name()):
+                continue
+            if param.name() not in sensitivities:
+                sensitivities[param.name()] = {
+                    'pruned_percent': [],
+                    'loss': [],
+                    'size': param.shape()[0]
+                }
+        metric = None
+        # Iterate over a snapshot: brother entries may be inserted below, and
+        # Python 3 forbids resizing a dict while iterating over its keys.
+        for param in list(sensitivities.keys()):
+            ratio = self.delta_rate
+            while ratio < 1:
+                ratio = round(ratio, 2)
+                if ratio in sensitivities[param]['pruned_percent']:
+                    _logger.debug('{}, {} has computed.'.format(param, ratio))
+                    ratio += self.delta_rate
+                    continue
+                if metric is None:
+                    metric = self._eval_graph(context, self.eval_rate,
+                                              cached_id)
+                # prune parameter by ratio
+                self._prune_parameters(
+                    context.eval_graph,
+                    context.scope, [param], [ratio],
+                    context.place,
+                    lazy=True)
+                # evaluate after pruning and record the loss
+                pruned_metric = self._eval_graph(context, self.eval_rate,
+                                                 cached_id)
+                loss = metric - pruned_metric
+                _logger.info("pruned param: {}; {}; loss={}".format(
+                    param, ratio, loss))
+                for brother in self.pruned_list[0]:
+                    if re.match(self.pruned_params, brother):
+                        if brother not in sensitivities:
+                            sensitivities[brother] = {
+                                'pruned_percent': [],
+                                'loss': [],
+                                'size': context.eval_graph.var(brother).shape()[0]
+                            }
+                        sensitivities[brother]['pruned_percent'].append(ratio)
+                        sensitivities[brother]['loss'].append(loss)
+                self._save_sensitivities(sensitivities, sensitivities_file)
+
+                # restore pruned parameters
+                for param_name in self.backup.keys():
+                    param_t = context.scope.find_var(param_name).get_tensor()
+                    param_t.set(self.backup[param_name], context.place)
+                self.backup = {}
+
+                ratio += self.delta_rate
+        return sensitivities
+
+    def _get_best_ratios(self, context, sensitivities, target_ratio):
+        """
+        Search a group of ratios for pruning target flops.
+        sensitivities maps names to records with 'loss'/'pruned_percent' lists.
+        Returns (sensitivities.keys(), ratios), in matching order.
+        """
+        _logger.info('_get_best_ratios for pruning ratie: {}'.format(
+            target_ratio))
+        self.param_shape_backup = {}
+        self.backup = {}
+
+        def func(params, x):
+            a, b, c, d = params
+            return a * x * x * x + b * x * x + c * x + d
+
+        def error(params, x, y):
+            return func(params, x) - y
+
+        def slove_coefficient(x, y):
+            init_coefficient = [10, 10, 10, 10]
+            coefficient, _ = leastsq(error, init_coefficient, args=(x, y))
+            return coefficient
+
+        min_loss = 0.
+        max_loss = 0.
+        # step 1: fit curve by sensitivities
+        coefficients = {}
+        for param in sensitivities:
+            losses = np.array([0] * 5 + sensitivities[param]['loss'])
+            precents = np.array([0] * 5 + sensitivities[param][
+                'pruned_percent'])
+            coefficients[param] = slove_coefficient(precents, losses)
+            loss = np.max(losses)
+            max_loss = np.max([max_loss, loss])
+
+        # step 2: Find a group of ratios by binary searching.
+        flops = context.eval_graph.flops()
+        model_size = context.eval_graph.numel_params()
+        ratios = []
+        while min_loss < max_loss:
+            loss = (max_loss + min_loss) / 2
+            _logger.info(
+                '-----------Try pruned ratios while acc loss={:.4f}-----------'.
+                format(loss))
+            ratios = []
+            # step 2.1: Get ratios according to current loss
+            for param in sensitivities:
+                coefficient = copy.deepcopy(coefficients[param])
+                coefficient[-1] = coefficient[-1] - loss
+                roots = np.roots(coefficient)
+                # smallest real root inside (0, 1); stays 1 when none exists
+                min_root = 1
+                for root in roots:
+                    if np.isreal(root) and 0 < root.real < 1:
+                        min_root = min(root.real, min_root)
+                ratios.append(min_root)
+            _logger.info('Pruned ratios={}'.format(
+                [round(ratio, 3) for ratio in ratios]))
+            # step 2.2: Pruning by current ratios
+            self._prune_parameters(
+                context.eval_graph,
+                context.scope,
+                sensitivities.keys(),
+                ratios,
+                context.place,
+                only_graph=True)
+            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
+                               model_size)
+            _logger.info('Pruned flops: {:.4f}'.format(pruned_flops))
+            _logger.info('Pruned model size: {:.4f}'.format(pruned_size))
+            for param in self.param_shape_backup.keys():
+                context.eval_graph.var(param).set_shape(self.param_shape_backup[
+                    param])
+            self.param_shape_backup = {}
+            # step 2.3: Check whether current ratios is enough
+            if abs(pruned_flops - target_ratio) < 0.015:
+                break
+            if pruned_flops > target_ratio:
+                max_loss = loss
+            else:
+                min_loss = loss
+        return sensitivities.keys(), ratios
+
+    def _current_pruning_target(self, context):
+        '''
+        Return pruning_step if a pruning step is due in this epoch, else None.
+        '''
+        _logger.info('Left number of pruning steps: {}'.format(self.num_steps))
+        due = self.num_steps > 0 and (
+            self.start_epoch == context.epoch_id or
+            context.eval_converged(self.metric_name, 0.005))
+        if due:
+            self.num_steps -= 1
+        return self.pruning_step if due else None
+
+    def on_epoch_begin(self, context):
+        """Run one pruning step on both graphs when a step is due."""
+        current_ratio = self._current_pruning_target(context)
+        if current_ratio is None:
+            return
+        sensitivities = self._compute_sensitivities(context)
+        params, ratios = self._get_best_ratios(context, sensitivities,
+                                               current_ratio)
+        self._prune_parameters(context.optimize_graph, context.scope, params,
+                               ratios, context.place)
+        self.param_shape_backup = {}
+        self.backup = {}
+        model_size = context.eval_graph.numel_params()
+        flops = context.eval_graph.flops()
+        _logger.debug('################################')
+        _logger.debug('#          pruning eval graph    #')
+        _logger.debug('################################')
+        self._prune_graph(context.eval_graph, context.optimize_graph)
+        context.optimize_graph.update_groups_of_conv()
+        context.eval_graph.update_groups_of_conv()
+        context.optimize_graph.compile()  # refresh the compiled program
+        context.eval_graph.compile(
+            for_parallel=False,
+            for_test=True)  # refresh the compiled program
+        _logger.info(
+            '------------------finish pruning--------------------------------'
+        )
+        size_drop = 1 - (float(context.eval_graph.numel_params()) / model_size)
+        _logger.info('Pruned size: {:.3f}'.format(size_drop))
+        flops_drop = 1 - (float(context.eval_graph.flops()) / flops)
+        _logger.info('Pruned flops: {:.3f}'.format(flops_drop))
+        metric = self._eval_graph(context)
+        _logger.info('Metric after pruning: {:.2f}'.format(metric))
+        _logger.info(
+            '------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------'
+        )
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
index ca72bcb6f6..506b8fbe1d 100644
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import numpy as np
+import collections
 from .... import layers
 
-__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+__all__ = ['Pruner', 'StructurePruner']
 
 
 class Pruner(object):
@@ -30,54 +31,77 @@ class Pruner(object):
         pass
 
 
-class MagnitudePruner(Pruner):
+class StructurePruner(Pruner):
     """
-    Pruner used to pruning a parameter by threshold.
+    Pruner used to prune parameters by groups.
     """
 
-    def __init__(self, threshold):
-        self.threshold = threshold
-
-    def prune(self, param, threshold=None):
-        if threshold is None:
-            thres = layers.fill_constant(
-                shape=[1], dtype='float32', value=self.threshold)
-        else:
-            thres = threshold
-        zeros_mask = layers.less_than(x=param, y=thres)
-        return zeros_mask
-
-
-class RatioPruner(Pruner):
-    """
-    Pruner used to pruning a parameter by ratio.
-    """
+    def __init__(self, pruning_axis, criterions):
+        """
+        Args:
+            pruning_axis(dict): The key is the name of parameter to be pruned,
+                                '*' means all the parameters.
+                                The value is the axis to be used. Given a parameter
+                                with shape [3, 4], the result of pruning 50% on axis 1
+                                is a parameter with shape [3, 2].
+            criterions(dict): The key is the name of parameter to be pruned,
+                              '*' means all the parameters.
+                              The value is the criterion used to sort groups for pruning.
+                              It only supports 'l1_norm' currently.
+        """
+        self.pruning_axis = pruning_axis
+        self.criterions = criterions
 
-    def __init__(self, ratios=None):
+    def cal_pruned_idx(self, name, param, ratio, axis=None):
         """
+        Calculate the index to be pruned on axis by given pruning ratio.
         Args:
-            ratios: dict with pair (paramer_name, pruned_ratio). 
+            name(str): The name of parameter to be pruned.
+            param(np.array): The data of parameter to be pruned.
+            ratio(float): The ratio to be pruned.
+            axis(int): The axis to be used for pruning given parameter.
+                       If it is None, the value in self.pruning_axis will be used.
+                       default: None.
+        Returns:
+            list<int>: The indexes to be pruned on axis.
         """
-        self.ratios = ratios
+        criterion = self.criterions[
+            name] if name in self.criterions else self.criterions['*']
+        if criterion != 'l1_norm':  # the only criterion implemented so far
+            raise ValueError("Unsupported criterion: {}".format(criterion))
+        if axis is None:
+            assert self.pruning_axis is not None, "pruning_axis should set if axis is None."
+            axis = self.pruning_axis[
+                name] if name in self.pruning_axis else self.pruning_axis['*']
+        prune_num = int(round(param.shape[axis] * ratio))
+        reduce_dims = tuple(i for i in range(len(param.shape)) if i != axis)
+        scores = np.sum(np.abs(param), axis=reduce_dims)  # L1 norm per slice
+        return scores.argsort()[:prune_num]
 
-    def prune(self, param, ratio=None):
+    def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
         """
+        Prune an array by indexes on the given axis.
         Args:
-            ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
+            tensor(numpy.array): The target array to be pruned.
+            pruned_idx(list<int>): The indexes to be pruned.
+            pruned_axis(int): The axis of the given array to be pruned on.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means remove the pruned elements from memory.
+                        default: False.
+        Returns:
+            numpy.array: The pruned array.
         """
-        if ratio is None:
-            rat = self.ratios[
-                param.name] if param.name in self.ratios else self.ratios['*']
-        else:
-            rat = ratio
-        if rat < 1.0:
-            k = max(int(rat * np.prod(param.shape)), 1)
-            param_vec = layers.reshape(x=param, shape=[1, -1])
-            param_topk, _ = layers.topk(param_vec, k=k)
-            threshold = layers.slice(
-                param_topk, axes=[1], starts=[-1], ends=[k])
-            threshold = layers.reshape(x=threshold, shape=[1])
-            zeros_mask = layers.less_than(x=param, y=threshold)
+        mask = np.zeros(tensor.shape[pruned_axis], dtype=bool)
+        mask[pruned_idx] = True
+
+        def func(data):
+            return data[~mask]
+
+        def lazy_func(data):
+            data[mask] = 0
+            return data
+
+        if lazy:
+            return np.apply_along_axis(lazy_func, pruned_axis, tensor)
         else:
-            zeros_mask = layers.ones(param.shape)
-        return zeros_mask
+            return np.apply_along_axis(func, pruned_axis, tensor)
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
deleted file mode 100644
index d9b49029d3..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-version: 1.0
-include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_2'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
new file mode 100644
index 0000000000..570c60026d
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 0
+        delta_rate: 0.1
+        target_ratio: 0.3
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: '.*_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 120
+    checkpoint_path: './checkpoints/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
deleted file mode 100644
index 235092c595..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_2:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
deleted file mode 100644
index cd2ef9eb56..0000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_3:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py b/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py
new file mode 100644
index 0000000000..d0c32e2609
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
new file mode 100644
index 0000000000..232276feac
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 1
+        delta_rate: 0.2
+        target_ratio: 0.08
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: 'conv6_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 2
+    checkpoint_path: './checkpoints/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py b/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py
new file mode 100644
index 0000000000..0148325a64
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = ['MobileNet']
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class MobileNet():
+    def __init__(self):
+        self.params = train_parameters
+
+    def net(self, input, class_dim=1000, scale=1.0):
+        # conv1: 112x112
+        input = self.conv_bn_layer(
+            input,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1,
+            name="conv1")
+
+        # 56x56
+        input = self.depthwise_separable(
+            input,
+            num_filters1=32,
+            num_filters2=64,
+            num_groups=32,
+            stride=1,
+            scale=scale,
+            name="conv2_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=64,
+            num_filters2=128,
+            num_groups=64,
+            stride=2,
+            scale=scale,
+            name="conv2_2")
+
+        # 28x28
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=128,
+            num_groups=128,
+            stride=1,
+            scale=scale,
+            name="conv3_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=256,
+            num_groups=128,
+            stride=2,
+            scale=scale,
+            name="conv3_2")
+
+        # 14x14
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=256,
+            num_groups=256,
+            stride=1,
+            scale=scale,
+            name="conv4_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=512,
+            num_groups=256,
+            stride=2,
+            scale=scale,
+            name="conv4_2")
+
+        # 14x14
+        for i in range(5):
+            input = self.depthwise_separable(
+                input,
+                num_filters1=512,
+                num_filters2=512,
+                num_groups=512,
+                stride=1,
+                scale=scale,
+                name="conv5" + "_" + str(i + 1))
+        # 7x7
+        input = self.depthwise_separable(
+            input,
+            num_filters1=512,
+            num_filters2=1024,
+            num_groups=512,
+            stride=2,
+            scale=scale,
+            name="conv5_6")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=1024,
+            num_filters2=1024,
+            num_groups=1024,
+            stride=1,
+            scale=scale,
+            name="conv6")
+
+        input = fluid.layers.pool2d(
+            input=input,
+            pool_size=0,
+            pool_stride=1,
+            pool_type='avg',
+            global_pooling=True)
+
+        output = fluid.layers.fc(input=input,
+                                 size=class_dim,
+                                 act='softmax',
+                                 param_attr=ParamAttr(
+                                     initializer=MSRA(), name="fc7_weights"),
+                                 bias_attr=ParamAttr(name="fc7_offset"))
+        return output
+
+    def conv_bn_layer(self,
+                      input,
+                      filter_size,
+                      num_filters,
+                      stride,
+                      padding,
+                      channels=None,
+                      num_groups=1,
+                      act='relu',
+                      use_cudnn=True,
+                      name=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=name + "_weights"),
+            bias_attr=False)
+        bn_name = name + "_bn"
+        return fluid.layers.batch_norm(
+            input=conv,
+            act=act,
+            param_attr=ParamAttr(name=bn_name + "_scale"),
+            bias_attr=ParamAttr(name=bn_name + "_offset"),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def depthwise_separable(self,
+                            input,
+                            num_filters1,
+                            num_filters2,
+                            num_groups,
+                            stride,
+                            scale,
+                            name=None):
+        depthwise_conv = self.conv_bn_layer(
+            input=input,
+            filter_size=3,
+            num_filters=int(num_filters1 * scale),
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False,
+            name=name + "_dw")
+
+        pointwise_conv = self.conv_bn_layer(
+            input=depthwise_conv,
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0,
+            name=name + "_sep")
+        return pointwise_conv
diff --git a/python/paddle/fluid/contrib/slim/tests/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
index 2fc72b6475..90eb8bd4b3 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
@@ -12,29 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.contrib.slim import ConfigFactory
+from paddle.fluid.contrib.slim.core import ConfigFactory
 import unittest
 
 
 class TestFactory(unittest.TestCase):
-    def test_parse(self):
-        factory = ConfigFactory('./configs/config.yaml')
+    def test_parse_pruning(self):
+        factory = ConfigFactory('./configs/filter_pruning.yaml')
 
-        pruner = factory.instance('pruner_1')
-        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+        pruner_1 = factory.instance('pruner_1')
+        self.assertEquals(pruner_1.pruning_axis['*'], 0)
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        pruner = factory.instance('pruner_2')
-        self.assertEquals(pruner.ratios['*'], 0.7)
+        strategy = factory.instance('sensitive_pruning_strategy')
+        pruner_1 = strategy.pruner
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        strategy = factory.instance('strategy_1')
-        pruner = strategy.pruner
-        self.assertEquals(pruner.ratios['*'], 0.7)
-
-        compress_pass = factory.get_compress_pass()
-        self.assertEquals(compress_pass.epoch, 100)
-
-        strategy = compress_pass.strategies[0]
-        self.assertEquals(strategy.delta_rate, 0.2)
+        self.assertEquals(strategy.start_epoch, 0)
+        self.assertEquals(strategy.sensitivities_file,
+                          'mobilenet_acc_top1_sensitive.data')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
new file mode 100644
index 0000000000..d73ee27779
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
@@ -0,0 +1,89 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from filter_pruning.mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestFilterPruning(unittest.TestCase):
+    def test_compression(self):
+        """
+        Model: mobilenet_v1
+        data: mnist
+        step1: Training one epoch
+        step2: pruning flops
+        step3: fine-tune one epoch
+        step4: check top1_acc.
+        """
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet().net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            train_optimizer=optimizer)
+        com_pass.config('./filter_pruning/compress.yaml')
+        eval_graph = com_pass.run()
+        self.assertTrue(
+            abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969)
+            < 0.02)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
new file mode 100644
index 0000000000..ad82aa9411
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
@@ -0,0 +1,140 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import paddle.fluid as fluid
+import six
+import numpy as np
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+from paddle.fluid import core
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    data.stop_gradinet = False
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return data, label, loss
+
+
+class TestGraphWrapper(unittest.TestCase):
+    def build_program(self):
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            image, label, self.loss = residual_block(2)
+            eval_program = main.clone()
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(self.loss)
+        self.scope = core.Scope()
+        exe = fluid.Executor(place)
+        exe.run(startup, scope=self.scope)
+        self.eval_graph = GraphWrapper(
+            program=eval_program,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+        self.train_graph = GraphWrapper(
+            program=main,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+
+    def test_all_parameters(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.all_parameters()), 24)
+
+    def test_all_vars(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.vars()), 90)
+
+    def test_numel_params(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.numel_params(), 13258)
+
+    def test_compile(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        self.train_graph.compile()
+        exe.run(self.train_graph.compiled_graph,
+                scope=self.scope,
+                feed={
+                    'image':
+                    np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'),
+                    'label': np.random.randint(0, 10, [16, 1]).astype('int64')
+                })
+
+    def test_pre_and_next_ops(self):
+        self.build_program()
+        for op in self.train_graph.ops():
+            for next_op in self.train_graph.next_ops(op):
+                self.assertTrue(op in self.train_graph.pre_ops(next_op))
+
+    def test_get_optimize_graph(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        opt = fluid.optimizer.SGD(learning_rate=0.001)
+        train_graph = self.eval_graph.get_optimize_graph(
+            opt, place, self.scope, no_grad_var_names=['image'])
+        self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops()))
+        exe = fluid.Executor(place)
+        train_graph.compile()
+        image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32')
+        label = np.random.randint(0, 10, [16, 1]).astype('int64')
+        exe.run(train_graph.compiled_graph,
+                scope=self.scope,
+                feed={'image': image,
+                      'label': label})
+
+    def test_flops(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.flops(), 354624)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/requirements.txt b/python/requirements.txt
index 36bd5d4261..ce56462fac 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,3 +12,4 @@ six
 funcsigs
 pyyaml
 decorator
+prettytable

From 431068c9cac6292189170834b5eea3855f745b8e Mon Sep 17 00:00:00 2001
From: chuanqiw <chuanqi.wang@intel.com>
Date: Sat, 23 Mar 2019 20:25:26 +0800
Subject: [PATCH 10/11] Enhance test calibration script on accuracy assert
 test=develop

---
 python/paddle/fluid/contrib/tests/test_calibration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index b9f938bebe..1a046a7941 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
             self.model, self.infer_iterations)
         (int8_throughput, int8_latency,
          int8_acc1) = self.run_program("calibration_out")
-        delta_value = np.abs(fp32_acc1 - int8_acc1)
+        delta_value = fp32_acc1 - int8_acc1
         self.assertLess(delta_value, 0.01)
         print(
             "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".

From c917c13af150c578593e00cc6f25017c40e9dacd Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Sat, 23 Mar 2019 09:10:00 -0500
Subject: [PATCH 11/11] increase the time limite (#16405)

test=develop
---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 3c6b9daca6..cefa2b4919 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -118,8 +118,8 @@ if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # change the timeout from 600 to 1200, because in debug mode, this test need more time.
-    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
+    # change the timeout from 600 to 2200, because in debug mode, this test needs more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
 endif()
 
 if (WITH_NGRAPH)