From 3909108cae29ee035785e7e2fa44f1c7c8bbd9ea Mon Sep 17 00:00:00 2001
From: zhhsplendid <zhhsplendid@gmail.com>
Date: Tue, 26 Mar 2019 01:59:25 +0000
Subject: [PATCH 01/19] Add SpectralNormGradOpDescMaker

Use SpectralNormGradOpDescMaker instead of DefaultGradOpDescMaker
to avoid registering useless variables to improve GPU usage.

test=develop
---
 paddle/fluid/operators/spectral_norm_op.cc | 27 +++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc
index 357d055756..04f659a465 100644
--- a/paddle/fluid/operators/spectral_norm_op.cc
+++ b/paddle/fluid/operators/spectral_norm_op.cc
@@ -10,6 +10,9 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/spectral_norm_op.h"
+
+#include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -156,6 +159,28 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class SpectralNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("spectral_norm_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Weight", Input("Weight"));
+    op->SetInput("U", Input("U"));
+    op->SetInput("V", Input("V"));
+
+    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+
+    op->SetAttrMap(Attrs());
+
+    return op;
+  }
+};
+
 class SpectralNormOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -185,7 +210,7 @@ class SpectralNormOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SpectralNormGradOpDescMaker);
 REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad);
 REGISTER_OP_CPU_KERNEL(
     spectral_norm,

From 183bacebe3d822776abdaa93a7f1765dcc0ade54 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Wed, 27 Mar 2019 16:46:39 +0800
Subject: [PATCH 02/19] clean codes and fix some bugs. test=develop

---
 .../slim/quantization/quantization_pass.py    | 120 ++++++++++--------
 .../quantization/quantization_strategy.py     |  16 ++-
 .../slim/tests/quantization/compress.yaml     |   2 +
 .../slim/tests/test_quantization_pass.py      |   3 -
 python/paddle/fluid/framework.py              |  80 ++++--------
 5 files changed, 103 insertions(+), 118 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ab3bd8bd18..3809e32794 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -26,6 +26,17 @@ __all__ = [
 ]
 
 
+def _init_var_node(var_node, value, scope, place):
+    assert isinstance(value,
+                      np.ndarray), 'The type of value should be numpy array.'
+    assert scope is not None, \
+    'The scope cannot be set None.'
+    assert place is not None, \
+    'The place cannot be set None.'
+    tensor = scope.var(var_node.name()).get_tensor()
+    tensor.set(value, place)
+
+
 class QuantizationTransformPass(object):
     def __init__(self,
                  scope=None,
@@ -88,14 +99,14 @@ class QuantizationTransformPass(object):
         assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
-                "Unknown activation_quantize_type : '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(activation_quantize_type))
+                "Unknown activation_quantize_type : '%s'. It can only be "
+                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
+                (str(activation_quantize_type)))
         if weight_quantize_type not in quant_type:
             raise ValueError(
-                "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(weight_quantize_type))
+                "Unknown weight_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
+                % (str(weight_quantize_type)))
 
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
@@ -121,8 +132,6 @@ class QuantizationTransformPass(object):
         """
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
-        #sequential_execution = core.get_pass('sequential_execution_pass')
-        #sequential_execution.apply(graph.graph)
         self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
@@ -203,9 +212,12 @@ class QuantizationTransformPass(object):
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
                     shape=[1],
                     var_dtype=core.VarDesc.VarType.INT64)
-                self._init_var_node(
-                    global_step_in, np.zeros(
-                        [1], dtype='int64'))
+                _init_var_node(
+                    global_step_in,
+                    np.zeros(
+                        [1], dtype='int64'),
+                    self._scope,
+                    self._place)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
                 # The attribute of `op_role` is needed by ParallelExecutor.
@@ -284,7 +296,12 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
+        _init_var_node(
+            scale_in_node,
+            np.array(
+                [0.001], dtype=data_type),
+            self._scope,
+            self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         inputs = {'X': var_node, 'InScale': scale_in_node}
@@ -299,9 +316,13 @@ class QuantizationTransformPass(object):
                 var_dtype=var_node.dtype())
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            self._init_var_node(
-                scales_node, np.zeros(
-                    [self._window_size], dtype=data_type))
+            _init_var_node(
+                scales_node,
+                np.zeros(
+                    [self._window_size], dtype=data_type),
+                self._scope,
+                self._place)
+
             inputs['Iter'] = self._global_step
             outputs['OutScales'] = scales_node
         attrs = {
@@ -343,7 +364,12 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
+        _init_var_node(
+            scale_in_node,
+            np.array(
+                [0.001], dtype=data_type),
+            self._scope,
+            self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         ins = {'X': var_node, 'InScale': scale_in_node}
@@ -356,13 +382,23 @@ class QuantizationTransformPass(object):
                 shape=[1])
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            self._init_var_node(scale_in_node, np.ones([1], dtype=data_type))
+            _init_var_node(
+                scale_in_node,
+                np.ones(
+                    [1], dtype=data_type),
+                self._scope,
+                self._place)
             accum_in_node = graph.create_persistable_node(
                 name=unique_name.generate('accum'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 var_dtype=var_node.dtype(),
                 shape=[1])
-            self._init_var_node(accum_in_node, np.ones([1], dtype=data_type))
+            _init_var_node(
+                accum_in_node,
+                np.ones(
+                    [1], dtype=data_type),
+                self._scope,
+                self._place)
             state_out_node = graph.create_var_node_from_desc(state_in_node.var(
             ))
             accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
@@ -482,16 +518,6 @@ class QuantizationTransformPass(object):
         graph.link_to(dequant_op_node, dequant_var_node)
         return dequant_var_node
 
-    def _init_var_node(self, var_node, value):
-        assert isinstance(
-            value, np.ndarray), 'The type of value should be numpy array.'
-        assert self._scope is not None, \
-        'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-        assert self._place is not None, \
-        'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
-        tensor = self._scope.var(var_node.name()).get_tensor()
-        tensor.set(value, self._place)
-
     def _quantized_var_name(self, var_name):
         """
         Return quantized variable name for the input `var_name`.
@@ -594,8 +620,8 @@ class QuantizationFreezePass(object):
                                                     self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
                 else:
-                    scale_v = self._to_node(op_node.outputs,
-                                            op_node.output('OutScale')[0])
+                    scale_v = graph._find_node_by_name(
+                        op_node.outputs, op_node.output('OutScale')[0])
                     self._var_scale_map[input_arg_name] = scale_v
 
         ops = graph.all_op_nodes()
@@ -627,8 +653,8 @@ class QuantizationFreezePass(object):
         return graph
 
     def _remove_fake_quant_and_dequant_op(self, graph, op_node):
-        k = self._to_node(op_node.outputs, op_node.output('Out')[0])
-        v = self._to_node(op_node.inputs, op_node.input('X')[0])
+        k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0])
+        v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0])
         if v.node not in self._op_input_rename_map:
             self._op_input_rename_map[k.node] = v
         else:
@@ -663,8 +689,8 @@ class QuantizationFreezePass(object):
             raise ValueError("Only support one output, but op %s has"
                              " more than one output." % (op_node.name()))
 
-        output_var_node = self._to_node(op_node.outputs,
-                                        op_node.output_arg_names()[0])
+        output_var_node = graph._find_node_by_name(
+            op_node.outputs, op_node.output_arg_names()[0])
         weight_scale_node = graph.create_persistable_node(
             name=unique_name.generate('channel_scale'),
             var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -672,7 +698,9 @@ class QuantizationFreezePass(object):
             var_dtype=output_var_node.dtype())
         data_type = 'float64' if output_var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(weight_scale_node, channel_scale.astype(data_type))
+        _init_var_node(weight_scale_node,
+                       channel_scale.astype(data_type), self._scope,
+                       self._place)
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -724,8 +752,8 @@ class QuantizationFreezePass(object):
             raise ValueError("Only support one output, but op %s has"
                              " more than one output." % (op_node.name()))
 
-        output_var_node = self._to_node(op_node.outputs,
-                                        op_node.output_arg_names()[0])
+        output_var_node = graph._find_node_by_name(
+            op_node.outputs, op_node.output_arg_names()[0])
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -746,24 +774,6 @@ class QuantizationFreezePass(object):
         self._op_output_rename_map[output_var_node.node] = dequant_var_node
         return dequant_var_node
 
-    def _init_var_node(self, var_node, value):
-        assert isinstance(
-            value, np.ndarray), 'The type of value should be numpy array.'
-        assert self._scope is not None, \
-        'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-        assert self._place is not None, \
-        'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
-        tensor = self._scope.var(var_node.name()).get_tensor()
-        tensor.set(value, self._place)
-
-    def _to_node(self, nodes, node_name):
-        target_node = None
-        for n in nodes:
-            if n.name() == node_name:
-                target_node = n
-        assert target_node is not None, "Cannot find the target node in the giving set."
-        return target_node
-
     def _load_var(self, name):
         return np.array(self._scope.find_var(name).get_tensor())
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
index 6812b4c633..da3510de39 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy):
                  activation_bits=8,
                  weight_bits=8,
                  activation_quantize_type='abs_max',
+                 weight_quantize_type='abs_max',
                  save_in_nodes=None,
                  save_out_nodes=None):
         """
         Args:
             start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
             end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
-            float_model_save_path(str): The path to save model with float weights. 
+            float_model_save_path(str): The path to save model with float weights.
                             None means it doesn't save float model. defalut: None.
             mobile_model_save_path(str): The path to save model for paddle-mobile execution.
                             None means it doesn't save mobile model. defalut: None.
@@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy):
                 dynamically each step in both training and testing period. If use
                 'range_abs_max', a static quantization scale will be calculated
                 during training and used in inference.
-            save_in_nodes(list<str>): A list of variable names used to prune graph 
+            weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
+            The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained.
+            save_in_nodes(list<str>): A list of variable names used to prune graph
                                       for saving inference model.
-            save_out_nodes(list<str>): A list of variable names used to prune graph 
+            save_out_nodes(list<str>): A list of variable names used to prune graph
                                       for saving inference model.
 
         """
@@ -81,6 +84,7 @@ class QuantizationStrategy(Strategy):
         self.activation_bits = activation_bits
         self.weight_bits = weight_bits
         self.activation_quantize_type = activation_quantize_type
+        self.weight_quantize_type = weight_quantize_type
         self.save_out_nodes = save_out_nodes
         self.save_in_nodes = save_in_nodes
 
@@ -100,7 +104,8 @@ class QuantizationStrategy(Strategy):
                 place=context.place,
                 weight_bits=self.weight_bits,
                 activation_bits=self.activation_bits,
-                activation_quantize_type=self.activation_quantize_type)
+                activation_quantize_type=self.activation_quantize_type,
+                weight_quantize_type=self.weight_quantize_type)
             transform_pass.apply(train_ir_graph)
             transform_pass.apply(test_ir_graph)
 
@@ -134,7 +139,8 @@ class QuantizationStrategy(Strategy):
                 scope=context.scope,
                 place=context.place,
                 weight_bits=self.weight_bits,
-                activation_bits=self.activation_bits)
+                activation_bits=self.activation_bits,
+                weight_quantize_type=self.weight_quantize_type)
             freeze_pass.apply(test_ir_graph)
 
             # for other strategies
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
index f29eb53f88..a3a5a724fb 100644
--- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
@@ -35,6 +35,8 @@ strategies:
         start_epoch: 0
         end_epoch: 0
         float_model_save_path: './output/float'
+        mobile_model_save_path: './output/mobile'
+        int8_model_save_path: './output/int8'
         weight_bits: 8
         activation_bits: 8
         weight_quantize_type: 'abs_max'
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index c7feca0b82..e896f8bb42 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
             place=place,
             activation_quantize_type=activation_quant_type,
             weight_quantize_type=weight_quant_type)
-        #transform_pass = QuantizationTransformPass(
-        #    scope=scope, place=place, activation_quantize_type=activation_quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
@@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
         # Freeze graph for inference, but the weight of fc/conv is still float type.
         freeze_pass = QuantizationFreezePass(
             scope=scope, place=place, weight_quantize_type=weight_quant_type)
-        #freeze_pass = QuantizationFreezePass(scope=scope, place=place)
         freeze_pass.apply(test_graph)
         if not for_ci:
             marked_nodes = set()
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 5ac2b50a99..a209f389f3 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -104,14 +104,14 @@ def cuda_places(device_ids=None):
     :code:`FLAGS_selected_gpus=0,1,2`, the returned list would
     be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
     If :code:`FLAGS_selected_gpus` is not set, all visible
-    gpu places would be returned.  
+    gpu places would be returned.
 
     If :code:`device_ids` is not None, it should be the device
-    ids of gpus. For example, if :code:`device_ids=[0,1,2]`, 
-    the returned list would be 
+    ids of gpus. For example, if :code:`device_ids=[0,1,2]`,
+    the returned list would be
     [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
-    
-    Args: 
+
+    Args:
         device_ids (None|list(int)|tuple(int)): gpu device id list.
 
     Returns:
@@ -133,11 +133,11 @@ def cuda_places(device_ids=None):
 def cpu_places(device_count=None):
     '''
     Create a list of :code:`fluid.CPUPlace` objects.
-    
+
     If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
+    be determined by environment variable :code:`CPU_NUM`.
     If :code:`CPU_NUM` is not set, the device count would
-    be determined by :code:`multiprocessing.cpu_count()`. 
+    be determined by :code:`multiprocessing.cpu_count()`.
 
     Args:
         device_count (None|int): device number.
@@ -155,9 +155,9 @@ def cuda_pinned_places(device_count=None):
     Create a list of :code:`fluid.CUDAPinnedPlace` objects.
 
     If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
+    be determined by environment variable :code:`CPU_NUM`.
     If :code:`CPU_NUM` is not set, the device count would
-    be determined by :code:`multiprocessing.cpu_count()`. 
+    be determined by :code:`multiprocessing.cpu_count()`.
 
     Args:
         device_count (None|int): device number.
@@ -2164,40 +2164,6 @@ class IrGraph(object):
         """
         return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
 
-    def _find_var_node(self, key):
-        """
-        Get a variable node by the `key` from this graph. The key
-        can be a node name or a node id.
-
-        WARNS:
-            There are some nodes may have the same name. So, be
-            cautious about using this method when you find the
-            target var node by its name.
-
-        Args:
-            key(str|int): The str type denotes that the target variable node's name.
-            And the int type denotes that the target variable node's id.
-
-        Raises:
-            ValueError: If this graph doesn't have a variable with the giving name or id.
-
-        Returns:
-            IrVarNode: the variable node with the giving name or id.
-        """
-        target_var_node = None
-        var_nodes = self.all_var_nodes()
-        if isinstance(key, six.string_types):
-            for var_node in var_nodes:
-                if var_node.name() == key:
-                    target_var_node = var_node
-        elif isinstance(key, int):
-            for var_node in var_nodes:
-                if var_node.id() == key:
-                    target_var_node = var_node
-        if target_var_node is None:
-            raise ValueError("var_node %s not in this graph" % key)
-        return target_var_node
-
     def create_persistable_node(self, name, var_type, shape, var_dtype):
         """
         Create a persistable variable node in the graph. In IrGraph,
@@ -2342,14 +2308,6 @@ class IrGraph(object):
         core.graph_safe_remove_nodes(self.graph, original_nodes)
 
     def resolve_hazard(self):
-        def _to_node(nodes, node_name):
-            target_node = None
-            for n in nodes:
-                if n.name() == node_name:
-                    target_node = n
-            assert target_node is not None, "Cannot find the target node in the giving set."
-            return target_node
-
         ordered_nodes = core.topology_sort(self.graph)
         var_nodes = dict()
         for node in ordered_nodes:
@@ -2357,16 +2315,17 @@ class IrGraph(object):
                 for each_var_name in node.op().input_arg_names():
                     if each_var_name not in var_nodes:
                         var_nodes[each_var_name] = [
-                            _to_node(node.inputs, each_var_name)
+                            self._find_node_by_name(node.inputs, each_var_name)
                         ]
                 for each_var_name in node.op().output_arg_names():
                     if each_var_name not in var_nodes:
                         var_nodes[each_var_name] = [
-                            _to_node(node.outputs, each_var_name)
+                            self._find_node_by_name(node.outputs, each_var_name)
                         ]
                     else:
                         var_nodes[each_var_name].append(
-                            _to_node(node.outputs, each_var_name))
+                            self._find_node_by_name(node.outputs,
+                                                    each_var_name))
         self.graph.resolve_hazard(var_nodes)
 
     def has_circle(self):
@@ -2479,6 +2438,17 @@ class IrGraph(object):
         program = Program._construct_from_desc(desc)
         return program
 
+    def _find_node_by_name(self, nodes, node_name):
+        """
+        Find a node in the giving nodes set by the name.
+        """
+        target_node = None
+        for n in nodes:
+            if n.name() == node_name:
+                target_node = n
+        assert target_node is not None, "Cannot find the target node in the giving set."
+        return target_node
+
     def _update_desc_attr(self, desc, name, val):
         """
         Update the value of desc's attribute by attribute's name.

From 6b854f3e1f4412b5726197bc336754f163148cd8 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Wed, 27 Mar 2019 17:19:10 +0800
Subject: [PATCH 03/19] fix the save_in_nodes bug.

---
 .../fluid/contrib/slim/quantization/quantization_strategy.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
index da3510de39..aa50891121 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -158,7 +158,7 @@ class QuantizationStrategy(Strategy):
                 ]
 
             if self.save_in_nodes == None:
-                in_vars = list(context.eval_graph.out_nodes.values())
+                in_vars = list(context.eval_graph.in_nodes.values())
             else:
                 in_vars = self.save_in_nodes
 

From 63651c1968ac7f5694e8bce3f23be465ad57a895 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 27 Mar 2019 10:47:29 +0000
Subject: [PATCH 04/19] fix grad desc maker test=develop

---
 .../framework/details/reference_count_pass.cc |   1 +
 paddle/fluid/operators/bpr_loss_op.cc         |  20 +-
 .../operators/controlflow/CMakeLists.txt      |   2 +-
 .../fluid/operators/controlflow/while_op.cc   |  21 +-
 .../operators/controlflow/while_op_helper.cc  | 291 ------------------
 .../operators/controlflow/while_op_helper.h   |  43 ---
 .../detection/roi_perspective_transform_op.cc |  21 +-
 .../gaussian_random_batch_size_like_op.cc     |  10 +-
 paddle/fluid/operators/im2sequence_op.cc      |  19 +-
 paddle/fluid/operators/interpolate_op.cc      |  34 +-
 paddle/fluid/operators/l1_norm_op.cc          |  19 +-
 paddle/fluid/operators/label_smooth_op.cc     |  24 +-
 paddle/fluid/operators/linear_chain_crf_op.cc |  39 ++-
 paddle/fluid/operators/log_loss_op.cc         |  20 +-
 paddle/fluid/operators/lstm_op.cc             |  41 ++-
 paddle/fluid/operators/margin_rank_loss_op.cc |  22 +-
 paddle/fluid/operators/mean_op.cc             |   8 +-
 paddle/fluid/operators/multiplex_op.cc        |  34 +-
 paddle/fluid/operators/multiplex_op.cu        |  11 +-
 paddle/fluid/operators/multiplex_op.h         |  11 +-
 paddle/fluid/operators/pad_op.cc              |  21 +-
 paddle/fluid/operators/psroi_pool_op.cc       |  20 +-
 paddle/fluid/operators/rank_loss_op.cc        |  20 ++
 paddle/fluid/operators/recurrent_op.cc        |  52 ++--
 paddle/fluid/operators/roi_align_op.cc        |  20 +-
 paddle/fluid/operators/roi_pool_op.cc         |  21 +-
 paddle/fluid/operators/scatter_op.cc          |  34 +-
 paddle/fluid/operators/shuffle_channel_op.cc  |  20 +-
 28 files changed, 473 insertions(+), 426 deletions(-)
 delete mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc
 delete mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h

diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 0c3d8d5cae..c218e55b70 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -335,6 +335,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
                        var_name);
         ref_cnts[i].emplace(var_name, result.size());
         last_live_ops_of_vars[i].emplace(var_name, std::move(result));
+        break;
       }
 
       // Seldomly, all preceding trying failed.
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index f349c51d8a..b2dbaecfcf 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/bpr_loss_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939)
 )DOC");
   }
 };
+
+class BprLossGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("bpr_loss_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Label", Input("Label"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -134,7 +152,7 @@ namespace ops = paddle::operators;
 using CPUCtx = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::BprLossGradDescMaker);
 REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
 REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
                        ops::BprLossOpKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index 7aa1c44eaa..4782e9d5ff 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,5 +1,5 @@
 include(operators)
 register_operators(DEPS naive_executor)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
+cc_library(loop_op_helper SRCS loop_op_helper.cc DEPS operator) 
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index deb8ec3bb2..58fe354958 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -18,28 +18,21 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
+#include "paddle/fluid/operators/controlflow/loop_op_helper.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
 
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";
+
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
-namespace {  // NOLINT
-static std::string GetSkipEagerDeletionVarsDebugString(
-    const std::vector<std::string> &vars) {
-  std::string str = "Skip " + std::to_string(vars.size()) +
-                    " var(s) in eager deletion mode: ";
-  for (auto &var : vars) {
-    str.append(var);
-    str.push_back(' ');
-  }
-  return str;
-}
-}  // NOLINT
-
 class WhileOp : public framework::OperatorBase {
  public:
   WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
deleted file mode 100644
index 2cbd94a061..0000000000
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace operators {
-
-// OpVariant is a wrapper class of OpDesc and OperatorBase
-// So that API would be the same.
-class OpVariant {
-  struct InputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Inputs());
-    }
-  };
-
-  struct OutputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Outputs());
-    }
-  };
-
-  struct AttributeMapVisitor
-      : public boost::static_visitor<const framework::AttributeMap *> {
-    const framework::AttributeMap *operator()(
-        const framework::OpDesc *op) const {
-      return &(op->GetAttrMap());
-    }
-
-    const framework::AttributeMap *operator()(
-        const framework::OperatorBase *op) const {
-      return &(op->Attrs());
-    }
-  };
-
-  struct RawPointerVisitor : public boost::static_visitor<const void *> {
-    template <typename OpType>
-    const void *operator()(const OpType *op) const {
-      return op;
-    }
-  };
-
- public:
-  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
-
-  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
-
-  const framework::VariableNameMap &Inputs() const {
-    return *boost::apply_visitor(InputsVisitor(), op_);
-  }
-
-  const framework::VariableNameMap &Outputs() const {
-    return *boost::apply_visitor(OutputsVisitor(), op_);
-  }
-
-  const framework::AttributeMap &Attrs() const {
-    return *boost::apply_visitor(AttributeMapVisitor(), op_);
-  }
-
-  template <typename AttrType>
-  const AttrType &Attr(const std::string &name) const {
-    auto &attrs = Attrs();
-    auto it = attrs.find(name);
-    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
-    return boost::get<AttrType>(it->second);
-  }
-
-  bool operator==(const OpVariant &other) const {
-    return RawPointer() == other.RawPointer();
-  }
-
-  const void *RawPointer() const {
-    return boost::apply_visitor(RawPointerVisitor(), op_);
-  }
-
-  int which() const { return static_cast<int>(op_.which()); }
-
-  struct Hasher {
-    size_t operator()(const OpVariant &op) const {
-      return reinterpret_cast<size_t>(op.RawPointer());
-    }
-  };
-
- private:
-  const boost::variant<const framework::OperatorBase *,
-                       const framework::OpDesc *>
-      op_;
-};
-
-static std::string GetDebugString(const std::vector<std::string> &names) {
-  if (names.empty()) return "";
-  std::string ret = names[0];
-  for (size_t i = 1; i < names.size(); ++i) {
-    ret += (" " + names[i]);
-  }
-  return ret;
-}
-
-// Set skip variables of while_op and while_grad_op
-// These variables should be skipped when eager deletion enables.
-// It is because:
-//  1. while_grad_op needs some variables defined in while_op.
-//  2. while_grad_op needs variables from the previous time step.
-static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
-  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
-  VLOG(2) << "Prepare to skip " << attr.size()
-          << " var(s): " << GetDebugString(attr);
-  attrs[kSkipEagerDeletionVars] = std::move(attr);
-}
-
-// Check whether the forward while_op and while_grad_op match
-// The program may have many while_ops.
-static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
-                                           const OpVariant &grad_op) {
-  return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
-         fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
-}
-
-// Test whether the variable is skippable in forward while_op
-// The variable is skippable in while_op when the variable used in while_grad
-// is not from grad_block.
-static bool IsSkippableVar(const std::string &name,
-                           framework::BlockDesc *grad_block) {
-  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
-}
-
-static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
-                                            const OpVariant &bwd_op) {
-  auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
-
-  // Find all skippable variables in forward while_op
-  std::unordered_set<std::string> forward_skip_vars;
-  for (auto *op_desc : grad_block->AllOps()) {
-    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-      if (IsSkippableVar(in_arg_name, grad_block)) {
-        forward_skip_vars.insert(in_arg_name);
-      }
-    }
-
-    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-      if (IsSkippableVar(out_arg_name, grad_block)) {
-        forward_skip_vars.insert(out_arg_name);
-      }
-    }
-  }
-
-  SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
-                                               forward_skip_vars.end()));
-
-  // Find all skippable variables in while_grad_op
-  // The skipped variables are those which would be used across time steps.
-  auto &fwd_input = fwd_op.Inputs().at(kX);
-  auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
-  PADDLE_ENFORCE_EQ(
-      fwd_input.size(), in_grads.size(),
-      "Backward input gradient number does not match forward input number.");
-
-  std::unordered_set<std::string> backward_skip_vars;
-  for (size_t i = 0; i < in_grads.size(); ++i) {
-    if (in_grads[i] == framework::kEmptyVarName) {
-      continue;
-    }
-    backward_skip_vars.insert(in_grads[i]);
-    backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
-  }
-
-  SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
-                                               backward_skip_vars.end()));
-}
-
-// Find all while_ops and while_grad_ops in the graph or program
-// The while_grad_op and while_op may located in different blocks
-// So we should traverse all blocks in the program and find them out.
-static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops,
-                                       std::vector<OpVariant> *while_grad_ops) {
-  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
-
-  if (while_ops->empty()) return;
-
-  const auto *program =
-      while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program();
-  for (size_t i = 1; i < program->Size(); ++i) {
-    auto &block = program->Block(i);
-    for (size_t j = 0; j < block.OpSize(); ++j) {
-      auto *op = block.Op(j);
-      if (op->Type() == "while") {
-        while_ops->emplace_back(op);
-      } else if (op->Type() == "while_grad") {
-        while_grad_ops->emplace_back(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
-                    "There are extra while_grad ops in the graph or program");
-}
-
-static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
-    std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) {
-  FindAllWhileAndWhileGradOp(while_ops, while_grad_ops);
-
-  VLOG(2) << "Found while op num: " << while_ops->size()
-          << ", while grad op num: " << while_grad_ops->size();
-
-  if (while_grad_ops->empty()) {
-    return;
-  }
-
-  std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
-      while_ops->begin(), while_ops->end());
-
-  for (auto &bwd_op : *while_grad_ops) {
-    const OpVariant *matched_fwd_op = nullptr;
-    for (auto &fwd_op : while_op_set) {
-      if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
-        PADDLE_ENFORCE(matched_fwd_op == nullptr,
-                       "Found multiple matched while ops");
-        matched_fwd_op = &fwd_op;
-      }
-    }
-    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
-                            "Cannot find matched forward while op.");
-    ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
-    while_op_set.erase(*matched_fwd_op);
-  }
-}
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
-  // If block_id is not 0, returns
-  // This is because all while_ops and while_grad_ops in the whole program
-  // would be processed when block_id is 0 (i.e. when Executor::Run() or
-  // ParallelExecutor constructs).
-
-  // What's more, all while_ops and while_grad_ops must be processed when
-  // block_id is zero. If not, while_op may run first and erase variables
-  // used in while_grad_op, and in this moment, while_grad_ops may be not
-  // constructed yet.
-  if (block_id != 0) return;
-
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  for (auto &op : all_ops) {
-    if (op->Type() == "while") {
-      fwd_ops.emplace_back(op.get());
-    } else if (op->Type() == "while_grad") {
-      bwd_ops.emplace_back(op.get());
-    }
-  }
-  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
-}
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const std::vector<framework::OperatorBase *> &while_ops,
-    const std::vector<framework::OperatorBase *> &while_grad_ops) {
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  fwd_ops.reserve(while_ops.size());
-  for (auto *op : while_ops) {
-    fwd_ops.emplace_back(op);
-  }
-
-  bwd_ops.reserve(while_grad_ops.size());
-  for (auto *op : while_grad_ops) {
-    bwd_ops.emplace_back(op);
-  }
-
-  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
deleted file mode 100644
index 456ba8642b..0000000000
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr char kStepBlock[] = "sub_block";
-static constexpr char kCondition[] = "Condition";
-static constexpr char kStepScopes[] = "StepScopes";
-static constexpr char kX[] = "X";
-static constexpr char kXGRAD[] = "X@GRAD";
-static constexpr char kOutputs[] = "Out";
-static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const std::vector<framework::OperatorBase *> &while_ops,
-    const std::vector<framework::OperatorBase *> &while_grad_ops);
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
index a97828e6fe..5b84221cfa 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include <memory>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker
   }
 };
 
+class ROIPerspectiveTransformGradDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("roi_perspective_transform_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("ROIs", Input("ROIs"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
                   ops::ROIPerspectiveTransformOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ROIPerspectiveTransformGradDescMaker);
 REGISTER_OPERATOR(roi_perspective_transform_grad,
                   ops::ROIPerspectiveTransformGradOp);
 REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 4a97428148..98ebe1fdf4 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -65,11 +65,17 @@ by input arguments.
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
+    GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input");
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(
+REGISTER_OPERATOR(
     gaussian_random_batch_size_like,
     paddle::operators::GaussianRandomBatchSizeLikeOp,
-    paddle::operators::GaussianRandomBatchSizeLikeOpMaker);
+    paddle::operators::GaussianRandomBatchSizeLikeOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference);
+
 // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 8efd43928a..44fd95edef 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("im2sequence_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::Im2SequenceGradDescMaker);
 REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
 REGISTER_OP_CPU_KERNEL(
     im2sequence,
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 10d01af982..cfded65f0b 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -194,21 +194,43 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
   }
 };
 
+class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType(ForwardOp().Type() + "_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference,
+                                      "X");
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad);
+                  ops::InterpolateGradDescMaker);
+REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad,
+                  ops::InterpolateGradNoNeedBufferVarsInference);
 REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad);
+                  ops::InterpolateGradDescMaker);
+REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad,
+                  ops::InterpolateGradNoNeedBufferVarsInference);
 REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
                        ops::InterpolateKernel<double>,
                        ops::InterpolateKernel<uint8_t>);
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index bc115090ac..2696d0bef9 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/l1_norm_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$
   }
 };
 
+class L1NormGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("l1_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::L1NormGradDescMaker);
 REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index da59bd53bc..6d0af57318 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <memory>
 #include <string>
 
 namespace paddle {
@@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
+  }
+};
+
+class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("label_smooth_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
   }
 };
 
@@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LabelSmoothGradDescMaker);
 REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
 REGISTER_OP_CPU_KERNEL(
     label_smooth,
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index e17b6cb598..fa09cb61e6 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/linear_chain_crf_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("linear_chain_crf_grad");
+    op->SetAttrMap(Attrs());
+
+    op->SetInput("Emission", Input("Emission"));
+    op->SetInput("Transition", Input("Transition"));
+    op->SetInput("Label", Input("Label"));
+
+    op->SetInput("Alpha", Output("Alpha"));
+    op->SetInput("EmissionExps", Output("EmissionExps"));
+    op->SetInput("TransitionExps", Output("TransitionExps"));
+
+    op->SetInput(framework::GradVarName("LogLikelihood"),
+                 OutputGrad("LogLikelihood"));
+
+    op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission"));
+    op->SetOutput(framework::GradVarName("Transition"),
+                  InputGrad("Transition"));
+
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
+    LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission");
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
-                  ops::LinearChainCRFOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp);
+                  ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker);
+REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp,
+                  ops::LinearChainCRFGradNoNeedBufferVarsInference);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf,
     ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index ef1fb83aa6..e8850a1e58 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/log_loss_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class LogLossGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("log_loss_grad");
+    op->SetInput("Predicted", Input("Predicted"));
+    op->SetInput("Labels", Input("Labels"));
+    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LogLossGradDescMaker);
 REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 4a199d681f..30c3945cbb 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -264,12 +264,51 @@ class LSTMGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("lstm_grad");
+    op->SetAttrMap(Attrs());
+    op->SetInput("Input", Input("Input"));
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+
+    if (ForwardOp().Inputs().count("H0") > 0) {
+      op->SetInput("H0", Input("H0"));
+      op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
+    }
+
+    if (ForwardOp().Inputs().count("C0") > 0) {
+      op->SetInput("C0", Input("C0"));
+      op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
+    }
+
+    op->SetInput("Weight", Input("Weight"));
+    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+
+    op->SetInput("Bias", Input("Bias"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    op->SetInput("Cell", Output("Cell"));
+
+    op->SetInput("Hidden", Output("Hidden"));
+    op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
+
+    op->SetInput("BatchGate", Output("BatchGate"));
+    op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LSTMGradOpDescMaker);
 REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b643ba9d7f..b3d9733a97 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -94,8 +94,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("Activated"),
@@ -106,13 +104,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("margin_rank_loss_grad");
+    op->SetInput("Activated", Output("Activated"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Label", Input("Label"));
+    op->SetOutput(framework::GradVarName("X1"), InputGrad("X1"));
+    op->SetOutput(framework::GradVarName("X2"), InputGrad("X2"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
                   ops::MarginRankLossOpMaker<float>,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::MarginRankLossGradDescMaker);
 REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss,
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 35b6d7b5e3..26d86afed0 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -61,7 +61,8 @@ class MeanGradOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("X")->type();
+    auto input_data_type =
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -81,13 +82,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X");
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
                   ops::MeanGradMaker);
-REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
+                  ops::MeanGradNoNeedBufferVarsInference);
 REGISTER_OP_CPU_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 1801f2915e..b3d0423b72 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/multiplex_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -111,28 +112,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null.");
-    PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(),
-                   "Output(X@Grad) should not be null.");
+    auto& dxs = ctx->Outputs(framework::GradVarName("X"));
+    PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null.");
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+    auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
+    ctx->SetOutputsDim(framework::GradVarName("X"),
+                       std::vector<framework::DDim>(dxs.size(), dout_dim));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
+  }
+};
+
+class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("multiplex_grad");
+    op->SetInput("Ids", Input("Ids"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
+    op->SetAttrMap(Attrs());
+    return op;
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
+
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<false>);
+                  ops::MultiplexGradDescMaker);
 REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
 REGISTER_OP_CPU_KERNEL(
     multiplex,
diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu
index 2f8a602f3c..1ef54ecc73 100644
--- a/paddle/fluid/operators/multiplex_op.cu
+++ b/paddle/fluid/operators/multiplex_op.cu
@@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<Tensor>("X");
     auto* ids = ctx.Input<Tensor>("Ids");
     auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
+
+    size_t idx = -1UL;
     for (size_t i = 0; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
         t.device(*ctx.template device_context<Place>().eigen_device()) =
             t.constant(static_cast<T>(0));
+
+        idx = i;
       }
     }
 
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
+    if (idx == -1UL) return;
+
+    auto rows = d_ins[idx]->dims()[0];
+    auto cols = d_ins[idx]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
     TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h
index 87de000971..44d6cc84a6 100644
--- a/paddle/fluid/operators/multiplex_op.h
+++ b/paddle/fluid/operators/multiplex_op.h
@@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* ids = ctx.Input<framework::Tensor>("Ids");
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
     auto d_ins =
         ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+
+    size_t idx = -1UL;
     for (size_t i = 0; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
         t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
             t.constant(static_cast<T>(0));
+
+        idx = i;
       }
     }
 
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
+    if (idx == -1UL) return;
+
+    auto rows = d_ins[idx]->dims()[0];
+    auto cols = d_ins[idx]->numel() / rows;
     auto* index = ids->data<int32_t>();
     platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index d4b631a6f5..c28106d312 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pad_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel {
                    "Output(Out) of PadOp should not be null.");
 
     auto x_dim = ctx->GetInputDim("X");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
                       "Size of paddings should be equal to 2 * dimension size "
                       "of input tensor.");
@@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
+    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    for (int i = 0; i < dout_dims.size(); ++i) {
+      dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+    }
+
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
+      auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+      auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+      for (int i = 0; i < dout_dims.size(); ++i) {
+        dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+      }
+      ctx->SetOutputDim(x_grad_name, dout_dims);
     }
   }
 };
@@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker {
  protected:
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* bind = new framework::OpDesc();
-    bind->SetInput("X", Input("X"));
     bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     bind->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index 78989582b7..dce9108eb1 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/psroi_pool_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("psroi_pool_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("ROIs", Input("ROIs"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::PSROIPoolGradDescMaker);
 REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     psroi_pool,
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index 313cf01541..45daa6b955 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <memory>
 #include <string>
 
 namespace paddle {
@@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class RankLossGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("rank_loss_grad");
+    op->SetInput("Label", Input("Label"));
+    op->SetInput("Left", Input("Left"));
+    op->SetInput("Right", Input("Right"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("Left"), InputGrad("Left"));
+    op->SetOutput(framework::GradVarName("Right"), InputGrad("Right"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 2898a62ddb..45c87bb085 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -15,24 +15,24 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/controlflow/loop_op_helper.h"
 
 namespace paddle {
 namespace operators {
-constexpr char kInputs[] = "inputs";
-constexpr char kInitialStates[] = "initial_states";
-constexpr char kParameters[] = "parameters";
-constexpr char kOutputs[] = "outputs";
-constexpr char kStepScopes[] = "step_scopes";
-constexpr char kExStates[] = "ex_states";
-constexpr char kStates[] = "states";
-constexpr char kStepBlock[] = "sub_block";
-constexpr char kReverse[] = "reverse";
-constexpr char kIsTrain[] = "is_train";
-#define GRAD_SUFFIX "@GRAD"
-constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
-constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
-constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
-constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
+
+using recurrent::kInputs;
+using recurrent::kInitialStates;
+using recurrent::kParameters;
+using recurrent::kOutputs;
+using recurrent::kStepScopes;
+using recurrent::kExStates;
+using recurrent::kStates;
+using recurrent::kReverse;
+using recurrent::kIsTrain;
+using recurrent::kInputGrads;
+using recurrent::kOutputGrads;
+using recurrent::kParamGrads;
+using recurrent::kInitStateGrads;
 
 using StepScopeVar = std::vector<framework::Scope *>;
 
@@ -249,6 +249,9 @@ class RecurrentOp : public RecurrentBase {
     framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
+    auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars);
+
     auto *program = block->Program();
 
     for (size_t i = 0; i < seq_len; ++i) {
@@ -283,8 +286,7 @@ class RecurrentOp : public RecurrentBase {
       // Every inputs are linked now, execute!
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/, true /*create_vars*/,
-                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
-                   true /*force_disable_gc*/);
+                   keep_vars);
 
       // get device context from pool
       platform::DeviceContextPool &pool =
@@ -341,6 +343,9 @@ class RecurrentGradOp : public RecurrentBase {
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
     auto *program = block->Program();
+    auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars);
 
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
@@ -401,8 +406,7 @@ class RecurrentGradOp : public RecurrentBase {
       // Run step block with cur_scope
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/, true /*create_vars*/,
-                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
-                   true /*force_disable_gc*/);
+                   keep_vars);
 
       VLOG(5) << "executor.Run finished ";
 
@@ -579,6 +583,10 @@ if reverse is True
       o          o          o         o
 )DOC").SetDefault(false);
     AddAttr<bool>(kIsTrain, "").SetDefault(true);
+    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
+                                      "Skip vars that would "
+                                      "be used in backward ops")
+        .SetDefault(std::vector<std::string>());
     AddComment(R"DOC(
 Static Length Recurrent Operator.
 
@@ -614,7 +622,11 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
                        this->OutputGrad(output_param));
       }
     }
-    grad->SetAttrMap(this->Attrs());
+
+    auto attrs = this->Attrs();
+    attrs.insert({kSkipEagerDeletionVars, std::vector<std::string>()});
+    grad->SetAttrMap(attrs);
+
     grad->SetBlockAttr(kStepBlock, grad_block_[0]);
 
     return std::unique_ptr<framework::OpDesc>(grad);
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 6857b5ed9d..7bb10ce063 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/roi_align_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -147,12 +148,29 @@ Thus avoid the misaligned problem.
   }
 };
 
+class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("roi_align_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("ROIs", Input("ROIs"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ROIAlignGradDescMaker);
 REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_align,
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index e46d92d6fc..cfac7e09e1 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/roi_pool_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
   }
 };
 
+class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("roi_pool_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("ROIs", Input("ROIs"));
+    op->SetInput("Argmax", Output("Argmax"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ROIPoolGradDescMaker);
 REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_pool,
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index ad418d51bc..1c26707500 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -63,14 +63,16 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
   }
 };
 
@@ -95,12 +97,34 @@ $$
   }
 };
 
+class ScatterGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("scatter_grad");
+    op->SetInput("Ids", Input("Ids"));
+    op->SetInput("Updates", Input("Updates"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference,
+                                      "Updates");
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp);
+                  ops::ScatterGradDescMaker);
+REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp,
+                  ops::ScatterGradNoNeedBufferVarsInference);
 REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc
index 9349912e09..26355e5861 100644
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ b/paddle/fluid/operators/shuffle_channel_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/shuffle_channel_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("shuffle_channel_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
-                  ops::ShuffleChannelOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker);
 
 REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);
 

From c4c6205268d7863b714334dcdcdd31e1576e540d Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 27 Mar 2019 21:18:48 +0800
Subject: [PATCH 05/19] fix gc bug test=develop

---
 paddle/fluid/framework/details/reference_count_pass.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 0c3d8d5cae..c218e55b70 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -335,6 +335,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
                        var_name);
         ref_cnts[i].emplace(var_name, result.size());
         last_live_ops_of_vars[i].emplace(var_name, std::move(result));
+        break;
       }
 
       // Seldomly, all preceding trying failed.

From 4c8254e3bf426a044ef51d661193bd9a720dc204 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 27 Mar 2019 10:53:01 +0000
Subject: [PATCH 06/19] revert some loop op revision test=develop

---
 .../operators/controlflow/CMakeLists.txt      |   2 +-
 .../fluid/operators/controlflow/while_op.cc   |  21 +-
 .../operators/controlflow/while_op_helper.cc  | 291 ++++++++++++++++++
 .../operators/controlflow/while_op_helper.h   |  43 +++
 paddle/fluid/operators/interpolate_op.cc      |   4 +
 paddle/fluid/operators/lstm_op.cc             |   1 +
 paddle/fluid/operators/margin_rank_loss_op.cc |   1 +
 paddle/fluid/operators/mean_op.cc             |   3 +
 paddle/fluid/operators/multiplex_op.cc        |   1 +
 paddle/fluid/operators/recurrent_op.cc        |  52 ++--
 paddle/fluid/operators/scatter_op.cc          |   1 +
 11 files changed, 380 insertions(+), 40 deletions(-)
 create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc
 create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h

diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index 4782e9d5ff..7aa1c44eaa 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,5 +1,5 @@
 include(operators)
 register_operators(DEPS naive_executor)
-cc_library(loop_op_helper SRCS loop_op_helper.cc DEPS operator) 
+cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index a07a732d88..b321920882 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -18,21 +18,28 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/controlflow/loop_op_helper.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
 
-static constexpr char kCondition[] = "Condition";
-static constexpr char kStepScopes[] = "StepScopes";
-static constexpr char kX[] = "X";
-static constexpr char kXGRAD[] = "X@GRAD";
-static constexpr char kOutputs[] = "Out";
-
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
+namespace {  // NOLINT
+static std::string GetSkipEagerDeletionVarsDebugString(
+    const std::vector<std::string> &vars) {
+  std::string str = "Skip " + std::to_string(vars.size()) +
+                    " var(s) in eager deletion mode: ";
+  for (auto &var : vars) {
+    str.append(var);
+    str.push_back(' ');
+  }
+  return str;
+}
+}  // NOLINT
+
 class WhileOp : public framework::OperatorBase {
  public:
   WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
new file mode 100644
index 0000000000..2cbd94a061
--- /dev/null
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -0,0 +1,291 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace operators {
+
+// OpVariant is a wrapper class of OpDesc and OperatorBase
+// So that API would be the same.
+class OpVariant {
+  struct InputsVisitor
+      : public boost::static_visitor<const framework::VariableNameMap *> {
+    template <typename OpType>
+    const framework::VariableNameMap *operator()(const OpType *op) const {
+      return &(op->Inputs());
+    }
+  };
+
+  struct OutputsVisitor
+      : public boost::static_visitor<const framework::VariableNameMap *> {
+    template <typename OpType>
+    const framework::VariableNameMap *operator()(const OpType *op) const {
+      return &(op->Outputs());
+    }
+  };
+
+  struct AttributeMapVisitor
+      : public boost::static_visitor<const framework::AttributeMap *> {
+    const framework::AttributeMap *operator()(
+        const framework::OpDesc *op) const {
+      return &(op->GetAttrMap());
+    }
+
+    const framework::AttributeMap *operator()(
+        const framework::OperatorBase *op) const {
+      return &(op->Attrs());
+    }
+  };
+
+  struct RawPointerVisitor : public boost::static_visitor<const void *> {
+    template <typename OpType>
+    const void *operator()(const OpType *op) const {
+      return op;
+    }
+  };
+
+ public:
+  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
+
+  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
+
+  const framework::VariableNameMap &Inputs() const {
+    return *boost::apply_visitor(InputsVisitor(), op_);
+  }
+
+  const framework::VariableNameMap &Outputs() const {
+    return *boost::apply_visitor(OutputsVisitor(), op_);
+  }
+
+  const framework::AttributeMap &Attrs() const {
+    return *boost::apply_visitor(AttributeMapVisitor(), op_);
+  }
+
+  template <typename AttrType>
+  const AttrType &Attr(const std::string &name) const {
+    auto &attrs = Attrs();
+    auto it = attrs.find(name);
+    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
+    return boost::get<AttrType>(it->second);
+  }
+
+  bool operator==(const OpVariant &other) const {
+    return RawPointer() == other.RawPointer();
+  }
+
+  const void *RawPointer() const {
+    return boost::apply_visitor(RawPointerVisitor(), op_);
+  }
+
+  int which() const { return static_cast<int>(op_.which()); }
+
+  struct Hasher {
+    size_t operator()(const OpVariant &op) const {
+      return reinterpret_cast<size_t>(op.RawPointer());
+    }
+  };
+
+ private:
+  const boost::variant<const framework::OperatorBase *,
+                       const framework::OpDesc *>
+      op_;
+};
+
+static std::string GetDebugString(const std::vector<std::string> &names) {
+  if (names.empty()) return "";
+  std::string ret = names[0];
+  for (size_t i = 1; i < names.size(); ++i) {
+    ret += (" " + names[i]);
+  }
+  return ret;
+}
+
+// Set skip variables of while_op and while_grad_op
+// These variables should be skipped when eager deletion enables.
+// It is because:
+//  1. while_grad_op needs some variables defined in while_op.
+//  2. while_grad_op needs variables from the previous time step.
+static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  VLOG(2) << "Prepare to skip " << attr.size()
+          << " var(s): " << GetDebugString(attr);
+  attrs[kSkipEagerDeletionVars] = std::move(attr);
+}
+
+// Check whether the forward while_op and while_grad_op match
+// The program may have many while_ops.
+static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
+                                           const OpVariant &grad_op) {
+  return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
+         fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
+}
+
+// Test whether the variable is skippable in forward while_op
+// The variable is skippable in while_op when the variable used in while_grad
+// is not from grad_block.
+static bool IsSkippableVar(const std::string &name,
+                           framework::BlockDesc *grad_block) {
+  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
+}
+
+static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
+                                            const OpVariant &bwd_op) {
+  auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
+
+  // Find all skippable variables in forward while_op
+  std::unordered_set<std::string> forward_skip_vars;
+  for (auto *op_desc : grad_block->AllOps()) {
+    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
+      if (IsSkippableVar(in_arg_name, grad_block)) {
+        forward_skip_vars.insert(in_arg_name);
+      }
+    }
+
+    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
+      if (IsSkippableVar(out_arg_name, grad_block)) {
+        forward_skip_vars.insert(out_arg_name);
+      }
+    }
+  }
+
+  SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
+                                               forward_skip_vars.end()));
+
+  // Find all skippable variables in while_grad_op
+  // The skipped variables are those which would be used across time steps.
+  auto &fwd_input = fwd_op.Inputs().at(kX);
+  auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
+  PADDLE_ENFORCE_EQ(
+      fwd_input.size(), in_grads.size(),
+      "Backward input gradient number does not match forward input number.");
+
+  std::unordered_set<std::string> backward_skip_vars;
+  for (size_t i = 0; i < in_grads.size(); ++i) {
+    if (in_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    backward_skip_vars.insert(in_grads[i]);
+    backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
+  }
+
+  SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
+                                               backward_skip_vars.end()));
+}
+
+// Find all while_ops and while_grad_ops in the graph or program
+// The while_grad_op and while_op may located in different blocks
+// So we should traverse all blocks in the program and find them out.
+static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops,
+                                       std::vector<OpVariant> *while_grad_ops) {
+  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
+
+  if (while_ops->empty()) return;
+
+  const auto *program =
+      while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program();
+  for (size_t i = 1; i < program->Size(); ++i) {
+    auto &block = program->Block(i);
+    for (size_t j = 0; j < block.OpSize(); ++j) {
+      auto *op = block.Op(j);
+      if (op->Type() == "while") {
+        while_ops->emplace_back(op);
+      } else if (op->Type() == "while_grad") {
+        while_grad_ops->emplace_back(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
+                    "There are extra while_grad ops in the graph or program");
+}
+
+static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
+    std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) {
+  FindAllWhileAndWhileGradOp(while_ops, while_grad_ops);
+
+  VLOG(2) << "Found while op num: " << while_ops->size()
+          << ", while grad op num: " << while_grad_ops->size();
+
+  if (while_grad_ops->empty()) {
+    return;
+  }
+
+  std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
+      while_ops->begin(), while_ops->end());
+
+  for (auto &bwd_op : *while_grad_ops) {
+    const OpVariant *matched_fwd_op = nullptr;
+    for (auto &fwd_op : while_op_set) {
+      if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
+        PADDLE_ENFORCE(matched_fwd_op == nullptr,
+                       "Found multiple matched while ops");
+        matched_fwd_op = &fwd_op;
+      }
+    }
+    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
+                            "Cannot find matched forward while op.");
+    ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
+    while_op_set.erase(*matched_fwd_op);
+  }
+}
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
+  // If block_id is not 0, returns
+  // This is because all while_ops and while_grad_ops in the whole program
+  // would be processed when block_id is 0 (i.e. when Executor::Run() or
+  // ParallelExecutor constructs).
+
+  // What's more, all while_ops and while_grad_ops must be processed when
+  // block_id is zero. If not, while_op may run first and erase variables
+  // used in while_grad_op, and in this moment, while_grad_ops may be not
+  // constructed yet.
+  if (block_id != 0) return;
+
+  std::vector<OpVariant> fwd_ops, bwd_ops;
+  for (auto &op : all_ops) {
+    if (op->Type() == "while") {
+      fwd_ops.emplace_back(op.get());
+    } else if (op->Type() == "while_grad") {
+      bwd_ops.emplace_back(op.get());
+    }
+  }
+  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
+}
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    const std::vector<framework::OperatorBase *> &while_ops,
+    const std::vector<framework::OperatorBase *> &while_grad_ops) {
+  std::vector<OpVariant> fwd_ops, bwd_ops;
+  fwd_ops.reserve(while_ops.size());
+  for (auto *op : while_ops) {
+    fwd_ops.emplace_back(op);
+  }
+
+  bwd_ops.reserve(while_grad_ops.size());
+  for (auto *op : while_grad_ops) {
+    bwd_ops.emplace_back(op);
+  }
+
+  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
new file mode 100644
index 0000000000..456ba8642b
--- /dev/null
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace operators {
+
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";
+static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
+
+void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
+    const std::vector<framework::OperatorBase *> &while_ops,
+    const std::vector<framework::OperatorBase *> &while_grad_ops);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index cfded65f0b..edee8c08d0 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -10,6 +10,7 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/interpolate_op.h"
+#include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -209,6 +210,9 @@ class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker {
     std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
     op->SetType(ForwardOp().Type() + "_grad");
     op->SetInput("X", Input("X"));
+    if (ForwardOp().Inputs().count("OutSize") > 0) {
+      op->SetInput("OutSize", Input("OutSize"));
+    }
     op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     op->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 30c3945cbb..52e4e8be28 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_op.h"
+#include <memory>
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b3d9733a97..fca3532551 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/margin_rank_loss_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 26d86afed0..2b2f845076 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mean_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index b3d0423b72..7cb213e899 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/multiplex_op.h"
+#include <memory>
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 45c87bb085..2898a62ddb 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -15,24 +15,24 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/controlflow/loop_op_helper.h"
 
 namespace paddle {
 namespace operators {
-
-using recurrent::kInputs;
-using recurrent::kInitialStates;
-using recurrent::kParameters;
-using recurrent::kOutputs;
-using recurrent::kStepScopes;
-using recurrent::kExStates;
-using recurrent::kStates;
-using recurrent::kReverse;
-using recurrent::kIsTrain;
-using recurrent::kInputGrads;
-using recurrent::kOutputGrads;
-using recurrent::kParamGrads;
-using recurrent::kInitStateGrads;
+constexpr char kInputs[] = "inputs";
+constexpr char kInitialStates[] = "initial_states";
+constexpr char kParameters[] = "parameters";
+constexpr char kOutputs[] = "outputs";
+constexpr char kStepScopes[] = "step_scopes";
+constexpr char kExStates[] = "ex_states";
+constexpr char kStates[] = "states";
+constexpr char kStepBlock[] = "sub_block";
+constexpr char kReverse[] = "reverse";
+constexpr char kIsTrain[] = "is_train";
+#define GRAD_SUFFIX "@GRAD"
+constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
+constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
+constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
+constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
 
 using StepScopeVar = std::vector<framework::Scope *>;
 
@@ -249,9 +249,6 @@ class RecurrentOp : public RecurrentBase {
     framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
-    auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
-    VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars);
-
     auto *program = block->Program();
 
     for (size_t i = 0; i < seq_len; ++i) {
@@ -286,7 +283,8 @@ class RecurrentOp : public RecurrentBase {
       // Every inputs are linked now, execute!
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/, true /*create_vars*/,
-                   keep_vars);
+                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
+                   true /*force_disable_gc*/);
 
       // get device context from pool
       platform::DeviceContextPool &pool =
@@ -343,9 +341,6 @@ class RecurrentGradOp : public RecurrentBase {
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
     auto *program = block->Program();
-    auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
-
-    VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars);
 
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
@@ -406,7 +401,8 @@ class RecurrentGradOp : public RecurrentBase {
       // Run step block with cur_scope
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/, true /*create_vars*/,
-                   keep_vars);
+                   std::vector<std::string>() /*skip_ref_cnt_vars*/,
+                   true /*force_disable_gc*/);
 
       VLOG(5) << "executor.Run finished ";
 
@@ -583,10 +579,6 @@ if reverse is True
       o          o          o         o
 )DOC").SetDefault(false);
     AddAttr<bool>(kIsTrain, "").SetDefault(true);
-    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
-                                      "Skip vars that would "
-                                      "be used in backward ops")
-        .SetDefault(std::vector<std::string>());
     AddComment(R"DOC(
 Static Length Recurrent Operator.
 
@@ -622,11 +614,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
                        this->OutputGrad(output_param));
       }
     }
-
-    auto attrs = this->Attrs();
-    attrs.insert({kSkipEagerDeletionVars, std::vector<std::string>()});
-    grad->SetAttrMap(attrs);
-
+    grad->SetAttrMap(this->Attrs());
     grad->SetBlockAttr(kStepBlock, grad_block_[0]);
 
     return std::unique_ptr<framework::OpDesc>(grad);
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 1c26707500..8e0e3bd605 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scatter_op.h"
+#include <memory>
 #include "paddle/fluid/framework/ddim.h"
 
 namespace paddle {

From 09dfc7a2aa9296d820cead49a2e125aea5e72ae8 Mon Sep 17 00:00:00 2001
From: Wojciech Uss <wojciech.uss@intel.com>
Date: Wed, 27 Mar 2019 20:13:01 +0100
Subject: [PATCH 07/19] C-API quantization core 2 (#16396)

* C-API quantization core

test=develop

Co-authored-by: Sylwester Fraczek <sylwester.fraczek@intel.com>

* Decouple Quantizer from AnalysisPredictor

test=develop

* fixes after review

test=develop

* renamed mkldnn quantize stuff

test=develop

* remove ifdef from header file

test=develop
---
 paddle/fluid/inference/CMakeLists.txt         |  14 +-
 paddle/fluid/inference/api/CMakeLists.txt     |  12 +-
 paddle/fluid/inference/api/analysis_config.cc |  52 ++-
 .../fluid/inference/api/analysis_predictor.cc |  56 ++-
 .../fluid/inference/api/analysis_predictor.h  |  13 +
 .../api/analysis_predictor_tester.cc          | 241 ++++++++++
 .../fluid/inference/api/mkldnn_quantizer.cc   | 437 ++++++++++++++++++
 paddle/fluid/inference/api/mkldnn_quantizer.h | 104 +++++
 .../inference/api/mkldnn_quantizer_config.cc  |  40 ++
 .../inference/api/paddle_analysis_config.h    |  18 +
 .../api/paddle_mkldnn_quantizer_config.h      | 105 +++++
 .../inference/api/paddle_pass_builder.cc      |   4 +-
 .../fluid/inference/api/paddle_pass_builder.h |  22 +-
 13 files changed, 1089 insertions(+), 29 deletions(-)
 create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.cc
 create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.h
 create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer_config.cc
 create mode 100644 paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 4cd29486a8..5e0be5d445 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -37,18 +37,24 @@ endif(WIN32)
 
 add_subdirectory(api)
 
+if(WITH_MKLDNN)
+	set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc)
+	set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+endif()
+
 set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
 set(SHARED_INFERENCE_SRCS
     io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
-             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
+             zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif(WIN32)
 
 if(NOT APPLE)
@@ -61,11 +67,11 @@ endif()
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
               DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-                   analysis_config paddle_pass_builder)
+                   analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-                  analysis_config paddle_pass_builder)
+                  analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif()
 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 90f09505c0..882bb34683 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -33,13 +33,19 @@ endif()
 
 add_subdirectory(details)
 
-cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
+if(WITH_MKLDNN)
+	set(mkldnn_quantizer_src mkldnn_quantizer.cc)
+	set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+	cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
+endif()
+
+cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor
   reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
            lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-           analysis_config paddle_pass_builder zero_copy_tensor
+           paddle_pass_builder zero_copy_tensor
            reset_tensor_array)
 
 cc_test(test_paddle_inference_api
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7bfdada496..aee94e1234 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -108,6 +108,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
+  // Quantization related.
+  CP_MEMBER(use_mkldnn_quantizer_);
+  CP_MEMBER(mkldnn_quantizer_config_);
 
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
@@ -148,6 +151,26 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }
 
+void AnalysisConfig::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_config_)
+    mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig());
+  use_mkldnn_quantizer_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  use_mkldnn_quantizer_ = false;
+#endif
+
+  Update();
+}
+
+std::shared_ptr<MkldnnQuantizerConfig> AnalysisConfig::mkldnn_quantizer_config()
+    const {
+  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
+                          "MkldnnQuantizer was not enabled yet.");
+  return mkldnn_quantizer_config_;
+}
+
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
     AnalysisConfig::Precision precision_mode, bool use_static) {
@@ -224,15 +247,27 @@ void AnalysisConfig::Update() {
 #endif
   }
 
-  if (enable_memory_optim_) {
-    auto analysis_passes = pass_builder()->AnalysisPasses();
-    auto memory_opti_pass_name = "memory_optimize_pass";
-    bool already_exists =
-        std::find(analysis_passes.begin(), analysis_passes.end(),
-                  memory_opti_pass_name) != analysis_passes.end();
-    if (!already_exists) {
-      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
+  // Quantization passes must come after all other optimization passes
+  if (use_mkldnn_quantizer_) {
+    if (!enable_ir_optim_) {
+      LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization "
+                    "is enabled.";
     }
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnQuantizer();
+#else
+    LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+    use_mkldnn_quantizer_ = false;
+#endif
+  }
+
+#ifdef PADDLE_WITH_MKLDNN
+  // Do not optimize before quantization
+  if (enable_memory_optim_ && !use_mkldnn_quantizer_) {
+#else
+  if (enable_memory_optim_) {
+#endif
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
   }
 
   if (use_anakin_) {
@@ -277,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   for (auto &item : mkldnn_enabled_op_types_) ss << item;
   ss << ";";
 
+  ss << use_mkldnn_quantizer_;
   ss << model_from_memory_;
 
   ss << enable_ir_optim_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 001e8e66d5..f726056154 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -18,6 +18,7 @@
 #include <fstream>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -35,8 +36,13 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#endif
+
 #if PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -341,10 +347,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   return true;
 }
 
-// NOTE All the members in AnalysisConfig should be copied to Argument.
-void AnalysisPredictor::OptimizeInferenceProgram() {
-  status_program_optimized_ = true;
-
+void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
@@ -390,6 +393,16 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
   }
 
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.mkldnn_quantizer_enabled()) {
+    LOG(INFO) << "Quantization is enabled";
+    argument_.SetQuantizeEnabledOpTypes(
+        config_.mkldnn_quantizer_config()->enabled_op_types());
+    argument_.SetQuantizeExcludedOpIds(
+        config_.mkldnn_quantizer_config()->excluded_op_ids());
+  }
+#endif
+
   auto passes = config_.pass_builder()->AllPasses();
   if (!config_.ir_optim()) {
     passes.clear();
@@ -398,6 +411,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetIrAnalysisPasses(passes);
   argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
   argument_.SetScopeNotOwned(scope_.get());
+}
+
+// NOTE All the members in AnalysisConfig should be copied to Argument.
+void AnalysisPredictor::OptimizeInferenceProgram() {
+  status_program_optimized_ = true;
+
+  PrepareArgument();
   Analyzer().Run(&argument_);
 
   PADDLE_ENFORCE(argument_.scope_valid());
@@ -439,12 +459,31 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   }
 
   std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
+  auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
+
+  if (!predictor_p->Init(nullptr)) {
+    return nullptr;
+  }
+
+  if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) {
     return nullptr;
   }
+
   return predictor;
 }
 
+bool AnalysisPredictor::MkldnnQuantize() {
+#if PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_)
+    mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer(
+        *this, config_.mkldnn_quantizer_config());
+  return mkldnn_quantizer_->Quantize();
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  return false;
+#endif
+}
+
 void AnalysisPredictor::PrepareFeedFetch() {
   PADDLE_ENFORCE_NOT_NULL(sub_scope_);
   CreateFeedFetchVar(sub_scope_);
@@ -703,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() {
     scope_->DeleteScope(sub_scope_);
   }
 
+#if PADDLE_WITH_MKLDNN
+  if (mkldnn_quantizer_) {
+    delete mkldnn_quantizer_;
+    mkldnn_quantizer_ = nullptr;
+  }
+#endif
+
   // TODO(Superjomn) deduce the directory path.
   std::string out_path = inference::analysis::GetMemoryCachePath(
       config_.model_dir(), config_.prog_file());
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 087bfbd002..e4c537f426 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -70,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor {
   void CreateFeedFetchVar(framework::Scope *scope);
   void PrepareFeedFetch();
 
+  void PrepareArgument();
   void OptimizeInferenceProgram();
 
   Argument &analysis_argument() { return argument_; }
@@ -83,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor {
 
   std::string GetSerializedProgram() const override;
 
+  bool MkldnnQuantize();
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
@@ -143,6 +146,16 @@ class AnalysisPredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetches_;
   std::map<size_t, std::string> idx2fetches_;
 
+#if PADDLE_WITH_MKLDNN
+  // Helper class to perform quantization
+  class MkldnnQuantizer;
+  MkldnnQuantizer *mkldnn_quantizer_{nullptr};
+
+#if PADDLE_WITH_TESTING
+  friend class MkldnnQuantizerTest;
+#endif
+#endif
+
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 6696839b53..0429a287c7 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -17,9 +17,13 @@
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#endif
 
 DEFINE_string(dirname, "", "dirname to tests.");
 
@@ -243,4 +247,241 @@ TEST(AnalysisPredictor, memory_optim) {
   inference::CompareResult(output, output1);
 }
 
+#ifdef PADDLE_WITH_MKLDNN
+class MkldnnQuantizerTest : public testing::Test {
+ public:
+  MkldnnQuantizerTest() {
+    AnalysisConfig config(FLAGS_dirname);
+
+    predictor.reset(new AnalysisPredictor(config));
+    auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
+
+    auto qconfig = std::make_shared<MkldnnQuantizerConfig>();
+
+    mkldnn_quantizer.reset(
+        new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig));
+  }
+
+  std::pair<std::vector<int>, float> Histogram(
+      const framework::LoDTensor& var_tensor, float min_val, float max_val,
+      int num_bins) const {
+    return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins);
+  }
+
+  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
+    return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned);
+  }
+
+  std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
+    return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned);
+  }
+
+  std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
+    return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned);
+  }
+
+ protected:
+  std::unique_ptr<PaddlePredictor> predictor;
+  std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
+  float abs_error = 1e-6;
+  static const std::array<float, 10> non_negative_values;
+  static const std::array<float, 10> positive_and_negative_values;
+};
+
+const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = {
+    0.0158671, 0.026459,   0.0280772,  0.00962479, 0.0131628,
+    0.016704,  0.00118407, 0.00765726, 0.0123213,  0.00944741};
+const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values =
+    {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586,
+     -0.0495346, 0.0629528,  -0.00531285, -0.0230353,  0.0269089};
+
+TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) {
+  const auto& values = non_negative_values;
+  auto min_val = *std::min_element(values.begin(), values.end());
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3),
+               platform::EnforceNotMet);
+}
+
+TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) {
+  // all non-negative values
+  const auto& values = non_negative_values;
+  auto min_val = *std::min_element(values.begin(), values.end());
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  std::vector<int> histogram;
+  float bin_width;
+
+  std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
+
+  ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error)
+      << "Improperly calculated bin_width.";
+
+  ASSERT_EQ(histogram[0], 4);
+  ASSERT_EQ(histogram[1], 4);
+  ASSERT_EQ(histogram[2], 2);
+}
+
+TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) {
+  const auto& values = positive_and_negative_values;
+  auto min_val = *std::min_element(values.begin(), values.end());
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  std::vector<int> histogram;
+  float bin_width;
+
+  std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
+
+  ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error)
+      << "Improperly calculated bin_width.";
+
+  ASSERT_EQ(histogram[0], 3);
+  ASSERT_EQ(histogram[1], 5);
+  ASSERT_EQ(histogram[2], 2);
+}
+
+TEST_F(MkldnnQuantizerTest, histogram_zero_bins) {
+  const auto& values = non_negative_values;
+  auto min_val = *std::min_element(values.begin(), values.end());
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0),
+               platform::EnforceNotMet);
+}
+
+TEST_F(MkldnnQuantizerTest, histogram_empty) {
+  // empty tensor
+  ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet);
+
+  // zero tensor
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize({0});
+  ASSERT_TRUE(var_tensor.mutable_data<double>(platform::CPUPlace()));
+
+  ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
+}
+
+TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) {
+  const auto& values = positive_and_negative_values;
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  bool is_unsigned;
+  framework::LoDTensor lod_tensor;
+
+  std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false);
+
+  ASSERT_EQ(is_unsigned, false);
+  ASSERT_EQ(lod_tensor.numel(), 1);
+  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error);
+}
+
+TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) {
+  const auto& values = positive_and_negative_values;
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  bool is_unsigned;
+  framework::LoDTensor lod_tensor;
+
+  std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false);
+
+  ASSERT_EQ(is_unsigned, false);
+  ASSERT_EQ(lod_tensor.numel(), 1);
+  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
+}
+
+TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) {
+  const auto& values = non_negative_values;
+  auto max_val = *std::max_element(values.begin(), values.end());
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  bool is_unsigned;
+  framework::LoDTensor lod_tensor;
+
+  std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true);
+
+  ASSERT_EQ(is_unsigned, true);
+  ASSERT_EQ(lod_tensor.numel(), 1);
+  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
+}
+
+TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) {
+  const auto& values = non_negative_values;
+  auto max_val = *std::max_element(values.begin(), values.end());
+  int channels = 3;
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size()));
+  for (int i = 0; i < channels; i++)
+    std::copy(begin(values), end(values),
+              var_tensor.mutable_data<float>(platform::CPUPlace()) +
+                  i * values.size());
+
+  bool is_unsigned;
+  framework::LoDTensor lod_tensor;
+
+  std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true);
+
+  ASSERT_EQ(is_unsigned, true);
+  ASSERT_EQ(lod_tensor.numel(), channels);
+  for (int i = 0; i < channels; i++) {
+    ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error);
+  }
+}
+
+TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
+  const auto& values = non_negative_values;
+
+  framework::LoDTensor var_tensor;
+  var_tensor.Resize(framework::make_dim(values.size()));
+  std::copy(begin(values), end(values),
+            var_tensor.mutable_data<float>(platform::CPUPlace()));
+
+  bool is_unsigned;
+  framework::LoDTensor lod_tensor;
+
+  std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true);
+
+  ASSERT_EQ(is_unsigned, true);
+  ASSERT_EQ(lod_tensor.numel(), 1);
+  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error);
+}
+#endif
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
new file mode 100644
index 0000000000..de75e884f5
--- /dev/null
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -0,0 +1,437 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#include <algorithm>
+#include <map>
+#include <numeric>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+
+using platform::CPUPlace;
+using framework::LoDTensor;
+using framework::ir::Graph;
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+using string::PrettyLogH1;
+
+bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
+  PrettyLogH1("--- Calculating scales for quantization");
+  using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+  std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
+  for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
+    if (op->HasAttr("use_quantizer") &&
+        boost::get<bool>(op->GetAttr("use_quantizer"))) {
+      const VariableNameMap& connections_in = op->Inputs();
+      const VariableNameMap& connections_out = op->Outputs();
+
+      auto glambda = [&](const VariableNameMap& connections, bool is_output) {
+        for (auto const& conn : connections) {
+          if (conn.second.size() == 0) continue;
+          auto& var_name = conn.second[0];
+
+          // skip if scale already computed
+          if (scales_.find(var_name) != scales_.end()) return;
+
+          auto* var = predictor_.sub_scope_->FindVar(var_name);
+          PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
+          PADDLE_ENFORCE(var->IsType<LoDTensor>(),
+                         "Only support lod tensor now.");
+          LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
+
+          // force unsigned type if already know it
+          bool is_unsigned = false;
+          if (is_output && op->Type() == "conv2d") {
+            // output of conv2d with relu must be unsigned
+            is_unsigned = op->HasAttr("fuse_relu") &&
+                          boost::get<bool>(op->GetAttr("fuse_relu"));
+          } else if (is_output && op->Type() == "pool2d") {
+            // output of pool2d with unsigned input must be unsigned
+            auto input_var_name = op->Input("X")[0];
+            if (scales_.find(input_var_name) != scales_.end()) {
+              is_unsigned = scales_[input_var_name].first;
+            }
+          }
+
+          CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor,
+                               is_unsigned);
+        }
+      };
+
+      // handle outputs first so unsigned outputs could be inferred
+      glambda(connections_out, true /* is_output */);
+      glambda(connections_in, false /* is_output */);
+    }
+  }
+
+  return true;
+}
+
+void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
+    const std::string& op_type_name, const std::string& conn_name,
+    const std::string& var_name, const LoDTensor& var_tensor,
+    bool is_unsigned) {
+  auto rule = qconfig_->scale_algo(op_type_name, conn_name);
+  if (rule == ScaleAlgo::NONE) return;
+
+  PADDLE_ENFORCE(
+      var_tensor.numel() > 0,
+      "MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
+      "%s of connection %s should not be empty.",
+      var_name, op_type_name, conn_name);
+
+  switch (rule) {
+    case ScaleAlgo::MAX:
+      scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
+      break;
+    case ScaleAlgo::MAX_CH:
+      scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
+      break;
+    case ScaleAlgo::KL:
+      scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
+      break;
+    default:
+      throw std::runtime_error(
+          "MkldnnQuantizer: Unexpected ScaleAlgo specified.");
+  }
+}
+
+std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
+    std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
+  std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
+  int num_merged_bins = reference_bins.size() / quantized_bins.size();
+  int j_start = 0;
+  int j_end = num_merged_bins;
+  for (size_t idx = 0; idx < quantized_bins.size(); idx++) {
+    int zero_count =
+        std::count(&reference_bins[j_start], &reference_bins[j_end], 0);
+    num_merged_bins = j_end - j_start;
+    int avg_bin_ele;
+    if (zero_count == num_merged_bins) {
+      avg_bin_ele = 0;
+    } else {
+      avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0);
+    }
+    for (int idx1 = j_start; idx1 < j_end; idx1++) {
+      expanded_quantized_bins[idx1] =
+          (reference_bins[idx1] == 0) ? 0 : avg_bin_ele;
+    }
+    j_start += num_merged_bins;
+    j_end += num_merged_bins;
+    if ((idx + 1) == quantized_bins.size() - 1) {
+      j_end = reference_bins.size();
+    }
+  }
+  return expanded_quantized_bins;
+}
+
+std::pair<bool, LoDTensor>
+AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
+    const LoDTensor& var_tensor, bool is_unsigned) const {
+  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
+                                        var_tensor.numel(), 1};
+  int precision_hist_num_bins = 2048;
+  float max_val = eigen_tensor.maxCoeff();
+  float min_val = eigen_tensor.minCoeff();
+  bool is_positive = min_val >= 0.0f;
+  if (is_unsigned)
+    PADDLE_ENFORCE(
+        is_positive,
+        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+        min_val);
+
+  int num_quantized_bins = 255;
+
+  std::vector<int> hist;
+  float bin_width;
+  int starting_iter;
+  int ending_iter = precision_hist_num_bins - 1;
+  if (is_positive) {
+    std::tie(hist, bin_width) =
+        Histogram(var_tensor, min_val, max_val, precision_hist_num_bins);
+    starting_iter = static_cast<int>(ending_iter * 0.7);
+  } else {
+    float th = std::max(std::abs(max_val), std::abs(min_val));
+    std::tie(hist, bin_width) =
+        Histogram(var_tensor, -th, th, precision_hist_num_bins);
+    starting_iter = 0;
+    if (std::abs(max_val) > std::abs(min_val)) {
+      while (starting_iter < ending_iter) {
+        if (hist[starting_iter] == 0) {
+          ++starting_iter;
+          continue;
+        } else {
+          break;
+        }
+      }
+      starting_iter += static_cast<int>((ending_iter - starting_iter) * 0.6);
+    } else {
+      while (ending_iter > 0) {
+        if (hist[ending_iter] == 0) {
+          --ending_iter;
+          continue;
+        } else {
+          break;
+        }
+      }
+      starting_iter = static_cast<int>(0.6 * ending_iter);
+    }
+  }
+  auto P_sum = eigen_tensor.size();
+  int min_kl_divergence = 0;
+  int min_kl_index = 0;
+  bool kl_inited = false;
+  for (int i = starting_iter; i <= ending_iter; i++) {
+    std::vector<int> reference_distr_P(&hist[0], &hist[i]);
+    auto outliers_count =
+        std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0);
+    if (reference_distr_P[i - 1] == 0) {
+      continue;
+    }
+    reference_distr_P[i - 1] += outliers_count;
+    auto reference_distr_bins = reference_distr_P;
+    std::vector<int> candidate_distr_Q(&hist[0], &hist[i]);
+    int num_merged_bins = i / num_quantized_bins;
+    std::vector<int> candidate_distr_Q_quantized(num_quantized_bins, 0);
+    int j_start = 0;
+    int j_end = num_merged_bins;
+    for (int idx = 0; idx < num_quantized_bins; idx++) {
+      candidate_distr_Q_quantized[idx] = std::accumulate(
+          &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0);
+      j_start += num_merged_bins;
+      j_end += num_merged_bins;
+      if ((idx + 1) == num_quantized_bins - 1) {
+        j_end = i;
+      }
+    }
+    candidate_distr_Q =
+        ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins);
+    int Q_sum =
+        std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0);
+    auto kl_divergence =
+        SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum);
+    if (!kl_inited) {
+      min_kl_divergence = kl_divergence;
+      min_kl_index = i;
+      kl_inited = true;
+    } else if (kl_divergence < min_kl_divergence) {
+      min_kl_divergence = kl_divergence;
+      min_kl_index = i;
+    } else {
+    }
+  }
+  if (min_kl_index == 0) {
+    while (starting_iter > 0) {
+      if (hist[starting_iter] == 0) {
+        starting_iter -= 1;
+        continue;
+      } else {
+        break;
+      }
+    }
+    min_kl_index = starting_iter;
+  }
+
+  LoDTensor scale_tensor;
+  scale_tensor.Resize({1});
+  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
+
+  scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
+
+  return std::make_pair(is_unsigned, scale_tensor);
+}
+
+std::pair<bool, LoDTensor>
+AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
+    const LoDTensor& var_tensor, bool is_unsigned) const {
+  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
+                                        var_tensor.numel(), 1};
+  float max_abs = eigen_tensor.abs().maxCoeff();
+  float min_val = eigen_tensor.minCoeff();
+  if (is_unsigned)
+    PADDLE_ENFORCE(
+        min_val >= 0.0f,
+        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+        min_val);
+
+  LoDTensor scale_tensor;
+  scale_tensor.Resize({1});
+  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
+  scale_ptr[0] = 1.0 / max_abs;
+
+  return std::make_pair(is_unsigned, scale_tensor);
+}
+
+std::pair<bool, LoDTensor>
+AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
+    const LoDTensor& var_tensor, bool is_unsigned) const {
+  PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
+
+  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
+                                        var_tensor.numel(), 1};
+  float min_val = eigen_tensor.minCoeff();
+  if (is_unsigned)
+    PADDLE_ENFORCE(
+        min_val >= 0.0f,
+        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+        min_val);
+
+  int channels = var_tensor.dims()[0];
+  LoDTensor scale_tensor;
+  scale_tensor.Resize({channels});
+  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
+
+  for (int i = 0; i < channels; ++i) {
+    const auto tensor = var_tensor.Slice(i, i + 1);
+
+    ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
+                                          1};
+    float max_abs = eigen_tensor.abs().maxCoeff();
+    scale_ptr[i] = 1.0 / max_abs;
+  }
+
+  return std::make_pair(is_unsigned, scale_tensor);
+}
+
+std::pair<std::vector<int>, float>
+AnalysisPredictor::MkldnnQuantizer::Histogram(
+    const framework::LoDTensor& var_tensor, float min_val, float max_val,
+    size_t num_bins) const {
+  PADDLE_ENFORCE_GT(num_bins, 0,
+                    "MkldnnQuantizer: To calculate Histogram, num_bins (" +
+                        std::to_string(num_bins) + ") must be positive.");
+  PADDLE_ENFORCE_GT(
+      var_tensor.numel(), 0,
+      "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
+  PADDLE_ENFORCE(max_val >= min_val,
+                 "MkldnnQuantizer: To calculate Histogram, max_val (" +
+                     std::to_string(max_val) +
+                     ") must be greater or equal"
+                     "to min_val (" +
+                     std::to_string(min_val) + ").");
+  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
+                                        var_tensor.numel(), 1};
+  auto bin_width = std::abs(max_val - min_val) / num_bins;
+  std::vector<int> hist(num_bins);
+
+  for (int i = 0; i < eigen_tensor.size(); i++) {
+    int bin = std::min(
+        num_bins - 1,
+        static_cast<size_t>(floor((eigen_tensor[i] - min_val) / bin_width)));
+    ++hist[bin];
+  }
+
+  return std::make_pair(std::move(hist), std::move(bin_width));
+}
+
+void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
+  auto& arg = predictor_.argument_;
+  if (!arg.scope_valid()) arg.SetScope(new framework::Scope);
+  arg.SetMainProgramNotOwned(predictor_.inference_program_.get());
+  auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
+  arg.SetMainGraph(graph.release());
+  arg.main_graph().Set(framework::ir::kParamScopeAttr,
+                       new framework::Scope*(arg.scope_ptr()));
+
+  auto* builder = predictor_.config_.pass_builder();
+  builder->SetPasses({
+      "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass",
+  });
+  if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
+  auto passes = builder->AllPasses();
+  predictor_.argument_.SetIrAnalysisPasses(passes);
+  predictor_.argument_.SetAnalysisPasses(
+      {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"});
+  predictor_.argument_.SetQuantVarScales(scales_);
+}
+
+bool AnalysisPredictor::MkldnnQuantizer::Quantize() {
+  if (!RunWarmup()) return false;
+  if (!CalculateScales()) return false;
+  predictor_.PrepareScope(predictor_.scope_);
+  predictor_.CreateExecutor();
+  if (!RunQuantizePasses()) return false;
+  predictor_.PrepareExecutor();
+  predictor_.PrepareFeedFetch();
+  return true;
+}
+
+bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
+  predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true,
+                                        predictor_.sub_scope_);
+  PrepareArgument();
+  auto& arg = predictor_.argument_;
+  Analyzer().Run(&arg);
+  PADDLE_ENFORCE(arg.scope_valid());
+  VLOG(5) << "to prepare executor";
+  ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program);
+  predictor_.inference_program_.reset(
+      new framework::ProgramDesc(arg.ir_analyzed_program()));
+  LOG(INFO) << "== optimize 2 end ==";
+  predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0,
+                                        false, predictor_.sub_scope_);
+  return true;
+}
+
+bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
+  VLOG(3) << "Predictor: run a quantization warmup iteration";
+  auto warmup_data = qconfig_->warmup_data();
+  PADDLE_ENFORCE_NOT_NULL(warmup_data,
+                          "Warmup data cannot be NULL in the config.");
+  PrettyLogH1("--- Running warmup iteration for quantization");
+
+  // Run the inference program
+  std::vector<PaddleTensor> output_slots;
+  predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size());
+
+  return true;
+}
+
+float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
+    std::vector<int> reference_distr_P, int P_sum,
+    std::vector<int> candidate_distr_Q, int Q_sum) const {
+  PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());
+  float tmp_sum1 = 0;
+  float tmp_sum2 = 0;
+  for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
+    int p_idx = reference_distr_P[idx];
+    int q_idx = candidate_distr_Q[idx];
+    if (p_idx == 0) {
+      tmp_sum1 += 0;
+      tmp_sum2 += 0;
+    } else {
+      PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
+                                     std::to_string(idx) +
+                                     " qindex = 0! p_idx = " +
+                                     std::to_string(p_idx));
+    }
+    tmp_sum1 += p_idx * (log(Q_sum * p_idx));
+    tmp_sum2 += p_idx * (log(P_sum * q_idx));
+  }
+  return (tmp_sum1 - tmp_sum2) / P_sum;
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h
new file mode 100644
index 0000000000..f4b0df5d74
--- /dev/null
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/string/printf.h"
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest.h>
+#include <gtest/gtest_prod.h>
+#endif
+
+namespace paddle {
+
+/*
+ * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
+ * bool denotes whether quantization of the variable should be done to unsigned
+ * type.
+ */
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
+
+class AnalysisPredictor::MkldnnQuantizer {
+ public:
+  explicit MkldnnQuantizer(
+      AnalysisPredictor& predictor,  // NOLINT
+      const std::shared_ptr<MkldnnQuantizerConfig>& qconfig)
+      : predictor_(predictor), qconfig_(qconfig) {}
+
+  // Execute full quantization procedure.
+  bool Quantize();
+
+#if PADDLE_WITH_TESTING
+  friend class MkldnnQuantizerTest;
+#endif
+
+ private:
+  // Run single warmup iteration
+  bool RunWarmup() const;
+  // Gather data from variables and calculate scales for them.
+  bool CalculateScales();
+  // Calculate a scale for tensor based on ScaleAlgo rules.
+  void CalculateSingleScale(const std::string& op_name,
+                            const std::string& conn_name,
+                            const std::string& var_name,
+                            const framework::LoDTensor& var_tensor,
+                            bool is_unsigned);
+  void PrepareArgument() const;
+  bool RunQuantizePasses() const;
+
+  std::vector<int> ExpandQuantizedBins(std::vector<int> quantized_bins,
+                                       std::vector<int> reference_bins) const;
+
+  // Using the KL-divergence method get the most precise scaling factor.
+  std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
+
+  std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
+
+  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
+      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
+
+  // Returns histogram and bin width
+  std::pair<std::vector<int>, float> Histogram(
+      const framework::LoDTensor& var_tensor, float min_val, float max_val,
+      size_t num_bins = 2048) const;
+
+  // Calculate the entropy.
+  float SafeEntropy(std::vector<int> reference_distr_P, int P_sum,
+                    std::vector<int> candidate_distr_Q, int Q_sum) const;
+
+ private:
+  AnalysisPredictor& predictor_;
+  const std::shared_ptr<MkldnnQuantizerConfig> qconfig_;
+
+  // A map: variable name -> scale
+  VarQuantScale scales_;
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
new file mode 100644
index 0000000000..f9ff542d86
--- /dev/null
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
+
+namespace paddle {
+
+MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
+  // The default configuration of scale computing algorightms
+  rules_["conv2d"]["Input"] = ScaleAlgo::KL;
+  rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH;
+  rules_["conv2d"]["Bias"] = ScaleAlgo::NONE;  // do not compute scale
+  rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL;
+  rules_["conv2d"]["Output"] = ScaleAlgo::KL;  // do not compute scale
+
+  rules_["pool2d"]["X"] = ScaleAlgo::KL;
+  rules_["pool2d"]["Out"] = ScaleAlgo::KL;  // do not compute scale
+}
+
+ScaleAlgo MkldnnQuantizerConfig::scale_algo(
+    const std::string& op_type_name, const std::string& conn_name) const {
+  if (rules_.find(op_type_name) != rules_.end()) {
+    auto op_rule = rules_.at(op_type_name);
+    if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name);
+  }
+  return default_scale_algo_;
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 23df507aa6..2ad4add294 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -27,10 +27,14 @@
 // the abstract path of this header file will be changed.
 #include "paddle_api.h"           // NOLINT
 #include "paddle_pass_builder.h"  // NOLINT
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle_mkldnn_quantizer_config.h"  // NOLINT
+#endif
 
 namespace paddle {
 
 class AnalysisPredictor;
+struct MkldnnQuantizerConfig;
 
 // NOTE WIP, not stable yet.
 struct AnalysisConfig {
@@ -186,6 +190,16 @@ struct AnalysisConfig {
     mkldnn_enabled_op_types_ = op_list;
   }
 
+  /** Turn on quantization.
+   */
+  void EnableMkldnnQuantizer();
+
+  /** A boolean state telling whether the quantization is enabled.
+  */
+  bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; }
+
+  std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config() const;
+
   /** Specify the memory buffer of program and parameter
    * @param prog_buffer the memory buffer of program.
    * @param prog_buffer_size the size of the data.
@@ -271,10 +285,14 @@ struct AnalysisConfig {
   std::string serialized_info_cache_;
 
   mutable std::unique_ptr<PassStrategy> pass_builder_;
+
   bool use_anakin_{false};
   int anakin_max_batchsize_;
   std::map<std::string, std::vector<int>> anakin_max_input_shape_;
   std::map<std::string, std::string> engine_opt_info_;
+
+  bool use_mkldnn_quantizer_{false};
+  std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
new file mode 100644
index 0000000000..d46f842de7
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle_api.h"  // NOLINT
+
+namespace paddle {
+
+// Algorithms for finding scale of quantized Tensors.
+enum class ScaleAlgo {
+  NONE,    // Do not compute scale
+  MAX,     // Find scale based on the maximum absolute value
+  MAX_CH,  // Find scale based on the maximum absolute value per channel
+  KL,      // Find scale based on KL Divergence
+};
+
+struct MkldnnQuantizerConfig {
+  MkldnnQuantizerConfig();
+
+  /** Specify a quantization algorithm for a connection (input/output) of the
+   * operator type.
+   * @param op_type_name the operator's name.
+   * @param conn_name name of the connection (input/output) of the operator.
+   * @param algo the algorithm for computing scale.
+   */
+  void SetScaleAlgo(std::string op_type_name, std::string conn_name,
+                    ScaleAlgo algo) {
+    rules_[op_type_name][conn_name] = algo;
+  }
+
+  /** Get the quantization algorithm for a connection (input/output) of the
+   * operator type.
+   * @param op_type_name the operator's name.
+   * @param conn_name name of the connection (input/output) of the operator.
+   * @return the algorithm for computing scale.
+   */
+  ScaleAlgo scale_algo(const std::string& op_type_name,
+                       const std::string& conn_name) const;
+
+  /** Set the batch of data to be used for warm-up iteration.
+   * @param data batch of data.
+   */
+  void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
+    warmup_data_ = data;
+  }
+
+  /** Get the batch of data used for warm-up iteration.
+   * @return batch of data.
+   */
+  std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
+    return warmup_data_;
+  }
+
+  void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
+
+  int warmup_batch_size() const { return warmup_bs_; }
+
+  void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
+    enabled_op_types_ = op_list;
+  }
+
+  const std::unordered_set<std::string>& enabled_op_types() const {
+    return enabled_op_types_;
+  }
+
+  void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
+    excluded_op_ids_ = op_ids_list;
+  }
+
+  const std::unordered_set<int>& excluded_op_ids() const {
+    return excluded_op_ids_;
+  }
+
+  void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
+
+  ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
+
+ protected:
+  std::map<std::string, std::map<std::string, ScaleAlgo>> rules_;
+  std::unordered_set<std::string> enabled_op_types_;
+  std::unordered_set<int> excluded_op_ids_;
+  std::shared_ptr<std::vector<PaddleTensor>> warmup_data_;
+  int warmup_bs_{1};
+  ScaleAlgo default_scale_algo_{ScaleAlgo::MAX};
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 35dd117671..8ec32b3a0b 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -107,8 +107,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
 
-void GpuPassStrategy::EnableQuantizer() {
-  LOG(ERROR) << "GPU not support quantization yet";
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }
 
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 65403e790e..de60185eb3 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -30,6 +30,10 @@ class PaddlePassBuilder {
   explicit PaddlePassBuilder(const std::vector<std::string> &passes)
       : passes_(passes) {}
 
+  void SetPasses(std::initializer_list<std::string> passes) {
+    passes_ = passes;
+  }
+
   /** Append a pass to the end of the passes. */
   void AppendPass(const std::string &pass_type);
 
@@ -85,9 +89,9 @@ class PassStrategy : public PaddlePassBuilder {
    */
   virtual void EnableMKLDNN() {}
 
-  /** Enable quantize optimization
+  /** Enable MKLDNN quantize optimization
    */
-  virtual void EnableQuantizer() {}
+  virtual void EnableMkldnnQuantizer() {}
 
   bool use_gpu() const { return use_gpu_; }
 
@@ -130,15 +134,19 @@ class CpuPassStrategy : public PassStrategy {
 #endif
   }
 
-  void EnableQuantizer() override {
-    if (!use_quantizer_) {
+  void EnableMkldnnQuantizer() override {
+#ifdef PADDLE_WITH_MKLDNN
+    if (!use_mkldnn_quantizer_) {
       passes_.push_back("cpu_quantize_placement_pass");
     }
-    use_quantizer_ = true;
+    use_mkldnn_quantizer_ = true;
+#else
+    use_mkldnn_quantizer_ = false;
+#endif
   }
 
  protected:
-  bool use_quantizer_{false};
+  bool use_mkldnn_quantizer_{false};
 };
 
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -153,7 +161,7 @@ class GpuPassStrategy : public PassStrategy {
   }
 
   void EnableMKLDNN() override;
-  void EnableQuantizer() override;
+  void EnableMkldnnQuantizer() override;
 
   virtual ~GpuPassStrategy() = default;
 };

From 2d8b7b3a766c2aa707a1f27d2901bd9b75d98f1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Gallus?= <Sand3r-@users.noreply.github.com>
Date: Wed, 27 Mar 2019 21:42:53 +0100
Subject: [PATCH 08/19] Refine default MKL-DNN Pass order (#16490)

* Refine default MKL-DNN Pass order

test=develop

* Add comment to default MKL-DNN Pass list

test=develop
---
 paddle/fluid/inference/api/paddle_pass_builder.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index de60185eb3..48da8c156f 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -121,6 +121,8 @@ class CpuPassStrategy : public PassStrategy {
 
       for (auto &pass : std::vector<std::string>(
                {"depthwise_conv_mkldnn_pass",    //
+                "conv_bn_fuse_pass",             // Execute BN passes again to
+                "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
                 "conv_bias_mkldnn_fuse_pass",    //
                 "conv3d_bias_mkldnn_fuse_pass",  //
                 "conv_relu_mkldnn_fuse_pass",    //

From b1d2605152e70acc1ba3d82dd693dcc47d128390 Mon Sep 17 00:00:00 2001
From: baojun <32073718+baojun-nervana@users.noreply.github.com>
Date: Wed, 27 Mar 2019 18:05:01 -0700
Subject: [PATCH 09/19] fix compile issue test=develop (#16447)

---
 cmake/external/ngraph.cmake | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index e7fb69dbbc..23998b497e 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
 ExternalProject_Add(
     ${NGRAPH_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS             ${MKLDNN_PROJECT} ${MKLML_PROJECT}
-    GIT_REPOSITORY      ${NGRAPH_GIT_REPO}
-    GIT_TAG             ${NGRAPH_GIT_TAG}
-    PREFIX              ${NGRAPH_SOURCES_DIR}
-    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
-    CMAKE_ARGS          -DNGRAPH_UNIT_TEST_ENABLE=FALSE
-    CMAKE_ARGS          -DNGRAPH_TOOLS_ENABLE=FALSE
-    CMAKE_ARGS          -DNGRAPH_INTERPRETER_ENABLE=FALSE
-    CMAKE_ARGS          -DNGRAPH_DEX_ONLY=TRUE
-    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    CMAKE_ARGS          -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
-    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
-    CMAKE_ARGS          -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
+    DEPENDS                  ${MKLDNN_PROJECT} ${MKLML_PROJECT}
+    GIT_REPOSITORY           ${NGRAPH_GIT_REPO}
+    GIT_TAG                  ${NGRAPH_GIT_TAG}
+    PREFIX                   ${NGRAPH_SOURCES_DIR}
+    UPDATE_COMMAND           ""
+    CMAKE_GENERATOR          ${CMAKE_GENERATOR}
+    CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
+    CMAKE_GENERATOR_TOOLSET  ${CMAKE_GENERATOR_TOOLSET}
+    CMAKE_ARGS               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS               -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS               -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
+    CMAKE_ARGS               -DNGRAPH_UNIT_TEST_ENABLE=FALSE
+    CMAKE_ARGS               -DNGRAPH_TOOLS_ENABLE=FALSE
+    CMAKE_ARGS               -DNGRAPH_INTERPRETER_ENABLE=FALSE
+    CMAKE_ARGS               -DNGRAPH_DEX_ONLY=TRUE
+    CMAKE_ARGS               -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    CMAKE_ARGS               -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
+    CMAKE_ARGS               -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
+    CMAKE_ARGS               -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
 )
 
 add_dependencies(ngraph ${NGRAPH_PROJECT})

From eb83abeac3c0146b921ce72d06fef2551ab3e8d8 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 28 Mar 2019 09:23:47 +0800
Subject: [PATCH 10/19] Add DGC(Deep Gradient Compression) interface. (#15841)

---
 CMakeLists.txt                                |   6 +
 cmake/external/dgc.cmake                      |  42 +++
 cmake/inference_lib.cmake                     |   9 +
 cmake/operators.cmake                         |   2 +-
 paddle/fluid/API.spec                         |   5 +
 paddle/fluid/framework/details/CMakeLists.txt |   2 +-
 .../framework/details/all_reduce_deps_pass.cc |   7 +-
 .../framework/details/all_reduce_op_handle.cc | 200 ++++++++++++-
 .../framework/details/all_reduce_op_handle.h  |  16 +-
 .../details/multi_devices_graph_pass.cc       |  33 ++-
 .../details/multi_devices_graph_pass.h        |   5 +-
 paddle/fluid/framework/details/var_handle.cc  |   3 +-
 paddle/fluid/framework/op_desc.cc             |   6 +
 paddle/fluid/framework/op_desc.h              |   1 +
 paddle/fluid/framework/operator.cc            |   5 +-
 paddle/fluid/inference/CMakeLists.txt         |   5 +
 paddle/fluid/operators/CMakeLists.txt         |   8 +-
 paddle/fluid/operators/clip_by_norm_op.cc     |  61 +---
 paddle/fluid/operators/clip_by_norm_op.h      |  54 ++++
 paddle/fluid/operators/dgc_clip_by_norm_op.cc |  67 +++++
 paddle/fluid/operators/dgc_clip_by_norm_op.cu |  20 ++
 paddle/fluid/operators/dgc_clip_by_norm_op.h  |  46 +++
 paddle/fluid/operators/dgc_op.cc              | 138 +++++++++
 paddle/fluid/operators/dgc_op.cu              |  20 ++
 paddle/fluid/operators/dgc_op.h               | 132 +++++++++
 paddle/fluid/platform/CMakeLists.txt          |   6 +-
 paddle/fluid/platform/assert.h                |  14 +-
 paddle/fluid/platform/device_context.cc       |  15 +-
 paddle/fluid/platform/init.cc                 |  18 ++
 paddle/fluid/platform/init.h                  |   2 +
 paddle/fluid/pybind/protobuf.cc               |   1 +
 paddle/fluid/pybind/pybind.cc                 |   1 +
 python/paddle/fluid/framework.py              |  16 ++
 python/paddle/fluid/optimizer.py              | 272 +++++++++++++++++-
 python/paddle/fluid/parallel_executor.py      |   6 +
 .../fluid/tests/unittests/CMakeLists.txt      |   8 +-
 .../fluid/tests/unittests/dist_mnist.py       |   8 +-
 .../fluid/tests/unittests/dist_se_resnext.py  |  20 +-
 .../fluid/tests/unittests/test_dgc_op.py      | 138 +++++++++
 .../fluid/tests/unittests/test_dist_base.py   |  16 +-
 .../fluid/tests/unittests/test_dist_mnist.py  |  14 +
 .../tests/unittests/test_dist_se_resnext.py   |  15 +
 42 files changed, 1363 insertions(+), 100 deletions(-)
 create mode 100644 cmake/external/dgc.cmake
 create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cc
 create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cu
 create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.h
 create mode 100644 paddle/fluid/operators/dgc_op.cc
 create mode 100644 paddle/fluid/operators/dgc_op.cu
 create mode 100644 paddle/fluid/operators/dgc_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_dgc_op.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a38e32b73d..9ad69738eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,6 +193,12 @@ if(WITH_GPU)
     include(tensorrt)
     include(anakin_subgraph)
 endif()
+
+if(WITH_GPU AND NOT WIN32)
+    message(STATUS "add dgc lib.")
+    include(external/dgc)
+endif()
+
 if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
new file mode 100644
index 0000000000..199ca88b47
--- /dev/null
+++ b/cmake/external/dgc.cmake
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc")
+SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
+SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
+SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
+INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_dgc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
+    GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
+    SOURCE_DIR "${DGC_SOURCES_DIR}"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND cd collective && make -j
+    INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/  ${DGC_INCLUDE_DIR}/dgc
+        && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
+        && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
+    BUILD_IN_SOURCE 1
+)
+
+ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
+ADD_DEPENDENCIES(dgc extern_dgc)
+
+LIST(APPEND external_project_dependencies dgc)
+
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index a7dce4dfdb..b7c32f80db 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32)
             )
 endif ()
 
+if (WITH_GPU AND NOT WIN32)
+    set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
+    copy(dgc_lib
+            SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
+            DSTS ${dgc_dir} ${dgc_dir}
+            DEPS dgc)
+endif()
+
+
 if (WITH_MKLDNN)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
     copy(mkldnn_lib
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 34c6cbd73d..c17e718f42 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 851308a0f6..e6f5cb7473 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -483,6 +483,11 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['sel
 paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 77e94e998c..046ec6978a 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()
 
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor)
+            dynload_cuda variable_visitor dgc)
     nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
     if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index c084410864..98a74d630c 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -86,7 +86,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
     }
   }
 
-  VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
+  VLOG(10) << "dist_ops size:" << dist_ops.size()
+           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
 
   std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
                                                   OpHandleBase* op2) {
@@ -99,6 +100,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
     auto l_it = vars.find(i0->name());
     auto r_it = vars.find(i1->name());
 
+    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
+                   "can't find var's name %s and %s in opdesc", i0->name(),
+                   i1->name());
+
     if (l_it->second < r_it->second) return true;
 
     if (l_it->second == r_it->second) {
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index fdaff08e53..6e477cd297 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -16,6 +16,13 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/framework/operator.h"
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "dgc/dgc.h"
+#endif
+
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
 // asynchronous nccl allreduce or synchronous issue:
@@ -33,11 +40,14 @@ namespace details {
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
-                                     const platform::NCCLContextMap *ctxs)
+                                     const platform::NCCLContextMap *ctxs,
+                                     bool is_encoded, int nranks)
     : OpHandleBase(node),
       local_scopes_(local_scopes),
       places_(places),
-      nccl_ctxs_(ctxs) {
+      nccl_ctxs_(ctxs),
+      is_encoded_(is_encoded),
+      nranks_(nranks) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
@@ -51,7 +61,185 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void AllReduceOpHandle::RunImplEncoded() {
+  platform::RecordEvent record_event(Name());
+
+  WaitInputVarGenerated();
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+  std::vector<const LoDTensor *> ins;
+  std::vector<LoDTensor *> outs;
+  int k = -1;
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &local_scope =
+        local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto original_name =
+        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
+    auto encode_var_name = original_name + g_dgc_encoded;
+    auto *in_var = local_scope->FindVar(encode_var_name);
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+    auto &in = in_var->Get<LoDTensor>();
+    ins.emplace_back(&in);
+
+    auto *out = local_scope->FindVar(out_var_handles[i]->name())
+                    ->GetMutable<LoDTensor>();
+    outs.emplace_back(out);
+
+    if (k < 0) {
+      k = GetKValue(in_var_handles[i]->name());
+    }
+  }
+
+  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
+  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
+  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+
+  int dtype = -1;
+  size_t in_numel = 0;
+  size_t out_numel = 0;
+  PADDLE_ENFORCE(nranks_ > 1);
+  std::vector<std::function<void()>> all_reduce_calls;
+
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &place = places_[i];
+    auto &in = *ins[i];
+    void *in_tensor_buf = const_cast<void *>(in.data<void>());
+
+    auto &out = *outs[i];
+    float *out_tensor_buf = out.data<float>();
+
+    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
+    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
+    PADDLE_ENFORCE(in_numel % 2 == 0);
+    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
+    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
+
+    int dev_id = boost::get<platform::CUDAPlace>(place).device;
+    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+    auto stream = nccl_ctx.stream();
+    auto comm = nccl_ctx.comm_;
+
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
+    int encode_size = 2 * k * sizeof(int);
+    // dgc use ncclAllGather to get all the encoded data
+    // so the buffer need nranks.
+    int buf_size = nranks_ * encode_size;
+    auto tmp_ious_data = allocator.Allocate(buf_size);
+    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
+
+    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
+             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
+             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
+
+    all_reduce_calls.emplace_back([=] {
+      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
+          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
+          stream));
+    });
+  }
+
+  this->RunAndRecordEvent([&] {
+    if (all_reduce_calls.size() == 1UL) {
+      // Do not use NCCLGroup when manage NCCL by per thread per device
+      all_reduce_calls[0]();
+    } else {
+      platform::NCCLGroupGuard guard;
+      for (auto &call : all_reduce_calls) {
+        call();
+      }
+    }
+  });
+
+  if (FLAGS_sync_nccl_allreduce) {
+    for (auto &p : places_) {
+      int dev_id = boost::get<platform::CUDAPlace>(p).device;
+      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+      auto stream = nccl_ctx.stream();
+      cudaError_t e_sync = cudaStreamSynchronize(stream);
+      if (e_sync != 0) {
+        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
+      }
+
+      cudaError_t e_get = cudaGetLastError();
+      if (e_get != 0) {
+        LOG(FATAL) << "cudaGetLastError  " << cudaGetErrorString(e_get)
+                   << " errno:" << e_get;
+      }
+    }
+  }
+}
+
+int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
+  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
+  auto var_name = original_name + g_dgc_k;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto var = local_scope->FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  auto tensor = var->Get<LoDTensor>().data<float>();
+  return *tensor;
+}
+#endif
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+bool AllReduceOpHandle::IsEncoded() {
+  if (!is_encoded_) {
+    return false;
+  }
+  auto counter_name = g_dgc_counter_name;
+  auto step_name = g_dgc_rampup_begin_step;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto count_var = local_scope->FindVar(counter_name);
+  auto step_var = local_scope->FindVar(step_name);
+  if (count_var == nullptr || step_var == nullptr) {
+    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
+                 step_var);
+  }
+
+  float count = *count_var->Get<LoDTensor>().data<float>();
+  float step = *step_var->Get<LoDTensor>().data<float>();
+  if (static_cast<int>(count) < static_cast<int>(step)) {
+    VLOG(10) << "in all_reduce currentstep:" << count
+             << " < rampup_begin_step:" << step
+             << " so not use sparse all reduce";
+    return false;
+  }
+
+  return true;
+}
+#else
+bool AllReduceOpHandle::IsEncoded() { return false; }
+#endif
+
 void AllReduceOpHandle::RunImpl() {
+  if (!IsEncoded()) {
+    RunImplNormal();
+    return;
+  }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  RunImplEncoded();
+#else
+  PADDLE_THROW("Not compiled with CUDA");
+#endif
+}
+
+void AllReduceOpHandle::RunImplNormal() {
   platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
@@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() {
     auto &lod_tensor =
         local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
     lod_tensors.emplace_back(&lod_tensor);
+    VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
+             << ", out_name:" << out_var_handles[i]->name();
     PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                       "The name of input and output should be equal.");
   }
@@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() {
       auto &nccl_ctx = nccl_ctxs_->at(dev_id);
       auto stream = nccl_ctx.stream();
       auto comm = nccl_ctx.comm_;
+
+      VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
+               << ", dev_id:" << dev_id << ", dtype:" << dtype
+               << ", place:" << p;
+
       all_reduce_calls.emplace_back([=] {
         PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
             buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
             comm, stream));
       });
     }
-
     this->RunAndRecordEvent([&] {
       if (all_reduce_calls.size() == 1UL) {
         // Do not use NCCLGroup when manage NCCL by per thread per device
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index b449796fca..ca75186f6c 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -28,11 +28,19 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
+constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
+constexpr char g_dgc_encoded[] = "__dgc_encoded__";
+constexpr char g_dgc_k[] = "__dgc_k__";
+#endif
+
 struct AllReduceOpHandle : public OpHandleBase {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs);
+                    const platform::NCCLContextMap *ctxs,
+                    bool is_encoded = false, int nranks = -1);
 #else
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
@@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  void RunImplEncoded();
   const platform::NCCLContextMap *nccl_ctxs_;
+  bool is_encoded_{false};
+  int nranks_{-1};
+  int GetKValue(const std::string &grad_name);
 #endif
+  void RunImplNormal();
+  bool IsEncoded();
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 253cf5b4a8..8c61684c9c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -32,6 +32,7 @@
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace framework {
@@ -209,7 +210,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
           for (size_t i = 0; i < backward_vars.size(); i += 2) {
             auto &p_name = backward_vars[i];
             auto &g_name = backward_vars[i + 1];
-            VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+            VLOG(10) << "Bcast " << g_name << " for parameter " << p_name
+                     << " op_type " << node->Op()->Type();
             if (NeedCollectiveForGrad(g_name, sorted_ops)) {
               InsertCollectiveOp(&result, p_name, g_name);
             }
@@ -414,8 +416,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
   CreateOpHandleIOs(result, node, dev_id);
 }
 
-void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
-    ir::Graph *result, const std::string &og) const {
+void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
+                                                    const std::string &og,
+                                                    bool is_encoded) const {
   OpHandleBase *op_handle = nullptr;
 
   auto append_allreduce_op = [&](
@@ -424,7 +427,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-        scopes, places, nccl_ctxs_));
+        scopes, places, nccl_ctxs_, is_encoded,
+        static_cast<int>(strategy_.trainers_endpoints_.size()) *
+            places_.size()));
 #else
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -446,12 +451,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad);
+    VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString();
 
     auto var =
         new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
                       vars.size(), i, og, places_[i]);
     vars.emplace_back(var);
     op_handle->AddOutput(var);
+    VLOG(10) << "all_reduce_op_handle add output " << og
+             << ", handle:" << var->DebugString();
   }
 }
 
@@ -941,6 +949,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   return op_dev_id;
 }
 
+bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
+  auto u_name = p_name + "__dgc_u__";
+  auto it = all_vars_.find(u_name);
+  if (it == all_vars_.end()) {
+    VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
+    return false;
+  }
+
+  return true;
+}
+
 void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
                                              const std::string &p_name,
                                              const std::string &g_name) const {
@@ -956,7 +975,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
         CreateReduceOp(result, g_name, 0);
         CreateBroadcastOp(result, g_name, 0);
       } else {
-        CreateAllReduceOp(result, g_name);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+        CreateAllReduceOp(result, g_name, IsEncoded(p_name));
+#else
+        PADDLE_ENFORCE(false, "Compiled withoud cuda!");
+#endif
       }
       break;
     default:
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 0ee3a06062..8bfd7b9bf8 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -75,7 +75,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   bool IsSparseGradient(const std::string &og) const;
 
-  void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
+  void CreateAllReduceOp(ir::Graph *result, const std::string &og,
+                         bool is_encoded = false) const;
 
   void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
@@ -171,6 +172,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 
   mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
   mutable bool need_broadcast_var_{false};
+
+  bool IsEncoded(const std::string &p_name) const;
 };
 
 std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 30da029ca2..95d62e6641 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
 
 std::string VarHandle::DebugString() const {
   std::stringstream ss;
-  ss << name_ << ":" << place_;
+  ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_
+     << ", scope_idx:" << scope_idx_;
   return ss.str();
 }
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 8f9c6cb5e9..353db43521 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -373,6 +373,11 @@ std::vector<std::string> OpDesc::AttrNames() const {
   return retv;
 }
 
+void OpDesc::RemoveAttr(const std::string &name) {
+  attrs_.erase(name);
+  need_update_ = true;
+}
+
 void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
   // NOTICE(minqiyang): pybind11 will take the empty list in python as
   // the std::vector<int> type in C++; so we have to change the attr's type
@@ -644,6 +649,7 @@ void OpDesc::CheckAttrs() {
     // not by users.
     return;
   }
+  VLOG(10) << "begin to check attribute of " << Type();
   checker->Check(&attrs_);
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index d7352c5ee5..dedaf24364 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -72,6 +72,7 @@ class OpDesc {
   std::vector<std::string> AttrNames() const;
 
   void SetAttr(const std::string &name, const Attribute &v);
+  void RemoveAttr(const std::string &name);
 
   void SetBlockAttr(const std::string &name, BlockDesc *block);
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index eef84d17a4..b0ac73f9f5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1110,8 +1110,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           proto::VarType::Type tmp = t->type();
           PADDLE_ENFORCE(
               tmp == data_type || data_type == dafault_data_type,
-              "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-              Type(), DataTypeToString(data_type), DataTypeToString(tmp));
+              "DataType of Paddle Op %s %s must be the same. Get (%d) != (%d)",
+              Type(), input.first, DataTypeToString(data_type),
+              DataTypeToString(tmp));
           data_type = tmp;
         }
       }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 5e0be5d445..fb433ff2a2 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -49,6 +49,11 @@ set(SHARED_INFERENCE_SRCS
     ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
+# FIXME(gongwb): hidden libdgc.a
+if(WITH_GPU AND NOT WIN32)
+    set(fluid_modules ${fluid_modules} dgc)
+endif()
+
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
               analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index afac8e4d2a..e52e83673f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -48,7 +48,7 @@ if (WITH_DISTRIBUTE)
     SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
 endif()
 
-register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
 
 if (WITH_GPU)
     # warpctc_op needs cudnn 7 above
@@ -72,6 +72,12 @@ endif()
 
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
+if (WITH_GPU AND NOT WIN32)
+    op_library(dgc_op DEPS dgc)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n")
+    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)
+endif()
+
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
index eae86a373b..5720b295ec 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -14,69 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/clip_by_norm_op.h"
 
-namespace paddle {
-namespace operators {
-
-class ClipByNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ClipByNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ClipByNormOp should not be null.");
-    auto max_norm = ctx->Attrs().Get<float>("max_norm");
-    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input of clip_by_norm op."
-             "The number of dimensions must be between [1, 9].");
-    AddOutput("Out",
-              "(Tensor) The output of clip_by_norm op with shape as input(X)");
-    AddAttr<float>("max_norm", "(float) The maximum norm value.");
-    AddComment(R"DOC(
-ClipByNorm Operator.
-
-This operator limits the L2 norm of the input $X$ within $max\_norm$.
-If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be
-the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will
-be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
-shown in the following formula:
-
-$$
-Out = \\frac{max\\_norm * X}{norm(X)},
-$$
-
-where $norm(X)$ represents the L2 norm of $X$.
-
-Examples:
-        .. code-block:: python
-
-            data = fluid.layer.data(
-                name='data', shape=[2, 4, 6], dtype='float32')
-            reshaped = fluid.layers.clip_by_norm(
-                x=data, max_norm=0.5)
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
                              ops::ClipByNormOpMaker);
+
 REGISTER_OP_CPU_KERNEL(
     clip_by_norm,
     ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
index 49e734ce96..d8baa4b8b2 100644
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -83,5 +83,59 @@ class ClipByNormKernel : public framework::OpKernel<T> {
   }
 };
 
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipByNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipByNormOp should not be null.");
+    auto max_norm = ctx->Attrs().Get<float>("max_norm");
+    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input of clip_by_norm op."
+             "The number of dimensions must be between [1, 9].");
+    AddOutput("Out",
+              "(Tensor) The output of clip_by_norm op with shape as input(X)");
+    AddAttr<float>("max_norm", "(float) The maximum norm value.");
+    AddComment(R"DOC(
+ClipByNorm Operator.
+
+This operator limits the L2 norm of the input $X$ within $max\_norm$.
+If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be
+the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will
+be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
+shown in the following formula:
+
+$$
+Out = \\frac{max\\_norm * X}{norm(X)},
+$$
+
+where $norm(X)$ represents the L2 norm of $X$.
+
+Examples:
+        .. code-block:: python
+
+            data = fluid.layer.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
+
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
new file mode 100644
index 0000000000..6ebad4de3c
--- /dev/null
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DGCClipByNormOp : public ClipByNormOp {
+ public:
+  using ClipByNormOp::ClipByNormOp;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("current_step"),
+                   "current_step should be set.");
+
+    return ClipByNormOp::InferShape(ctx);
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "current_step") {
+      VLOG(10) << "var_name:" << var_name << " need not to transform";
+      return expected_kernel_type;
+    }
+
+    return framework::OperatorWithKernel::GetKernelTypeForVar(
+        var_name, tensor, expected_kernel_type);
+  }
+};
+
+class DGCClipByNormOpMaker : public ClipByNormOpMaker {
+ public:
+  void Make() override {
+    AddInput("current_step", "(Tensor) Current step.");
+    AddAttr<float>("rampup_begin_step",
+                   "(float, -1.0)"
+                   "The period when begin k_select.")
+        .SetDefault(-1.0);
+
+    return ClipByNormOpMaker::Make();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp,
+                             ops::DGCClipByNormOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    dgc_clip_by_norm,
+    ops::DGCClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu
new file mode 100644
index 0000000000..e7f564b7ab
--- /dev/null
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    dgc_clip_by_norm,
+    ops::DGCClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h
new file mode 100644
index 0000000000..bd22d16f7a
--- /dev/null
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
+    if (static_cast<int>(rampup_begin_step) >= 0) {
+      auto current_step_tensor =
+          context.Input<framework::Tensor>("current_step");
+      auto* current_step = current_step_tensor->data<T>();
+
+      if (static_cast<int>(*current_step) <
+          static_cast<int>(rampup_begin_step)) {
+        VLOG(10) << "current_step:" << *current_step
+                 << " < rampup_begin_step:" << rampup_begin_step
+                 << " so does't use dgc_clip_by_norm";
+        return;
+      }
+    }
+
+    return ClipByNormKernel<DeviceContext, T>::Compute(context);
+  };
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc
new file mode 100644
index 0000000000..ccdeea2d0a
--- /dev/null
+++ b/paddle/fluid/operators/dgc_op.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/dgc_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class DGCOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("current_step"),
+                   "Input(current_step) of DGCop should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
+                   "Output(U_out) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
+                   "Output(V_out) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("k"),
+                   "Output(k) of DGCop should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
+                   "Output(EncodeGrad) of DGCop should not be null.");
+  }
+
+ protected:
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "current_step" || var_name == "rampup_step" ||
+        var_name == "k") {
+      VLOG(10) << "var_name:" << var_name << " need not to transform";
+      return expected_kernel_type;
+    }
+
+    return framework::OperatorWithKernel::GetKernelTypeForVar(
+        var_name, tensor, expected_kernel_type);
+  }
+};
+
+class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("U", "(Tensor) Middle tensor of DGC");
+    AddInput("V", "(Tensor) Middle tensor of DGC");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("current_step", "(Tensor) Current step.");
+
+    AddOutput("U_out",
+              "(Tensor) "
+              "Output encoded gradient");
+    AddOutput("V_out",
+              "(Tensor) "
+              "Output encoded gradient");
+    AddOutput("EncodeGrad",
+              "(Tensor) "
+              "Output encoded gradient");
+    AddOutput("Grad_out",
+              "(Tensor) "
+              "Output grad gradient");
+    AddOutput("k",
+              "(Tensor) "
+              "Output top-k value");
+
+    AddAttr<float>("m",
+                   "(float, 0.9) "
+                   "The momentum of learning rate.")
+        .SetDefault(0.9);
+
+    AddAttr<bool>("use_nesterov",
+                  "(bool, true)"
+                  "The momentum of learning rate.")
+        .SetDefault(true);
+
+    AddAttr<std::vector<float>>("sparsity",
+                                "(vecotr, float)"
+                                "The period sparsity of k_select.");
+
+    AddAttr<float>("rampup_begin_step",
+                   "(float, 0.0)"
+                   "The period when begin k_select.")
+        .SetDefault(0.0);
+
+    AddAttr<float>("rampup_step",
+                   "(float, 0.0)"
+                   "The period when begin k_select.");
+
+    AddComment(R"DOC(
+    Original paper is https://arxiv.org/abs/1712.01887
+
+    DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\
+        only gradients larger than a threshold are transmitted.
+
+    To avoid losing information, DGC accumulate the rest of the gradients locally.
+
+    Eventually, these gradients become large enough to be transmitted.
+
+    Thus, DGC send the large gradients immediately but eventually send all of the gradients over time.
+
+    To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance.
+
+    DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
+
+    This optimizer will do two things:
+        
+        1. Compress the gradient by get TopK import value from tensor \
+            and use it for allreduce to reduce network bandwidth.
+    
+        2. Call momentum to optimize on the cost.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dgc, ops::DGCOp, ops::DGCOpMaker);
diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu
new file mode 100644
index 0000000000..0f0bf441a7
--- /dev/null
+++ b/paddle/fluid/operators/dgc_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/dgc_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    dgc, ops::DGCOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h
new file mode 100644
index 0000000000..8d1683bdb2
--- /dev/null
+++ b/paddle/fluid/operators/dgc_op.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "dgc/dgc.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+
+namespace paddle {
+namespace operators {
+
+inline float get_period_sparcity(const std::vector<float>& sparsity,
+                                 float cur_step, float rampup_steps) {
+  PADDLE_ENFORCE(static_cast<int>(cur_step) >= 0);
+
+  size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
+  if (idx >= sparsity.size()) {
+    return 0.999;
+  }
+
+  PADDLE_ENFORCE(idx < sparsity.size());
+  return sparsity[idx];
+}
+
+template <typename DeviceContext, typename T>
+class DGCOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto u = ctx.Input<framework::Tensor>("U");
+    auto v = ctx.Input<framework::Tensor>("V");
+    auto g = ctx.Input<framework::Tensor>("Grad");
+
+    // attrs
+    float m = ctx.Attr<float>("m");
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto sparsity = ctx.Attr<std::vector<float>>("sparsity");
+    auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step");
+    auto rampup_step = ctx.Attr<float>("rampup_step");
+
+    // current step
+    auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
+    const float* current_step = current_step_tensor->data<float>();
+
+    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+      VLOG(10) << "current_step:" << *current_step
+               << " < rampup_begin_step:" << rampup_begin_step
+               << " so does't use dgc";
+      return;
+    }
+
+    float ratio =
+        1 - get_period_sparcity(sparsity, static_cast<float>(*current_step),
+                                rampup_step);
+    PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0);
+    int k = static_cast<int>(g->numel() * ratio);
+
+    VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
+             << ", rampup_begin_step:" << rampup_begin_step
+             << ", rampup_step:" << rampup_step
+             << ",  current_step:" << *current_step << ", ratio:" << ratio
+             << ", k:" << k;
+
+    auto k_out = ctx.Output<framework::Tensor>("k");
+    T* k_out_data = k_out->data<T>();
+    *k_out_data = k;
+
+    auto u_out = ctx.Output<framework::Tensor>("U_out");
+    auto v_out = ctx.Output<framework::Tensor>("V_out");
+    auto encode_grad_out = ctx.Output<framework::Tensor>("EncodeGrad");
+
+    // FIXME(gongwb): use cublas.
+    auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
+    auto u_e = framework::EigenVector<T>::Flatten(*u);
+    auto g_e = framework::EigenVector<T>::Flatten(*g);
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto& eigen_ctx = *dev_ctx.eigen_device();
+    if (use_nesterov) {
+      // u = m * (u + g)
+      u_out_e.device(eigen_ctx) = m * (u_e + g_e);
+
+      // v = u + v + g
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, u, v, 0, AddFunctor<T>(), v_out);
+
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, g, v, 0, AddFunctor<T>(), v_out);
+    } else {
+      // u = m * u + g
+      u_out_e.device(eigen_ctx) = m * u_e + g_e;
+
+      // v = u + v
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, u, v, 0, AddFunctor<T>(), v_out);
+    }
+
+    T* v_out_data = v_out->mutable_data<T>(ctx.GetPlace());
+    T* u_out_data = u_out->mutable_data<T>(ctx.GetPlace());
+    T* encode_grad_out_data = encode_grad_out->mutable_data<T>(
+        framework::DDim{2 * k}, ctx.GetPlace());
+
+    int buf_size = paddle::communication::dgc::get_buffer_size(k);
+    auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get(
+        ctx.GetPlace(), dev_ctx.stream());
+    auto tmp_ious_data = allocator.Allocate(buf_size);
+    void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
+
+    if (!paddle::communication::dgc::k_select(
+            static_cast<void*>(encode_grad_out_data), k, v_out_data,
+            static_cast<int>(v_out->numel()), buf, dev_ctx.stream(),
+            u_out_data)) {
+      LOG(FATAL) << "v_out numel:" << v_out->numel();
+    }
+
+    auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
+    math::SetConstant<DeviceContext, T> tset;
+    tset(dev_ctx, grad_out, static_cast<T>(0));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 9220d35707..c3db59563f 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,8 +46,9 @@ cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
 
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
+    set(dgc_deps dgc)
 ELSE()
-    set(GPU_CTX_DEPS)
+    set(dgc_deps)
 ENDIF()
 
 IF(WITH_MKLDNN)
@@ -68,7 +69,8 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
-    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}  temp_allocator)
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
+    temp_allocator ${dgc_deps})
 
 if(WIN32)
     if(WITH_GPU AND NOT WITH_DSO)
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 2e8fa7c1b8..497c7b3c87 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -37,13 +37,13 @@ limitations under the License. */
     }                                                                   \
   } while (0)
 
-#define PADDLE_ASSERT_MSG_CODE(e, m, c)                                    \
-  do {                                                                     \
-    if (!(e)) {                                                            \
-      printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \
-             TOSTRING(e), m, c);                                           \
-      asm("trap;");                                                        \
-    }                                                                      \
+#define PADDLE_ASSERT_MSG_CODE(e, m, c)                                     \
+  do {                                                                      \
+    if (!(e)) {                                                             \
+      printf("%s:%d Assertion `%s` failed (%s %ld).\n", __FILE__, __LINE__, \
+             TOSTRING(e), m, c);                                            \
+      asm("trap;");                                                         \
+    }                                                                       \
   } while (0)
 #else
 #include <assert.h>
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 48002a7620..61386bdf05 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace platform {
 
@@ -324,8 +326,17 @@ void CUDADeviceContext::Wait() const {
   auto& allocator =
       DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
   allocator.Release([this]() {
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    PADDLE_ENFORCE(cudaGetLastError());
+    cudaError_t e_sync = cudaStreamSynchronize(stream_);
+    if (e_sync != 0) {
+      LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync)
+                 << " errno:" << e_sync;
+    }
+
+    cudaError_t e_get = cudaGetLastError();
+    if (e_get != 0) {
+      LOG(FATAL) << "cudaGetLastError  " << cudaGetErrorString(e_get)
+                 << " errno:" << e_get;
+    }
   });
 }
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index d53a4029e1..407d1b1299 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -31,6 +31,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "dgc/dgc.h"
+#endif
+
 DEFINE_int32(paddle_num_threads, 1,
              "Number of threads for each paddle instance.");
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
@@ -43,6 +47,10 @@ namespace framework {
 std::once_flag gflags_init_flag;
 std::once_flag p2p_init_flag;
 
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+std::once_flag dgc_init_flag;
+#endif
+
 void InitGflags(std::vector<std::string> argv) {
   std::call_once(gflags_init_flag, [&]() {
     FLAGS_logtostderr = true;
@@ -203,5 +211,15 @@ void InitGLOG(const std::string &prog_name) {
 #endif
 }
 
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void InitDGC() {
+  std::call_once(dgc_init_flag, []() {
+    PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib());
+  });
+}
+#else
+void InitDGC() {}
+#endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index 0e30594672..01d66f57dc 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -30,5 +30,7 @@ void InitDevices(bool init_p2p);
 
 void InitDevices(bool init_p2p, const std::vector<int> devices);
 
+void InitDGC();
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 7b5e417504..31b5dd5d7c 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -222,6 +222,7 @@ void BindOpDesc(pybind11::module *m) {
       .def("attr_type", &pd::OpDesc::GetAttrType)
       .def("attr_names", &pd::OpDesc::AttrNames)
       .def("_set_attr", &pd::OpDesc::SetAttr)
+      .def("remove_attr", &pd::OpDesc::RemoveAttr)
       .def("attr", &pd::OpDesc::GetAttr)
       .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
       .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index dca40edf0b..3b0939ef82 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -933,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
+  m.def("init_dgc", framework::InitDGC);
   m.def("init_devices",
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 85e1916a3a..4a5301b436 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1202,6 +1202,9 @@ class Operator(object):
         """
         self._update_desc_attr(name, val)
 
+    def _remove_attr(self, name):
+        self.desc.remove_attr(name)
+
     def _update_desc_attr(self, name, val):
         """
         Update the value of desc's attribute by attribute's name.
@@ -2725,6 +2728,10 @@ class Program(object):
         self._trainers_endpoints = []
         # the distributed lookup table names
         self._distributed_lookup_table = None
+
+        # use Deep gradient comrepssion or not
+        self._enable_dgc = False
+
         # @deprecated(the python memory optimize transpiler is deprecated)
         # whether the program is optimized by memory_optimize_transpiler
         self.__is_mem_optimized = False
@@ -2775,6 +2782,15 @@ class Program(object):
     def set_op_role_var(self, var_name):
         self._op_role_var = [var_name]
 
+    @contextlib.contextmanager
+    def _backward_role_guard(self):
+        tmp_role = self._current_role
+
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.Backward
+        yield
+        self._current_role = tmp_role
+
     @signature_safe_contextmanager
     def _optimized_guard(self, param_and_grads):
         """
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c0deb5eacc..e21f303a3e 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 from collections import defaultdict
 from .wrapped_decorator import signature_safe_contextmanager
 
-from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 
 from . import framework
@@ -31,13 +31,17 @@ from .layer_helper import LayerHelper
 from .layers import ops
 from .regularizer import append_regularization_ops
 from .imperative import base as imperative_base
+from paddle.fluid import core
+from paddle.fluid.layers import tensor
+from functools import reduce
+import copy
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
     'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
-    'LarsMomentumOptimizer'
+    'LarsMomentumOptimizer', 'DGCMomentumOptimizer'
 ]
 
 
@@ -294,6 +298,9 @@ class Optimizer(object):
                     outputs={"ParamOut": param_and_grad[0]})
         return new_param_grads, (table_param, table_grad), sgd_op
 
+    def _append_dgc_ops(self, param_and_grad):
+        pass
+
     def backward(self,
                  loss,
                  startup_program=None,
@@ -415,6 +422,9 @@ class Optimizer(object):
             with program_guard(program, startup_program):
                 params_grads = self.backward(loss, startup_program,
                                              parameter_list, no_grad_set)
+                # Note: since we can't use all_reduce_op now,
+                #  dgc_op should be the last op of one grad.
+                self._append_dgc_ops(params_grads)
                 optimize_ops = self.apply_gradients(params_grads)
 
         return optimize_ops, params_grads
@@ -552,6 +562,264 @@ class MomentumOptimizer(Optimizer):
         return momentum_op
 
 
+class DGCMomentumOptimizer(MomentumOptimizer):
+    """
+
+    Original paper is https://arxiv.org/abs/1712.01887
+
+    DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\
+        only gradients larger than a threshold are transmitted.
+
+    To avoid losing information, DGC accumulate the rest of the gradients locally.
+
+    Eventually, these gradients become large enough to be transmitted.
+
+    Thus, DGC send the large gradients immediately but eventually send all of the gradients over time.
+
+    To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance.
+
+    DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
+
+    This optimizer will do two things:
+        
+        1. Compress the gradient by get TopK import value from tensor \
+            and use it for allreduce to reduce network bandwidth.
+    
+        2. Call momentum to optimize on the cost.
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+            Can be a float value or a Variable with one float value as data element.
+        momentum (float): Momentum factor.
+        rampup_begin_step (int): The begining step from which gradient compression is implemented.
+        rampup_step (int): How long it use the sparsity periods. Default is 1.
+            for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \
+                it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \
+                it will use 0.999 then and after.
+        sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity).
+        use_nesterov (bool): Enables Nesterov momentum. True means use nesterov.
+        local_grad_clip_norm (float): Clip norm value if needed.
+        num_trainers: The number of training node.
+        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.DGCMomentumOptimizer(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr),
+                momentum=0.9,
+                rampup_begin_step=1252,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            optimizer.minimize(cost)
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 rampup_begin_step,
+                 rampup_step=1,
+                 sparsity=[0.999],
+                 use_nesterov=False,
+                 local_grad_clip_norm=None,
+                 num_trainers=None,
+                 regularization=None,
+                 name=None):
+        self._sparsity = sparsity
+        self._rampup_step = rampup_step
+        self._rampup_step_var = None
+
+        self._rampup_begin_step = rampup_begin_step
+        self._rampup_begin_step_var = None
+
+        self._global_step_var = None
+        self._local_grad_clip_norm = None
+        self._clip_norm = None
+
+        if local_grad_clip_norm is not None:
+            assert isinstance(num_trainers, int)
+            assert isinstance(local_grad_clip_norm, float)
+            assert num_trainers > 0
+
+            self._local_grad_clip_norm = local_grad_clip_norm
+            self._num_trainers = num_trainers
+            self._clip_norm = local_grad_clip_norm / (num_trainers *
+                                                      num_trainers)
+
+        super(DGCMomentumOptimizer, self).__init__(
+            learning_rate, momentum, use_nesterov, regularization, name)
+
+        core.init_dgc()
+
+    def _add_auto_increment_var(self, counter_name, begin, step=1):
+        helper = LayerHelper('global_step_counter')
+        counter, is_new_var = helper.create_or_get_global_variable(
+            name=counter_name, dtype='float32', shape=[1], persistable=True)
+        if is_new_var:
+            helper.set_variable_initializer(
+                counter,
+                initializer=Constant(
+                    value=float(begin - 1), force_cpu=True))
+            helper.main_program.global_block()._prepend_op(
+                type='increment',
+                inputs={'X': [counter]},
+                outputs={'Out': [counter]},
+                attrs={'step': float(step)},
+                stop_gradient=True)
+            counter.stop_gradient = True
+
+        return counter
+
+    def _append_dgc_ops(self, param_and_grads):
+        start_program = default_startup_program()
+        main_program = default_main_program()
+        main_program._enable_dgc = True
+
+        # step counter
+        self._global_step_var = self._add_auto_increment_var(
+            counter_name='__g_dgc_counter__', begin=0)
+
+        # rampup begin step var for all_reduce_op_handle
+        self._rampup_begin_step_var = tensor.create_global_var(
+            shape=[1],
+            dtype=core.VarDesc.VarType.FP32,
+            persistable=True,
+            name='__g_rampup_begin_step__',
+            value=self._rampup_begin_step * 1.0,
+            force_cpu=True)
+
+        for param_var, grad_var in param_and_grads:
+            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            if var_numel < 16384 or \
+                param_var.type == core.VarDesc.VarType.SELECTED_ROWS  or \
+                grad_var.type == core.VarDesc.VarType.SELECTED_ROWS  or  \
+                    param_var.dtype != core.VarDesc.VarType.FP32 :
+                continue
+
+            u_var = tensor.create_global_var(
+                shape=param_var.shape,
+                dtype=param_var.dtype,
+                persistable=True,
+                name=param_var.name + "__dgc_u__",
+                value=0.0)
+            v_var = tensor.create_global_var(
+                shape=param_var.shape,
+                dtype=param_var.dtype,
+                persistable=True,
+                name=param_var.name + "__dgc_v__",
+                value=0.0)
+
+            k_var = tensor.create_global_var(
+                shape=[1],
+                dtype=param_var.dtype,
+                persistable=True,
+                name=param_var.name + "__dgc_k__",
+                value=0.0,
+                force_cpu=True)
+
+            encoded_var = tensor.create_global_var(
+                shape=[1],
+                dtype=param_var.dtype,
+                persistable=True,
+                name=param_var.name + "__dgc_encoded__",
+                value=0.0,
+                force_cpu=False)
+
+            # del back oprolevarname
+            op_maker = core.op_proto_and_checker_maker
+            backward = core.op_proto_and_checker_maker.OpRole.Backward
+            for op in main_program.global_block().ops:
+                if not self._is_the_backward_op(op):
+                    continue
+
+                var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
+                if param_var.name not in var_attr:
+                    continue
+
+                var_attr.remove(param_var.name)
+                var_attr.remove(grad_var.name)
+                if len(var_attr) > 1:
+                    op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
+                else:
+                    op._remove_attr(op_maker.kOpRoleVarAttrName())
+
+            clip_var = grad_var
+            if self._local_grad_clip_norm is not None:
+                clip_var = self._append_clip_norm(grad_var, self._clip_norm)
+            self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
+                         encoded_var)
+
+    def _is_the_backward_op(self, op):
+        op_maker = core.op_proto_and_checker_maker
+        backward = core.op_proto_and_checker_maker.OpRole.Backward
+        if op_maker.kOpRoleVarAttrName() in op.attr_names and \
+                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
+            return True
+        return False
+
+    def _clip_by_norm(self, x, max_norm, name=None):
+        args = {'x': x, 'max_norm': max_norm, 'name': name}
+
+        helper = LayerHelper("dgc_clip_by_norm_op", **args)
+
+        if name is None:
+            name = unique_name.generate(".".join([helper.name, 'tmp']))
+
+        out = helper.create_variable(
+            type=x.type, name=name, dtype=x.dtype, persistable=False)
+
+        helper.append_op(
+            type="clip_by_norm",
+            inputs={"X": x,
+                    "current_step": self._global_step_var},
+            attrs={
+                "max_norm": max_norm,
+                "rampup_begin_step": float(self._rampup_begin_step)
+            },
+            outputs={"Out": out})
+        return out
+
+    def _append_clip_norm(self, grad_var, clip_norm):
+        with grad_var.block.program._backward_role_guard():
+            return self._clip_by_norm(
+                x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC")
+
+    def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
+                encoded_var):
+        block = framework.default_main_program().global_block()
+        op_maker = core.op_proto_and_checker_maker
+        dgc_op = block.append_op(
+            type="dgc",
+            inputs={
+                "U": u_var,
+                "V": v_var,
+                "Grad": clip_var,
+                "current_step": self._global_step_var
+            },
+            outputs={
+                "U_out": u_var,
+                "V_out": v_var,
+                "EncodeGrad": encoded_var,
+                "k": k_var,
+                "Grad_out": grad_var
+            },
+            attrs={
+                "m": self._momentum,
+                "sparsity": self._sparsity,
+                "use_nesterov": self._use_nesterov,
+                "rampup_begin_step": float(self._rampup_begin_step),
+                "rampup_step": float(self._rampup_step)
+            },
+            stop_gradient=True)
+
+        backward = op_maker.OpRole.Backward
+        dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward)
+        dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
+                         [param_var.name, grad_var.name])
+
+
 class LarsMomentumOptimizer(Optimizer):
     """
     Momentum optimizer with LARS support
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6702fc808b..6b88e7a99f 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -103,6 +103,12 @@ class ParallelExecutor(object):
         ) if use_cuda else framework.cpu_places()
         self._scope = scope if scope is not None else executor.global_scope()
 
+        if main_program is not None and main_program._enable_dgc:
+            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
+            assert num_trainers * len(
+                self._places) > 1, "dgc is not useful for single card training"
+            assert use_cuda
+
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index cefa2b4919..d139feac6f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dgc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
@@ -97,6 +98,7 @@ if(WITH_DISTRIBUTE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+        py_test_modules(test_dgc_op MODULES test_dgc_op)
         set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
         py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
         set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
@@ -107,16 +109,20 @@ if(WITH_DISTRIBUTE)
     endif(NOT APPLE)
     # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
+
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+
 if(NOT WIN32)
-py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
+    py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
 endif()
+
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
+
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
     # change the timeout from 600 to 2200, because in debug mode, this test need more time.
     set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 1c45a10a9d..c598260e13 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -73,7 +73,7 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
+    def get_model(self, batch_size=2, use_dgc=False):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -93,7 +93,11 @@ class TestDistMnist2x2(TestDistRunnerBase):
         # TODO(typhoonzero): fix distributed adam optimizer
         # opt = fluid.optimizer.AdamOptimizer(
         #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
+        if not use_dgc:
+            opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
+        else:
+            opt = fluid.optimizer.DGCMomentumOptimizer(
+                learning_rate=self.lr, momentum=0.9, rampup_begin_step=0)
 
         # Reader
         train_reader = paddle.batch(
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index c3d84dba0a..a2fd61e238 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -210,7 +210,7 @@ class SE_ResNeXt():
 
 
 class DistSeResneXt2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
+    def get_model(self, batch_size=2, use_dgc=False):
         # Input data
         image = fluid.layers.data(
             name="data", shape=[3, 224, 224], dtype='float32')
@@ -237,11 +237,19 @@ class DistSeResneXt2x2(TestDistRunnerBase):
         base_lr = 0.1
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
 
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr),
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
+        if not use_dgc:
+            optimizer = fluid.optimizer.Momentum(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr),
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+        else:
+            optimizer = fluid.optimizer.DGCMomentumOptimizer(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr),
+                momentum=0.9,
+                rampup_begin_step=0,
+                regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
 
         # Reader
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/test_dgc_op.py
new file mode 100644
index 0000000000..04766dd858
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dgc_op.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+
+g_array_size = 102400
+
+
+class TestDGCOp(unittest.TestCase):
+    def setup(self, place, array_size=g_array_size):
+        size = array_size
+        np.random.seed(5)  # fix seed
+
+        self.scope = fluid.global_scope()
+        self.place = place
+        print("place:", place)
+
+        # numpy data
+        # inputs: U, V, Grad, current_step
+        self.u_name = "U"
+        self.u = np.random.random(size).astype("float32")
+
+        self.v_name = "V"
+        self.v = np.random.random(size).astype("float32")
+
+        self.grad_name = "Grad"
+        self.grad = np.random.random(size).astype("float32")
+
+        self.current_step_name = "current_step"
+        self.current_step = np.full((1), 0.0).astype("float32")
+
+        # output: U_out, V_out, EncodeGrad, GradLocal_out
+        self.encode_grad_name = "EncodeGrad"
+        self.k_name = "k"
+        self.k = np.full((1), 0.0).astype("float32")
+
+        # scope data 
+        self.u_tensor = self.scope.var(self.u_name).get_tensor()
+        self.u_tensor.set(self.u, place)
+
+        self.v_tensor = self.scope.var(self.v_name).get_tensor()
+        self.v_tensor.set(self.v, place)
+
+        self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
+        self.grad_tensor.set(self.grad, place)
+
+        self.encode_grad_tensor = self.scope.var(
+            self.encode_grad_name).get_tensor()
+
+        self.current_step_tensor = self.scope.var(
+            self.current_step_name).get_tensor()
+        self.current_step_tensor.set(self.current_step, core.CPUPlace())
+
+        self.k_tensor = self.scope.var(self.k_name).get_tensor()
+        self.k_tensor.set(self.k, core.CPUPlace())
+
+    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+
+    def test_run_and_check(self):
+        self.setup(place=core.CUDAPlace(0))
+        kwargs = {
+            # inputs
+            'U': self.u_name,
+            'V': self.v_name,
+            'Grad': self.grad_name,
+            'current_step': self.current_step_name,
+
+            # outputs
+            'U_out': self.u_name,
+            'V_out': self.v_name,
+            'EncodeGrad': self.encode_grad_name,
+            'Grad_out': self.grad_name,
+            'k': self.k_name,
+
+            # attrs
+            'm': 0.9,
+            'sparsity': [0.75, 0.9375, 0.984375, 0.996, 0.999],
+            'use_nesterov': True,
+            'rampup_begin_step': float(0.0),
+            'rampup_step': float(10.0),
+        }
+
+        dgc_op = Operator('dgc', **kwargs)
+
+        #atol = 1e-6
+        dgc_op.run(self.scope, self.place)
+
+        u_out = np.array(self.u_tensor)
+        v_out = np.array(self.v_tensor)
+        grad_out = np.array(self.grad_tensor)
+        encode_grad_out = np.array(self.encode_grad_tensor)
+        k = int(np.array(self.k_tensor)[0])
+
+        print("u_out:", u_out[0:20])
+        print("v_out:", v_out[0:20])
+        print("encode_grad_out:", encode_grad_out)
+        print("k_out:", k)
+
+        self.assertEqual(k, int(g_array_size * 0.25))
+
+        index = encode_grad_out[0:k].view(dtype=np.int32)
+        value = encode_grad_out[k:2 * k]
+
+        acl = 1e-7
+
+        for i in range(0, k):
+            self.assertAlmostEqual(u_out[index[i]], 0.0)
+            self.assertAlmostEqual(v_out[index[i]], 0.0)
+
+        a_min = np.amin(value)
+        dangling = [x for x in v_out if x > a_min]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 969f5cb63c..9c0efe6d90 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -36,7 +36,8 @@ class TestDistRunnerBase(object):
     def get_model(self,
                   batch_size=DEFAULT_BATCH_SIZE,
                   lr=0.1,
-                  single_device=False):
+                  single_device=False,
+                  use_dgc=False):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
 
@@ -82,6 +83,9 @@ class TestDistRunnerBase(object):
         if args.nccl2_reduce_layer_local_run:
             test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                 self.get_model(batch_size=args.batch_size, single_device=True)
+        elif args.use_dgc:
+            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
         else:
             test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                 self.get_model(batch_size=args.batch_size)
@@ -200,6 +204,7 @@ def runtime_main(test_class):
     parser.add_argument('--sync_mode', action='store_true')
     parser.add_argument('--mem_opt', action='store_true')
     parser.add_argument('--use_cuda', action='store_true')
+    parser.add_argument('--use_dgc', action='store_true')
     parser.add_argument('--use_reduce', action='store_true')
     parser.add_argument('--dc_asgd', action='store_true')
     parser.add_argument(
@@ -235,6 +240,7 @@ class TestDistBase(unittest.TestCase):
     def _after_setup_config(self):
         if self._enforce_place == "CPU":
             self.__use_cuda = False
+            self._use_dgc = False
         elif self._enforce_place == "GPU":
             self.__use_cuda = True
         else:
@@ -242,6 +248,10 @@ class TestDistBase(unittest.TestCase):
                 self.__use_cuda = True
             else:
                 self.__use_cuda = False
+                self._use_dgc = False
+
+        if self._use_reduce:
+            assert not self._use_dgc
 
     def setUp(self):
         self._trainers = 2
@@ -264,6 +274,7 @@ class TestDistBase(unittest.TestCase):
         # test, reduce check this argument everywhere.
         self._nccl2_reduce_layer = False
         self._lr = 0.001
+        self._use_dgc = False
         self._setup_config()
         self._after_setup_config()
 
@@ -506,6 +517,9 @@ class TestDistBase(unittest.TestCase):
             env0 = {'CPU_NUM': '1'}
             env1 = {'CPU_NUM': '1'}
 
+        if self._use_dgc:
+            tr0_cmd += " --use_dgc"
+            tr1_cmd += " --use_dgc"
         if self._mp_mode:
             env0 = {"FLAGS_selected_gpus": "0"}
             env1 = {"FLAGS_selected_gpus": "1"}
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index 030860ec79..b9d2f6db39 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -39,6 +39,20 @@ class TestDistMnistNCCL2(TestDistBase):
             self.check_with_place("dist_mnist.py", delta=1e-5)
 
 
+class TestDistMnistNCCL2DGC(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._use_dgc = True
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_mnist.py", delta=1e-5)
+
+
 class TestDistMnist2x2Lars(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index 28602d3251..4e9ca01f43 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -60,5 +60,20 @@ class TestDistSeResneXt2x2Async(TestDistBase):
         self.check_with_place("dist_se_resnext.py", delta=100)
 
 
+class TestDistSeResnetNCCL2DGC(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._use_dgc = True
+
+    @skip_ci
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_se_resnext.py", delta=30)
+
+
 if __name__ == "__main__":
     unittest.main()

From 174d0d0b90a610807d6f82927aad4def227ee643 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Thu, 28 Mar 2019 08:52:08 +0800
Subject: [PATCH 11/19] Revert "Fix allocator bug" add include headers to fix
 travis-ci test=develop

---
 paddle/fluid/framework/operator.h             |  3 +
 paddle/fluid/memory/allocation/CMakeLists.txt | 23 +++--
 .../memory/allocation/aligned_allocator.h     |  2 -
 paddle/fluid/memory/allocation/allocator.cc   | 14 +--
 paddle/fluid/memory/allocation/allocator.h    | 72 ++-------------
 .../memory/allocation/allocator_facade.cc     | 48 ++++------
 .../memory/allocation/allocator_strategy.cc   | 14 +--
 .../memory/allocation/best_fit_allocator.cc   |  2 +-
 .../memory/allocation/best_fit_allocator.h    |  2 +-
 .../memory/allocation/buffered_allocator.cc   | 22 +++--
 .../memory/allocation/buffered_allocator.h    |  6 +-
 .../allocation/buffered_allocator_test.cc     |  3 +-
 .../fluid/memory/allocation/cpu_allocator.cc  | 28 +++---
 .../fluid/memory/allocation/cpu_allocator.h   | 10 +-
 .../fluid/memory/allocation/cuda_allocator.cc | 10 +-
 .../fluid/memory/allocation/cuda_allocator.h  |  9 +-
 .../memory/allocation/legacy_allocator.cc     | 52 +++++------
 .../memory/allocation/legacy_allocator.h      |  2 +-
 .../memory/allocation/locked_allocator.cc     | 19 ++--
 .../memory/allocation/locked_allocator.h      |  6 +-
 .../naive_best_fit_allocator_facade_test.cc   | 91 -------------------
 .../memory/allocation/pinned_allocator.cc     |  9 +-
 .../memory/allocation/pinned_allocator.h      |  8 +-
 .../memory/allocation/retry_allocator.cc      | 18 +++-
 .../fluid/memory/allocation/retry_allocator.h | 23 +++--
 .../memory/allocation/zero_size_allocator.cc  | 11 +--
 .../memory/allocation/zero_size_allocator.h   |  7 +-
 paddle/fluid/platform/temporary_allocator.cc  | 27 ++++--
 paddle/fluid/platform/temporary_allocator.h   | 14 ++-
 paddle/fluid/pybind/pybind.cc                 |  1 -
 paddle/fluid/string/printf.h                  |  6 +-
 31 files changed, 224 insertions(+), 338 deletions(-)
 delete mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc

diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 6d8ba430bd..a02e53dcf7 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -365,6 +365,9 @@ class ExecutionContext {
     auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
         allocation_ptr, deleter);
 
+    PADDLE_ENFORCE(
+        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+        "The AllocationPtr must be TemporaryAllocation.");
     PADDLE_ENFORCE_GE(allocation_ptr->size(),
                       framework::product(dim) * sizeof(T));
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 0f6014ae8a..ac77c3d2a5 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -38,20 +37,30 @@ else ()
     set(AllocatorFacadeDeps)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
-
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS
+        ${AllocatorFacadeDeps}
+        cpu_allocator
+        locked_allocator
+        best_fit_allocator
+        aligned_allocator
+        auto_increment_allocator
+        zero_size_allocator
+        conditional_allocator
+        retry_allocator
+        buffered_allocator
+        allocator_strategy
+        legacy_allocator
+        )
 
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 
-cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
-
 cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
 
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index b536d4276e..064acd06e7 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
         underlying_allocator_->Allocate(size + kAlignment, attr);
     return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
   }
-
-  void FreeImpl(Allocation* allocation) override { delete allocation; }
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 5a5253d911..8fb8a5fb89 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   auto ptr = AllocateImpl(size, attr);
-  ptr->RegisterDecoratedAllocator(this);
+  ptr->set_allocator(this);
   return AllocationPtr(ptr);
 }
 
-void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopDecoratedAllocator();
-  allocator->Free(allocation);
-}
-
-void Allocator::Free(Allocation* allocation) {
-  allocation->PopDecoratedAllocator();
-  FreeImpl(allocation);
-}
+void Allocator::Free(Allocation* allocation) { delete allocation; }
 
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  Allocator* allocator = allocation->TopDecoratedAllocator();
+  auto* allocator = allocation->allocator();
   allocator->Free(allocation);
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 33b816b908..3465278935 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -46,56 +46,13 @@ class Allocator;
 // NOTE: this is the base class of Allocation. Each allocator can use its own
 //       allocation object.
 // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
-
-/**
- * Allocation is returned by Allocator::Allocate() method.
- *
- * An allocator may be decorated by another allocator. For example, we can
- * decorate
- * a RetryAllocator to any allocator to perform allocation retry when first
- * allocation request fails.
- *
- * Explanations of Allocator design is as follows:
- *
- * Suppose we have an allocator which is decorated by several allocators:
- *
- *   A(1) <- A(2) <- A(3) <- ... <- A(n)
- *
- * , and the public allocator is A(1).
- *
- * The allocation process would be:
- *
- *   A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
- *
- * , and the free process would be:
- *
- *   A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
- *
- * Therefore, we should record the allocator chain when allocating, so
- * that we can free the allocation in the reverse order of allocator chain.
- * The field `decorated_allocators_` is used to record this chain.
- *
- * Another example is that we want to add additional fields in Allocation,
- * e.g., something what is done in AlignedAllocator, etc.
- * In this case, we should declare a derived class of Allocation, which
- * contains an underlying Allocation allocated by the underlying allocator.
- * Therefore, `decorated_allocators_` of the new Allocation object would
- * be a new chain, differing from the underlying Allocation object.
- */
 class Allocation {
  public:
   Allocation(void* ptr, size_t size, platform::Place place)
-      : ptr_(ptr), size_(size), place_(place) {
-    // NOTE(zjl): Since decorated_allocators_ is usually a small vector
-    // We reserve a small buffer to it to prevent frequent heap allocation
-    // Not quite sure whether we need something like gtl vector.
-    decorated_allocators_.reserve(8);
-  }
+      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
 
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;
-  Allocation(Allocation&& o) = delete;
-  Allocation& operator=(Allocation&& o) = delete;
 
   // Returns the holding pointer.
   // NOTE: For performance consideration, it is better not to make this method
@@ -117,31 +74,17 @@ class Allocation {
 
   const platform::Place& place() const { return place_; }
 
-  virtual ~Allocation();
-
- private:
-  const std::vector<Allocator*>& DecoratedAllocators() const {
-    return decorated_allocators_;
-  }
-
-  inline void RegisterDecoratedAllocator(Allocator* allocator) {
-    decorated_allocators_.push_back(allocator);
-  }
+  Allocator* allocator() { return allocator_; }
 
-  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
+  void set_allocator(Allocator* allocator) { allocator_ = allocator; }
 
-  inline Allocator* TopDecoratedAllocator() {
-    return decorated_allocators_.back();
-  }
+  virtual ~Allocation();
 
  private:
+  Allocator* allocator_;
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  std::vector<Allocator*> decorated_allocators_;
-
-  friend class Allocator;
-  friend class AllocationDeleter;
 };
 
 using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
@@ -191,12 +134,9 @@ class Allocator {
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 
-  // This function should not be called outside
-  void Free(Allocation* allocation);
-
  protected:
+  virtual void Free(Allocation* allocation);
   virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
-  virtual void FreeImpl(Allocation* allocation);
 
  private:
   friend class AllocationDeleter;
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 09328aded5..a3b73e3ba3 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -49,17 +49,6 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-static inline std::shared_ptr<Allocator> WrapRetryAllocator(
-    std::shared_ptr<Allocator> allocator, int64_t retry_time) {
-  if (retry_time > 0) {
-    auto* retry_allocator =
-        new RetryAllocator(std::move(allocator), retry_time);
-    allocator.reset(retry_allocator);
-  }
-
-  return allocator;
-}
-
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
 class CPUManagedAllocator : public Allocator {
  public:
@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator {
   std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    std::shared_ptr<Allocator> allocator(new LockedAllocator(
-        std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
+    std::unique_ptr<Allocator> allocator(new LockedAllocator(
+        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
 
-    allocator = WrapRetryAllocator(allocator, retry_time_);
+    if (retry_time_ > 0) {
+      auto* retry_allocator =
+          new RetryAllocator(std::move(allocator), retry_time_);
+      allocator.reset(retry_allocator);
+    }
 
     return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
   }
@@ -197,23 +190,13 @@ class AllocatorFacadePrivate {
   ~AllocatorFacadePrivate() = default;
 
   AllocatorFacadePrivate() {
-    auto strategy = GetAllocatorStrategy();
-    switch (strategy) {
-      case AllocatorStrategy::kLegacy: {
-        InitLegacyAllocator();
-        break;
-      }
-      case AllocatorStrategy::kNaiveBestFit: {
-        InitCPUAllocator();
-        InitCUDAAllocator();
-        InitCUDAPinnedAllocator();
-        WrapZeroSizeAllocator();
-        break;
-      }
-      default: {
-        PADDLE_THROW("Unsupported allocator strategy: %d",
-                     static_cast<int>(strategy));
-      }
+    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
+      InitLegacyAllocator();
+    } else {
+      InitCPUAllocator();
+      InitCUDAAllocator();
+      InitCUDAPinnedAllocator();
+      WrapZeroSizeAllocator();
     }
   }
 
@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() {
 
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
+                                     AllocationDeleter());
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index fff94c01e7..8cebda9005 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -19,22 +19,16 @@
 DEFINE_string(
     allocator_strategy, "legacy",
     "The allocation strategy. Legacy means the original allocator of Fluid."
-    "naive_best_fit means the experimental best fit allocator. "
-    "allocator. Enum in [legacy, naive_best_fit].");
+    "New means the experimental allocators of Fluid. in [legacy, new]");
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
 static AllocatorStrategy GetStrategyFromFlag() {
-  if (FLAGS_allocator_strategy == "legacy") {
-    return AllocatorStrategy::kLegacy;
-  } else if (FLAGS_allocator_strategy == "naive_best_fit") {
-    return AllocatorStrategy::kNaiveBestFit;
-  } else {
-    PADDLE_THROW("Unsupported allocator strategy: %s",
-                 FLAGS_allocator_strategy);
-  }
+  return FLAGS_allocator_strategy == "legacy"
+             ? AllocatorStrategy::kLegacy
+             : AllocatorStrategy::kNaiveBestFit;
 }
 
 AllocatorStrategy GetAllocatorStrategy() {
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index d87dd9a4b6..e3d6c2f511 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
   }
   return num;
 }
-void BestFitAllocator::FreeImpl(Allocation* allocation) {
+void BestFitAllocator::Free(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(bf_allocation,
                           "The input allocation is not BestFitAllocation.");
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index c137438c0c..4f10f2b53e 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
   void InsertFreeNode(const ListIt& it);
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index e04c0aa34b..fc75abc9df 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
+BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
     : underlying_allocator_(std::move(allocator)) {
   PADDLE_ENFORCE_NOT_NULL(
       underlying_allocator_,
-      "Underlying allocator of BufferedAllocator must not be null");
+      "Underlying allocator of BufferedAllocator must be unmanaged");
   if (underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
   while (!allocations_.empty()) {  // free the largest
     auto it = --allocations_.end();
     cur += it->second->size();
-    underlying_allocator_->Free(it->second.release());
+    delete it->second.release();
     allocations_.erase(it);
     if (cur >= size) return;
   }
 }
 
-bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
-
-void BufferedAllocator::FreeImpl(Allocation *allocation) {
+bool BufferedAllocator::IsAllocThreadSafe() const {
+  return this->underlying_allocator_->IsAllocThreadSafe();
+}
+void BufferedAllocator::Free(Allocation *allocation) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
   allocations_.emplace(allocation->size(), AllocationPtr(allocation));
 }
-
 Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   {
     platform::LockGuardPtr<std::mutex> guard(mtx_);
@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
     if (it != allocations_.end() && it->first < size * 2) {
       AllocationPtr result(std::move(it->second));
       allocations_.erase(it);
-      return result.release();
+      return new AllocationWithUnderlying(std::move(result));
     }
   }
 
   try {
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   } catch (BadAlloc &) {
     FreeCache(size);
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   }
 }
 
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index c728395705..d44a3f85be 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -31,7 +31,7 @@ namespace allocation {
 // underlying_allocator_
 class BufferedAllocator : public Allocator {
  public:
-  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
+  explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
 
   ~BufferedAllocator();
 
@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
   void FreeCache(size_t size);
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::multimap<size_t, AllocationPtr> allocations_;
   std::unique_ptr<std::mutex> mtx_;
 };
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index 854a117b0e..c8bd5292ca 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <memory>
 #include <utility>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -65,7 +66,7 @@ class StubAllocator : public Allocator {
   size_t GetFreeCount() const { return destruct_count_; }
 
  protected:
-  void FreeImpl(Allocation *allocation) override {
+  void Free(Allocation *allocation) override {
     auto *alloc = dynamic_cast<StubAllocation *>(allocation);
     PADDLE_ENFORCE_NOT_NULL(alloc);
     if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index 90c49c87a6..cc81a6f7b8 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -20,27 +20,25 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+CPUAllocation::CPUAllocation(void *ptr, size_t size)
+    : Allocation(ptr, size, platform::CPUPlace()) {}
+
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 
-void CPUAllocator::FreeImpl(Allocation *allocation) {
-  void *p = allocation->ptr();
-#ifdef _WIN32
-  _aligned_free(p);
-#else
-  free(p);
-#endif
+void CPUAllocator::Free(Allocation *allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
+  free(allocation->ptr());
   delete allocation;
 }
 
 Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  void *p;
-#ifdef _WIN32
-  p = _aligned_malloc(size, kAlignment);
-#else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!",
-                    size);
-#endif
-  return new Allocation(p, size, platform::CPUPlace());
+  void *ptr;
+  auto status = posix_memalign(&ptr, kAlignment, size);
+  if (UNLIKELY(status) != 0) {
+    throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
+                                   size, status));
+  }
+  return new CPUAllocation(ptr, size);
 }
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 3eb1416b0e..26d3643f4e 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -31,13 +31,19 @@ namespace allocation {
 //
 // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
 // an open-sourced allocator into Paddle.
+class CPUAllocator;
+class CPUAllocation : public Allocation {
+ public:
+  CPUAllocation(void* ptr, size_t size);
+};
+
 class CPUAllocator : public Allocator {
  public:
-  constexpr static size_t kAlignment = 4096UL;
+  constexpr static size_t kAlignment = 64u;
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 895a24a6a2..430bf0be98 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -23,14 +23,15 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
-void CUDAAllocator::FreeImpl(Allocation* allocation) {
+void CUDAAllocator::Free(Allocation* allocation) {
   platform::CUDADeviceGuard guard(place_.device);
-  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
+  auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
+  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
                     place_);
   PADDLE_ENFORCE(cudaFree(allocation->ptr()));
   delete allocation;
 }
-
 Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
@@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
         "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
         status, cudaGetErrorString(status)));
   }
-  return new Allocation(ptr, size, platform::Place(place_));
+  return new CUDAAllocation(ptr, size, platform::Place(place_));
 }
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 580a2d1df1..63726f5820 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -20,6 +20,13 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// CUDA System allocator and allocation.
+// Just a flag type.
+class CUDAAllocation : public Allocation {
+ public:
+  using Allocation::Allocation;
+};
+
 class CUDAAllocator : public Allocator {
  public:
   explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
@@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator {
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 0dc2de3746..514ac7883a 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 }
 
 #ifdef PADDLE_WITH_CUDA
-class GPUBuddyAllocatorList {
- public:
-  GPUBuddyAllocatorList()
-      : allocators_(platform::GetCUDADeviceCount()),
-        flags_(platform::GetCUDADeviceCount()) {
-    allocation::GPUMemMonitor.Initialize(allocators_.size());
-  }
+BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator **a_arr = nullptr;
+  static std::vector<int> devices;
+
+  std::call_once(init_flag, [gpu_id]() {
+    devices = platform::GetSelectedDevices();
+    int gpu_num = devices.size();
 
-  BuddyAllocator *Get(size_t dev_id) {
-    PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
-    std::call_once(flags_[dev_id], [this, dev_id] {
+    allocation::GPUMemMonitor.Initialize(devices.size());
+
+    a_arr = new BuddyAllocator *[gpu_num];
+    for (size_t i = 0; i < devices.size(); ++i) {
+      int dev_id = devices[i];
+      a_arr[i] = nullptr;
       platform::SetDeviceId(dev_id);
-      allocators_[dev_id] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::GPUAllocator(dev_id)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                        new detail::GPUAllocator(dev_id)),
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
 
       VLOG(10) << "\n\nNOTE:\n"
                << "You can set GFlags environment variable "
@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList {
                << FLAGS_initial_gpu_memory_in_mb
                << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
                << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
-    });
-    return allocators_[dev_id];
-  }
-
- private:
-  std::vector<BuddyAllocator *> allocators_;
-  std::vector<std::once_flag> flags_;
-};
+    }
+  });
 
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static GPUBuddyAllocatorList allocators;
   platform::SetDeviceId(gpu_id);
-  return allocators.Get(gpu_id);
+  auto pos = std::distance(devices.begin(),
+                           std::find(devices.begin(), devices.end(), gpu_id));
+  return a_arr[pos];
 }
 #endif
 
@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 #ifdef PADDLE_WITH_CUDA
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr && size > 0) {
+  if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
     size_t avail, total;
@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   return tmp_alloc;
 }
 
-void LegacyAllocator::FreeImpl(Allocation *allocation) {
+void LegacyAllocator::Free(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index 27cd42ea35..d9bdae153d 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
 
  protected:
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
 
  private:
   platform::Place place_;
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index c43099cc88..62d768c580 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -17,7 +17,6 @@
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
-
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -25,24 +24,26 @@ namespace allocation {
 bool LockedAllocator::IsAllocThreadSafe() const { return true; }
 
 LockedAllocator::LockedAllocator(
-    std::shared_ptr<Allocator> underlying_allocator)
+    std::unique_ptr<Allocator> &&underlying_allocator)
     : underlying_allocator_(std::move(underlying_allocator)) {
   PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
   if (!underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
 }
-
-void LockedAllocator::FreeImpl(Allocation *allocation) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  underlying_allocator_->Free(allocation);
+void LockedAllocator::Free(Allocation *allocation) {
+  {
+    platform::LockGuardPtr<std::mutex> guard(mtx_);
+    reinterpret_cast<AllocationWithUnderlying *>(allocation)
+        ->allocation_.reset();  // Destroy inner allocation
+  }
+  delete allocation;
 }
-
 Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
-  return underlying_allocator_->Allocate(size, attr).release();
+  return new AllocationWithUnderlying(
+      underlying_allocator_->Allocate(size, attr));
 }
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index b735ccef10..4967b9bb8d 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -24,15 +24,15 @@ namespace allocation {
 // A allocator to make underlying allocator thread safe.
 class LockedAllocator : public Allocator {
  public:
-  explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
+  explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::unique_ptr<std::mutex> mtx_;
 };
 
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
deleted file mode 100644
index 3334589a4b..0000000000
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
-
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
-  FLAGS_allocator_strategy = "naive_best_fit";
-
-  auto &instance = AllocatorFacade::Instance();
-  platform::Place place;
-  size_t size = 1024;
-
-  {
-    place = platform::CPUPlace();
-    size = 1024;
-    auto cpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(cpu_allocation, nullptr);
-    ASSERT_NE(cpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(cpu_allocation->place(), place);
-    ASSERT_EQ(cpu_allocation->size(), size);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    place = platform::CUDAPlace(0);
-    size = 1024;
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    // Allocate 2GB gpu memory
-    place = platform::CUDAPlace(0);
-    size = 2 * static_cast<size_t>(1 << 30);
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    place = platform::CUDAPinnedPlace();
-    size = (1 << 20);
-    auto cuda_pinned_allocation =
-        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
-    ASSERT_NE(cuda_pinned_allocation, nullptr);
-    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
-    ASSERT_EQ(cuda_pinned_allocation->place(), place);
-    ASSERT_GE(cuda_pinned_allocation->size(), size);
-  }
-#endif
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 5a3d817211..de81d12cca 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -20,15 +20,20 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
-void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
+void CPUPinnedAllocator::Free(Allocation *allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
   PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
   delete allocation;
 }
 Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
                                              Allocator::Attr attr) {
+  // PADDLE_ENFORCE_EQ(
+  //    attr, kCrossDevice,
+  //    "CPUPinnedAllocator should be used for Cross-Device Communication");
+
   void *ptr;
   PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
-  return new Allocation(ptr, size, platform::CUDAPinnedPlace());
+  return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index deeb55a8fb..42d0938f2a 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -20,12 +20,18 @@ namespace memory {
 namespace allocation {
 
 // Allocator uses `cudaHostAlloc`
+class CPUPinnedAllocation : public Allocation {
+ public:
+  CPUPinnedAllocation(void *ptr, size_t size)
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
+};
+
 class CPUPinnedAllocator : public Allocator {
  public:
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 7e888988f9..981705051b 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -18,15 +18,25 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-void RetryAllocator::FreeImpl(Allocation* allocation) {
+bool RetryAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+
+void RetryAllocator::Free(Allocation* allocation) {
   // Delete underlying allocation first.
-  underlying_allocator_->Free(allocation);
-  cv_.notify_all();
+  reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
+  {
+    // notify all waited allocators, they can try to allocate memory after free.
+    std::lock_guard<std::mutex> lock(mutex_);
+    cv_.notify_all();
+  }
+  delete allocation;
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   auto alloc_func = [&, this]() {
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   };
   // In fact, we can unify the code of allocation success and failure
   // But it would add lock even when allocation success at the first time
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 379f576d6e..6ab8ca8fbe 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -25,25 +25,32 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+class RetryAllocator;
+
 class RetryAllocator : public Allocator {
  public:
-  RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
+  RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
       : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
+    EnforceCheck();
+  }
+
+  bool IsAllocThreadSafe() const override;
+
+ private:
+  void EnforceCheck() {
     PADDLE_ENFORCE_NOT_NULL(
-        underlying_allocator_,
-        "UnderlyingAllocator of RetryAllocator must not be null");
+        underlying_allocator_.get(),
+        "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
     PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
                    "UnderlyingAllocator of RetryAllocator must be thread-safe");
   }
 
-  bool IsAllocThreadSafe() const override { return true; }
-
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::chrono::milliseconds retry_time_;
   std::mutex mutex_;
   std::condition_variable cv_;
@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator {
   // For debug, We can add an atomic integer to record how many memory sizes are
   // waited to allocate
   // std::atomic<size_t> waited_allocate_size_{0};
+
+  friend class RetryAllocation;
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
index 39743bcb10..cb2df1a029 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
 
 Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   if (size == 0) {
-    return new Allocation(nullptr, 0, place_);
+    return new ZeroSizeAllocation(place_);
   } else {
     return underlying_allocator_->Allocate(size, attr).release();
   }
 }
-
-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (allocation->size() == 0) {
-    delete allocation;
-  } else {
-    underlying_allocator_->Free(allocation);
-  }
-}
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h
index 08a7a06dbf..0f01dfcdf5 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -24,6 +24,12 @@ namespace allocation {
 // The allocator handles the request's size is zero. Allocator will always
 // return an allocation even the request size is zero. However, the
 // allocation.ptr() is nullptr
+class ZeroSizeAllocation : public Allocation {
+ public:
+  explicit ZeroSizeAllocation(const platform::Place& p)
+      : Allocation(nullptr, 0, p) {}
+};
+
 class ZeroSizeAllocator : public Allocator {
  public:
   ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator {
 
  protected:
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void FreeImpl(Allocation* allocation) override;
 
  private:
   std::shared_ptr<Allocator> underlying_allocator_;
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index ddde7baf4c..250efe70fd 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -30,31 +30,38 @@ namespace paddle {
 namespace platform {
 namespace alloc = memory::allocation;
 
+TemporaryAllocation::TemporaryAllocation(
+    alloc::AllocationPtr &&underlying_allocation)
+    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
+                 underlying_allocation->place()),
+      underlying_allocation_(std::move(underlying_allocation)) {}
+
 TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
-  temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
+  temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
 }
 
 bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
 
 void TemporaryAllocator::Release(const std::function<void()> &callback) {
-  std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations;
+  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
   {
     std::unique_lock<std::mutex> lock(mtx_);
     callback();
     t_allocations.swap(temp_mem_map_);
-    temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
+    temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
     wait_delete_mem_ = 0;
   }
 
-  alloc::AllocationDeleter deleter;
   for (auto tmp : *t_allocations) {
     VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
              << " size: " << tmp.second->size();
-    deleter(tmp.second);
+    delete tmp.second;
   }
 }
 
-void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
+void TemporaryAllocator::Free(alloc::Allocation *allocation) {
+  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
   if (platform::is_gpu_place(temp_allocation->place())) {
     PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
                    "The place should be the same.");
@@ -78,7 +85,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
   }
   VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
            << " size: " << temp_allocation->size();
-  alloc::AllocationDeleter()(temp_allocation);
+  delete temp_allocation;
 }
 
 size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
@@ -113,9 +120,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
   }
   // If not find the the available allocation, get allocation from
   // AllocatorFacadeInstance.
-  auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto raw_allocation =
+      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
   VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
-  return temp_mem.release();
+  return temp_mem;
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index 912d45eaf1..f8a43b889d 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -23,6 +23,14 @@
 namespace paddle {
 namespace platform {
 
+class TemporaryAllocation : public memory::allocation::Allocation {
+ public:
+  explicit TemporaryAllocation(
+      memory::allocation::AllocationPtr &&underlying_allocation);
+
+  memory::allocation::AllocationPtr underlying_allocation_;
+};
+
 /*! \brief the TemporaryAllocator is used to alloc the temporary allocation
  * which used by CUDA's async operation.
  *
@@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   void SetCallback(const std::function<void()> &callback);
 
  protected:
-  void FreeImpl(memory::allocation::Allocation *allocation) override;
+  void Free(memory::allocation::Allocation *allocation) override;
 
   memory::allocation::Allocation *AllocateImpl(
       size_t size, memory::allocation::Allocator::Attr attr) override;
@@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   platform::Place place_;
   // When the allocation is not held by any variable, it should be placed
   // to temp_mem_map immediately.
-  std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>>
-      temp_mem_map_{nullptr};
+  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
+      nullptr};
   std::mutex mtx_;
   size_t wait_delete_mem_{0};
   std::function<void()> callback_;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index dca40edf0b..7bf0896378 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) {
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<float>(place);
            })
-      .def("_clear", &Tensor::clear)
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 66b768665b..16bb3771f2 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) {
   Fprintf(std::cout, fmt, args...);
 }
 
-inline std::string HumanReadableSize(double f_size) {
+template <typename T>
+std::string HumanReadableSize(T size) {
   size_t i = 0;
+  double f_size = static_cast<double>(size);
   double orig = f_size;
   const std::vector<std::string> units(
       {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
-  while (f_size >= 1024) {
+  while (f_size > 1024) {
     f_size /= 1024;
     i++;
   }

From 5656fa9f7ca278aff7319485c0d289a4ffc2f9d0 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Thu, 28 Mar 2019 09:51:19 +0800
Subject: [PATCH 12/19] fix travis ci test=develop

---
 paddle/fluid/platform/temporary_allocator.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index 250efe70fd..d489ed5368 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <memory>
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 
 DEFINE_int64(limit_of_tmp_allocation, -1,

From 5ab56871386c883c3161191c85e1c7f03d51c9a1 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Thu, 28 Mar 2019 10:37:34 +0800
Subject: [PATCH 13/19] remove no necessary doc changes. test=develop

---
 python/paddle/fluid/framework.py | 200 ++++++++++++++++++++++++++++++-
 1 file changed, 198 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index a209f389f3..7abd2a23aa 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -627,6 +627,183 @@ class Variable(object):
         """
         self.error_clip = error_clip
 
+    def _slice_indices(self, slice, length):
+        """
+        Reference implementation for the slice.indices method.
+        """
+        # Compute step and length as integers.
+        step = 1 if slice.step is None else slice.step
+
+        # Raise ValueError for negative length or zero step.
+        if length < 0:
+            raise ValueError("length should not be negative")
+        if step == 0:
+            raise ValueError("slice step cannot be zero")
+
+        # Find lower and upper bounds for start and stop.
+        lower = -1 if step < 0 else 0
+        upper = length - 1 if step < 0 else length
+
+        # Compute start.
+        if slice.start is None:
+            start = upper if step < 0 else lower
+        else:
+            start = slice.start
+            start = max(start + length, lower) if start < 0 else min(start,
+                                                                     upper)
+
+        # Compute stop.
+        if slice.stop is None:
+            stop = lower if step < 0 else upper
+        else:
+            stop = slice.stop
+            stop = max(stop + length, lower) if stop < 0 else min(stop, upper)
+
+        return start, stop, step
+
+    def _detectEllipsis(self, item):
+        has_ellipsis = False
+        start = 0
+        end = len(self.shape)
+        for index, o in enumerate(item):
+            if o is Ellipsis:
+                if has_ellipsis:
+                    raise ValueError("Index can have one ellipsis only.")
+                has_ellipsis = True
+                start = index
+            else:
+                if has_ellipsis:
+                    end = index
+        return has_ellipsis, start, end
+
+    def _reconstructSliceinfo(self, item):
+        has_ellipsis, start, end = self._detectEllipsis(item)
+        if has_ellipsis:
+            newitem = []
+            for i in range(start):
+                newitem.append(item[i])
+            for i in range(start, end):
+                newitem.append(slice(None, None, None))
+            for i in range(end, len(item)):
+                newitem.append(item[i])
+            return newitem
+        else:
+            return None
+
+    def _detectContinuesSlice(self, item):
+        starts = []
+        ends = []
+        for index, o in enumerate(item):
+            if isinstance(o, int):
+                start = int(o)
+                if (index > 0 and index >= self.shape[index]) \
+                        or (index < 0 and (index + self.shape[index]) < 0):
+                    raise IndexError("invalid index")
+                start = max(start + self.shape[index], 0) if start < 0 else min(
+                    start, self.shape[index])
+                starts.append(start)
+                ends.append(start + 1)
+            elif isinstance(o, slice):
+                start, stop, step = self._slice_indices(o, self.shape[index])
+                if step == 1 or step == -1:
+                    starts.append(start)
+                    ends.append(stop)
+                else:
+                    return False, None
+            else:
+                raise IndexError("Valid index accept int or slice or ellipsis")
+        return True, [starts, ends]
+
+    def _cloneVar(self, copy=False):
+        if not copy:
+            return self.block.create_var(
+                name=unique_name.generate(".".join(self.name)),
+                dtype=self.dtype,
+                persistable=self.persistable,
+                stop_gradient=self._stop_gradient, )
+        else:
+            return self
+
+    def _sliceVar(self, axes, starts, ends):
+        new_var = self._cloneVar()
+        self.block.append_op(
+            type="slice",
+            inputs={'Input': [self]},
+            outputs={'Out': [new_var]},
+            attrs={'axes': axes,
+                   'starts': starts,
+                   'ends': ends})
+        return new_var
+
+    def _concatVar(self, inputs, axis):
+        new_var = self._cloneVar()
+        self.block.append_op(
+            type="concat",
+            inputs={'X': inputs},
+            outputs={'Out': [new_var]},
+            attrs={'axis': axis, })
+        return new_var
+
+    def _sliceAndConcatVar(self, item, axis):
+        if isinstance(item, slice):
+            if self.shape[axis] < 0:
+                return self._cloneVar(True)
+            start, stop, step = self._slice_indices(item, self.shape[axis])
+            if step == 1:
+                return self._sliceVar([axis], [start], [stop])
+            else:
+                vars = []
+                if step > 0:
+                    while start < stop:
+                        vars.append(
+                            self._sliceVar([axis], [start], [start + 1]))
+                        start += step
+                else:
+                    while start > stop:
+                        vars.append(
+                            self._sliceVar([axis], [start], [start + 1]))
+                        start += step
+                return self._concatVar(vars, axis)
+        elif isinstance(item, int):
+            if self.shape[axis] < 0:
+                return self._cloneVar(True)
+            index = int(item)
+            if (index > 0 and index >= self.shape[axis])\
+                    or (index < 0 and (index + self.shape[axis]) < 0):
+                raise IndexError("invalid index")
+            return self._sliceVar([axis], [index], [index + 1])
+        else:
+            raise IndexError("Valid index accept int or slice or tuple")
+
+    def __getitem__(self, item):
+        """
+        Slice the variable.
+
+        Args:
+            item(int/slice/tuple) : the index.
+
+        Returns:
+            Sliced variable
+        """
+        new_var = None
+        if isinstance(item, tuple):
+            if len(item) > len(self.shape):
+                raise IndexError("Too many indexes")
+            newitem = self._reconstructSliceinfo(item) or item
+            check, info = self._detectContinuesSlice(newitem)
+            if check:
+                starts = info[0]
+                ends = info[1]
+                axes = [i for i in range(len(starts))]
+                return self._sliceVar(axes, starts, ends)
+            else:
+                new_var = self
+                for index, o in enumerate(newitem):
+                    new_var = new_var._sliceAndConcatVar(o, index)
+        else:
+            new_var = self._sliceAndConcatVar(item, 0)
+        return new_var
+
 
 def get_all_op_protos():
     """
@@ -744,7 +921,7 @@ class Operator(object):
         if _in_imperative_mode():
             if type is None:
                 raise ValueError(
-                    "`type` to initilized an Operator can not be None.")
+                    "`type` to initialized an Operator can not be None.")
             self.iop = core.OpBase(type)
 
             # TODO(minqiyang): remove these lines after we take apart all
@@ -906,7 +1083,10 @@ class Operator(object):
 
     @property
     def type(self):
-        return self.desc.type()
+        if _in_imperative_mode():
+            return self.iop.type
+        else:
+            return self.desc.type()
 
     def input(self, name):
         """
@@ -1022,6 +1202,9 @@ class Operator(object):
         """
         self._update_desc_attr(name, val)
 
+    def _remove_attr(self, name):
+        self.desc.remove_attr(name)
+
     def _update_desc_attr(self, name, val):
         """
         Update the value of desc's attribute by attribute's name.
@@ -2515,6 +2698,10 @@ class Program(object):
         self._trainers_endpoints = []
         # the distributed lookup table names
         self._distributed_lookup_table = None
+
+        # use Deep gradient comrepssion or not
+        self._enable_dgc = False
+
         # @deprecated(the python memory optimize transpiler is deprecated)
         # whether the program is optimized by memory_optimize_transpiler
         self.__is_mem_optimized = False
@@ -2565,6 +2752,15 @@ class Program(object):
     def set_op_role_var(self, var_name):
         self._op_role_var = [var_name]
 
+    @contextlib.contextmanager
+    def _backward_role_guard(self):
+        tmp_role = self._current_role
+
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.Backward
+        yield
+        self._current_role = tmp_role
+
     @signature_safe_contextmanager
     def _optimized_guard(self, param_and_grads):
         """

From ecc3088df830f8574cef7d4f859d93946e93be5c Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Thu, 28 Mar 2019 10:49:26 +0800
Subject: [PATCH 14/19] Fix saving in quantization strategy. (#16474)

test=develop
---
 .../fluid/contrib/slim/quantization/quantization_strategy.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
index 6812b4c633..c4b02166ab 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -152,7 +152,7 @@ class QuantizationStrategy(Strategy):
                 ]
 
             if self.save_in_nodes == None:
-                in_vars = list(context.eval_graph.out_nodes.values())
+                in_vars = list(context.eval_graph.in_nodes.values())
             else:
                 in_vars = self.save_in_nodes
 

From 59f75ec76e8fea156e97bea8739bb3bd4e27bf87 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Thu, 28 Mar 2019 11:51:22 +0800
Subject: [PATCH 15/19] Make unitest of fsp op faster and more stable. (#16502)

* Make unitest of fsp op faster and more stable.
test=develop

* Skip unitest of fsp op.
test=develop
---
 python/paddle/fluid/tests/unittests/test_fsp_op.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py
index 6ad7418447..01991f4d36 100644
--- a/python/paddle/fluid/tests/unittests/test_fsp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py
@@ -39,19 +39,21 @@ class TestFSPOp(OpTest):
         self.op_type = "fsp"
         self.initTestCase()
 
-        feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32')
-        feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32')
+        feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float64')
+        feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float64')
 
         self.inputs = {'X': feature_map_0, 'Y': feature_map_1}
         self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)}
 
     def initTestCase(self):
-        self.a_shape = (2, 16, 32, 31)
-        self.b_shape = (2, 28, 32, 31)
+        self.a_shape = (2, 3, 5, 6)
+        self.b_shape = (2, 4, 5, 6)
 
+    @unittest.skip("Disable temporarily.")
     def test_check_output(self):
         self.check_output()
 
+    @unittest.skip("Disable temporarily.")
     def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
 

From ed61d67c737590ebf2819ca9770a9a6d4e294880 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Wed, 27 Mar 2019 23:37:17 -0500
Subject: [PATCH 16/19] Fix the interface of Pass::Apply (#16484)

* modify the interface of Pass::Allay
test=develop

* Polish code
test=develop

* Fix Travis CI
test=develop

* fix Pass::Apply interface
test=develop

* Fix Travis CI
test=develop
---
 .../framework/details/all_reduce_deps_pass.cc |  5 +-
 .../framework/details/all_reduce_deps_pass.h  |  3 +-
 .../alloc_continuous_space_for_grad_pass.cc   |  7 +--
 .../fluid/framework/details/build_strategy.cc | 17 +++----
 .../fluid/framework/details/build_strategy.h  | 15 +++---
 .../framework/details/eager_deletion_pass.cc  |  8 ++--
 .../details/fuse_all_reduce_op_pass.cc        |  6 +--
 .../framework/details/inplace_op_pass.cc      |  9 ++--
 .../fluid/framework/details/inplace_op_pass.h |  3 +-
 .../framework/details/memory_optimize_pass.cc |  7 +--
 .../framework/details/memory_optimize_pass.h  |  4 +-
 .../modify_op_lock_and_record_event_pass.cc   |  4 +-
 .../modify_op_lock_and_record_event_pass.h    |  3 +-
 .../details/multi_devices_graph_check_pass.cc |  6 +--
 .../details/multi_devices_graph_pass.cc       |  4 +-
 .../details/multi_devices_graph_pass.h        |  3 +-
 .../details/multi_devices_graph_print_pass.cc |  2 +
 .../details/multi_devices_graph_print_pass.h  |  5 +-
 .../details/parallel_ssa_graph_executor.cc    |  2 +-
 .../framework/details/reference_count_pass.cc |  5 +-
 .../framework/details/reference_count_pass.h  |  3 +-
 .../details/sequential_execution_pass.cc      |  4 +-
 .../details/sequential_execution_pass.h       |  3 +-
 .../details/while_op_eager_deletion_pass.cc   |  4 +-
 ...anakin_fillconstant_elementwisemul_fuse.cc | 10 ++--
 .../anakin_fillconstant_elementwisemul_fuse.h |  3 +-
 .../framework/ir/attention_lstm_fuse_pass.cc  |  9 ++--
 .../framework/ir/attention_lstm_fuse_pass.h   |  3 +-
 .../ir/conv_affine_channel_fuse_pass.cc       | 24 ++++------
 .../ir/conv_affine_channel_fuse_pass.h        |  6 +--
 .../fluid/framework/ir/conv_bn_fuse_pass.cc   | 31 +++++-------
 paddle/fluid/framework/ir/conv_bn_fuse_pass.h |  6 +--
 .../ir/conv_elementwise_add2_act_fuse.cc      |  6 +--
 .../ir/conv_elementwise_add2_act_fuse_pass.cc | 13 ++---
 .../ir/conv_elementwise_add2_act_fuse_pass.h  |  3 +-
 .../ir/conv_elementwise_add_act_fuse_pass.cc  | 12 ++---
 .../ir/conv_elementwise_add_act_fuse_pass.h   |  3 +-
 .../ir/conv_elementwise_add_fuse_pass.cc      | 10 ++--
 .../ir/conv_elementwise_add_fuse_pass.h       |  3 +-
 .../ir/embedding_fc_lstm_fuse_pass.cc         | 14 +++---
 .../ir/embedding_fc_lstm_fuse_pass.h          |  3 +-
 paddle/fluid/framework/ir/fc_fuse_pass.cc     | 13 +++--
 paddle/fluid/framework/ir/fc_fuse_pass.h      |  3 +-
 .../fluid/framework/ir/fc_fuse_pass_tester.cc |  2 +-
 paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 22 ++++-----
 paddle/fluid/framework/ir/fc_gru_fuse_pass.h  |  6 +--
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc   | 21 ++++----
 paddle/fluid/framework/ir/fc_lstm_fuse_pass.h |  6 +--
 .../framework/ir/fuse_elewise_add_act_pass.cc | 48 +++++++++----------
 .../framework/ir/fuse_elewise_add_act_pass.h  | 20 ++++----
 .../ir/fuse_relu_depthwise_conv_pass.cc       | 24 +++++-----
 .../ir/fuse_relu_depthwise_conv_pass.h        |  6 +--
 .../framework/ir/graph_to_program_pass.cc     |  6 +--
 .../framework/ir/graph_to_program_pass.h      |  2 +-
 .../ir/graph_to_program_pass_test.cc          |  4 +-
 paddle/fluid/framework/ir/graph_viz_pass.cc   | 13 ++---
 paddle/fluid/framework/ir/graph_viz_pass.h    |  4 +-
 .../ir/identity_scale_op_clean_pass.cc        |  8 ++--
 .../ir/identity_scale_op_clean_pass.h         |  3 +-
 .../framework/ir/infer_clean_graph_pass.cc    | 10 ++--
 paddle/fluid/framework/ir/is_test_pass.cc     |  4 +-
 paddle/fluid/framework/ir/is_test_pass.h      |  3 +-
 .../fluid/framework/ir/is_test_pass_tester.cc |  2 +-
 .../framework/ir/lock_free_optimize_pass.cc   | 11 ++---
 .../framework/ir/lock_free_optimize_pass.h    |  3 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc   | 14 +++---
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h    |  3 +-
 .../conv_bias_mkldnn_fuse_pass_tester.cc      |  4 +-
 .../conv_elementwise_add_mkldnn_fuse_pass.cc  | 12 ++---
 .../conv_elementwise_add_mkldnn_fuse_pass.h   |  5 +-
 ...elementwise_add_mkldnn_fuse_pass_tester.cc |  4 +-
 .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc   | 12 ++---
 .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.h    |  3 +-
 .../conv_relu_mkldnn_fuse_pass_tester.cc      |  2 +-
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  | 15 +++---
 .../framework/ir/mkldnn/cpu_quantize_pass.h   |  3 +-
 .../ir/mkldnn/cpu_quantize_pass_tester.cc     |  2 +-
 .../ir/mkldnn/cpu_quantize_placement_pass.cc  |  4 +-
 .../ir/mkldnn/cpu_quantize_placement_pass.h   |  3 +-
 .../cpu_quantize_placement_pass_tester.cc     |  2 +-
 .../ir/mkldnn/cpu_quantize_squash_pass.cc     | 13 ++---
 .../ir/mkldnn/cpu_quantize_squash_pass.h      |  3 +-
 .../mkldnn/cpu_quantize_squash_pass_tester.cc |  2 +-
 .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc   | 10 ++--
 .../ir/mkldnn/depthwise_conv_mkldnn_pass.h    |  3 +-
 .../depthwise_conv_mkldnn_pass_tester.cc      |  2 +-
 .../ir/mkldnn/mkldnn_placement_pass.cc        |  5 +-
 .../ir/mkldnn/mkldnn_placement_pass.h         |  3 +-
 .../ir/mkldnn/mkldnn_placement_pass_tester.cc |  2 +-
 .../framework/ir/multi_batch_merge_pass.cc    |  7 ++-
 .../framework/ir/multi_batch_merge_pass.h     |  2 +-
 paddle/fluid/framework/ir/pass.cc             | 14 +++---
 paddle/fluid/framework/ir/pass.h              |  9 ++--
 paddle/fluid/framework/ir/pass_test.cc        | 15 +++---
 .../ir/repeated_fc_relu_fuse_pass.cc          | 10 ++--
 .../framework/ir/repeated_fc_relu_fuse_pass.h |  3 +-
 .../ir/runtime_context_cache_pass.cc          |  4 +-
 .../framework/ir/runtime_context_cache_pass.h |  3 +-
 .../framework/ir/seq_concat_fc_fuse_pass.cc   | 15 +++---
 .../framework/ir/seq_concat_fc_fuse_pass.h    |  3 +-
 .../ir/seqconv_eltadd_relu_fuse_pass.cc       | 10 ++--
 .../ir/seqconv_eltadd_relu_fuse_pass.h        |  3 +-
 .../framework/ir/seqpool_concat_fuse_pass.cc  | 10 ++--
 .../framework/ir/seqpool_concat_fuse_pass.h   |  3 +-
 .../ir/seqpool_concat_fuse_pass_tester.cc     |  2 +-
 .../simplify_anakin_detection_pattern_pass.cc | 11 ++---
 .../simplify_anakin_detection_pattern_pass.h  |  3 +-
 .../framework/ir/squared_mat_sub_fuse_pass.cc | 10 ++--
 .../framework/ir/squared_mat_sub_fuse_pass.h  |  3 +-
 .../framework/ir/sync_batch_norm_pass.cc      |  4 +-
 .../fluid/framework/ir/sync_batch_norm_pass.h |  3 +-
 .../ir/sync_batch_norm_pass_tester.cc         |  2 +-
 .../ir/transpose_flatten_concat_fuse_pass.cc  | 10 ++--
 .../ir/transpose_flatten_concat_fuse_pass.h   |  3 +-
 paddle/fluid/framework/parallel_executor.cc   | 40 ++++++----------
 .../inference/analysis/ir_pass_manager.cc     |  4 +-
 .../ir_passes/anakin_subgraph_pass.cc         |  6 +--
 .../analysis/ir_passes/anakin_subgraph_pass.h |  3 +-
 .../ir_passes/tensorrt_subgraph_pass.cc       | 17 +++----
 .../ir_passes/tensorrt_subgraph_pass.h        |  3 +-
 .../passes/ir_graph_to_program_pass.cc        |  4 +-
 paddle/fluid/pybind/pybind.cc                 |  4 +-
 122 files changed, 370 insertions(+), 539 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index 98a74d630c..d93c84606d 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -42,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) {
   return nullptr;
 }
 
-std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
   auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
 
   // get vars order
@@ -131,8 +130,6 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
     VLOG(10) << "pre_op:" << pre_op->DebugString()
              << ", op:" << op->DebugString();
   }
-
-  return graph;
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
index e8b9108981..4ed3736587 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h
@@ -24,8 +24,7 @@ namespace details {
 // TODO(gongwb): overlap allreduce with backward computation.
 class AllReduceDepsPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
index fbc8bbf56b..e195e93fb8 100644
--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
@@ -46,8 +46,7 @@ static framework::proto::VarType::Type kDefaultDtype =
 
 class AllocContinuousSpaceForGradPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override {
+  void ApplyImpl(ir::Graph *graph) const override {
     ir::Graph &result = *graph;
 
     auto &places = Get<const std::vector<platform::Place>>(kPlaces);
@@ -65,7 +64,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
 
     if (params_grads.size() == 0) {
       VLOG(10) << "Doesn't find gradients";
-      return std::move(graph);
+      return;
     }
 
     std::unordered_map<std::string, ir::Node *> vars;
@@ -124,8 +123,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
 
     InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
                                       fused_var_name, params_grads);
-
-    return std::move(graph);
   }
 
   template <typename AttrType>
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 5d9db23753..078403f30f 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -204,15 +204,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
   return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
 }
 
-std::unique_ptr<ir::Graph> BuildStrategy::Apply(
-    std::unique_ptr<ir::Graph> graph,
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
-    const size_t &nranks,
+ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
+                                const std::vector<platform::Place> &places,
+                                const std::string &loss_var_name,
+                                const std::vector<Scope *> &local_scopes,
+                                const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
+                                const bool use_cuda,
+                                platform::NCCLContextMap *nccl_ctxs) const {
 #else
-    const bool use_cuda) const {
+                                const bool use_cuda) const {
 #endif
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
@@ -265,7 +266,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       }
     }
     VLOG(3) << "Start Apply Pass " << pass->Type();
-    graph = pass->Apply(std::move(graph));
+    graph = pass->Apply(graph);
     VLOG(3) << "Finish Apply Pass " << pass->Type();
   }
   return graph;
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 4b599fb914..9587a6f0f9 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -120,16 +120,15 @@ struct BuildStrategy {
 
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
-  std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
-                                   const std::vector<platform::Place> &places,
-                                   const std::string &loss_var_name,
-                                   const std::vector<Scope *> &local_scopes,
-                                   const size_t &nranks,
+  ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places,
+                   const std::string &loss_var_name,
+                   const std::vector<Scope *> &local_scopes,
+                   const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                                   const bool use_cuda,
-                                   platform::NCCLContextMap *nccl_ctxs) const;
+                   const bool use_cuda,
+                   platform::NCCLContextMap *nccl_ctxs) const;
 #else
-                                   const bool use_cuda) const;
+                   const bool use_cuda) const;
 #endif
 
   // If set true, ParallelExecutor would build the main_program into multiple
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc
index a6baa26134..622a59b4c2 100644
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -170,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars(
 
 class EagerDeletionPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph *graph) const override;
 };
 
-std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
   auto &ref_cnts =
       Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
   PADDLE_ENFORCE(ref_cnts.empty(),
@@ -240,7 +238,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
 
   auto while_op_eager_deletion_pass =
       ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
-  return while_op_eager_deletion_pass->Apply(std::move(graph));
+  while_op_eager_deletion_pass->Apply(graph);
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
index f226491c9f..31efd78ad3 100644
--- a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
@@ -28,8 +28,7 @@ namespace details {
 
 class FuseAllReduceOpPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override {
+  void ApplyImpl(ir::Graph *graph) const override {
     ir::Graph &result = *graph;
 
     auto &places = Get<const std::vector<platform::Place>>(kPlaces);
@@ -71,7 +70,7 @@ class FuseAllReduceOpPass : public ir::Pass {
 
     VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
     if (all_reduce_ops.size() == 0) {
-      return std::move(graph);
+      return;
     }
 
     PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
@@ -99,7 +98,6 @@ class FuseAllReduceOpPass : public ir::Pass {
                            group_all_reduce_ops, &result);
 #endif
     }
-    return std::move(graph);
   }
 
   void InsertFusedAllReduce(const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 88f26b4161..afbda33b06 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -144,10 +144,9 @@ void InplacePass::InitSSAGraphNodes() const {
   }
 }
 
-std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void InplacePass::ApplyImpl(ir::Graph* graph) const {
   var_nodes_.clear();
-  view_.Build(graph.get());
+  view_.Build(graph);
   InitSSAGraphNodes();
 
   auto cnt = 0;
@@ -155,11 +154,9 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
     VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
     if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
       continue;
-    TryInplaceOpInputOutput(op, graph.get());
+    TryInplaceOpInputOutput(op, graph);
   }
   // graph->ResolveHazard(var_nodes_);
-
-  return graph;
 }
 
 void InplacePass::InplaceModifyDesc(const std::string& var,
diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h
index 01964ba8fc..fbec973dda 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.h
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -69,8 +69,7 @@ class InplacePass : public ir::Pass {
   InplacePass();
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   void InitSSAGraphNodes() const;
 
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 80720af32d..ddaef20602 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -44,8 +44,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
   auto nodes = graph->Nodes();
   CollectSkipVarsSet(nodes);
 
@@ -113,7 +112,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
 
           cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
           RenameVarInGraphDesc(var_name, cache_name, idx);
-          RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+          RenameVarInGraphNode(var_name, cache_name, idx, graph);
           pool_.Erase(cache_name);
         }
       }
@@ -128,8 +127,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     }
   }
   graph->ResolveHazard(var_nodes_);
-
-  return graph;
 }
 
 void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index 593ffc10fc..ce94890b38 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -21,6 +21,7 @@
 #include <set>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -35,8 +36,7 @@ namespace details {
 
 class MemoryOptimizePass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   // fill the variable map(var_nodes) by version.
   void InitSSAGraphNodes() const;
 
diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
index 67aad9f94f..ae363f9639 100644
--- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
+++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
@@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle(
   return true;
 }
 
-std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> ir_graph) const {
+void ModifyOpLockAndRecordEventPass::ApplyImpl(ir::Graph *ir_graph) const {
   auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph);
   OpGraphView graph_view(all_ops);
   for (auto &op : all_ops) {
@@ -49,7 +48,6 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
                << compute_op->DebugString();
     }
   }
-  return ir_graph;
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h
index b54e1b318b..54d52d6240 100644
--- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h
+++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h
@@ -23,8 +23,7 @@ namespace details {
 
 class ModifyOpLockAndRecordEventPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
index a4bb1e26d9..9859b04dec 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -23,10 +23,8 @@ namespace details {
 
 class SSAGraghBuilderWithChecker : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override {
-    PADDLE_ENFORCE(IsValidGraph(graph.get()));
-    return graph;
+  void ApplyImpl(ir::Graph *graph) const override {
+    PADDLE_ENFORCE(IsValidGraph(graph));
   }
 
   bool IsValidGraph(const ir::Graph *graph) const {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 8c61684c9c..f80a098bfa 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -153,8 +153,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 }
 
-std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
   Init();
   CheckGraph(*graph);
   std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
@@ -236,7 +235,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
   AddOutputToLeafOps(&result);
 
   result.Erase(kGraphOps);
-  return graph;
 }
 
 void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 8bfd7b9bf8..884089df38 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -36,8 +36,7 @@ namespace details {
 
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph *graph) const override;
 
   virtual void Init() const;
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
index e82eb104fa..34c38ea81a 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
index b06c87a5c1..6d57d75e8a 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
@@ -17,6 +17,7 @@
 #include <glog/logging.h>
 #include <fstream>
 #include <iosfwd>
+#include <memory>
 #include <ostream>
 #include <string>
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 
 class SSAGraghBuilderWithPrinter : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override {
+  void ApplyImpl(ir::Graph* graph) const override {
     std::unique_ptr<std::ostream> fout(
         new std::ofstream(Get<std::string>(kGraphvizPath)));
     PADDLE_ENFORCE(fout->good());
     Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
-    return graph;
   }
 };
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 2afac32437..137e0dd770 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
   auto seq_allreduce_pass =
       ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
   for (size_t i = 0; i < graphs_.size(); ++i) {
-    graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
+    graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release()));
   }
 
   // set the correct size of thread pool to each device.
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index c218e55b70..25337872c1 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -266,8 +266,7 @@ static bool ShrinkNoNeedBufferVarOpDependency(
   }
 }
 
-std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
   auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
   auto &last_live_ops_of_vars =
       Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
@@ -342,8 +341,6 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       // Just skip this corner case
     }
   }
-
-  return graph;
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h
index bcbef02735..7bb01ee616 100644
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
@@ -23,8 +23,7 @@ namespace details {
 
 class ReferenceCountPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
index 0b53a76e78..839f8dc43e 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
          op1->Outputs() == op2->Outputs();
 }
 
-std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void SequentialExecutionPass::ApplyImpl(ir::Graph *graph) const {
   // FIXME(zjl): Insert dependencies between some distributed ops may cause
   // the multi_devices_graph_pass fails. So we skip these ops here.
   // Indeed, maybe we should not insert dependencies between these ops
@@ -98,7 +97,6 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
     VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
              << " and " << op_node_list[i]->Name();
   }
-  return graph;
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h
index ea3034877f..7d6a4f4cc5 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.h
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@@ -23,8 +23,7 @@ namespace details {
 
 class SequentialExecutionPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc
index fd6b6dd227..8f7c99f12a 100644
--- a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc
@@ -23,8 +23,7 @@ namespace details {
 
 class WhileOpEagerDeletionPass : public ir::Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override {
+  void ApplyImpl(ir::Graph *graph) const override {
     auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
 
     // Find all while_op and while_grad_op
@@ -50,7 +49,6 @@ class WhileOpEagerDeletionPass : public ir::Pass {
       operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
           while_ops, while_grad_ops);
     }
-    return graph;
   }
 };
 
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
index 83b0da0c01..39077f6420 100644
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
@@ -29,10 +29,9 @@ namespace ir {
   GET_IR_NODE(elementwise_mul);   \
   GET_IR_NODE(elementwise_mul_out);
 
-std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
@@ -69,12 +68,11 @@ std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
     IR_NODE_LINK_TO(scale_op, elementwise_mul_out);  // Output
 
     // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(),
+    GraphSafeRemoveNodes(graph,
                          {fill_constant, fill_constant_out, elementwise_mul});
   };
 
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
index fa95143d3a..14c07c5884 100644
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
@@ -26,8 +26,7 @@ class AnakinFillconstantElementwisemulFuse : public FusePassBase {
   virtual ~AnakinFillconstantElementwisemulFuse() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index a9897e0bb8..5a82d7927f 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -253,8 +254,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
 
 // Parameters
 
-std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
   PDPattern external_pattern, subblock_pattern;
 
   // Use the following variables to tell whether this model is RNN1.
@@ -269,12 +269,11 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
     }
   }
   if (count < specified_vars.size()) {
-    return graph;
+    return;
   }
 
   // Continue to fuse.
-  FindWhileOp(graph.get());
-  return graph;
+  FindWhileOp(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
index 39b0585d3a..47ed9f0393 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -22,8 +22,7 @@ namespace ir {
 
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index a7bfb8cf1e..fecc159ade 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -77,10 +77,9 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
   weights_array_2d.colwise() *= scale_array;
 }
 
-std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
@@ -139,7 +138,7 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
     desc.SetAttr("axis", 1);
     auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
 
-    GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
+    GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
 
     IR_NODE_LINK_TO(conv_out, eltwise_op);
     IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
@@ -147,16 +146,14 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
     found_conv_ac_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_conv_ac_count);
-  return graph;
 }
 
-std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
@@ -199,7 +196,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
     eltwise->Op()->SetAttr("axis", 1);
     eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
 
-    GraphSafeRemoveNodes(graph.get(),
+    GraphSafeRemoveNodes(graph,
                          {ac_scale, ac_bias, affine_channel, eltwise_out});
 
     IR_NODE_LINK_TO(eltwise, ac_out);
@@ -207,9 +204,8 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
     found_conv_ac_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
   AddStatis(found_conv_ac_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
index 8c3c8b56c0..d607020a47 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@@ -31,8 +31,7 @@ class ConvAffineChannelFusePass : public FusePassBase {
   virtual ~ConvAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph*) const override;
   const std::string name_scope_{"conv_affine_channel_fuse"};
 };
 
@@ -41,8 +40,7 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph*) const override;
   const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 04765dd144..876a999645 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -101,10 +101,9 @@ void recompute_bias_and_weights(const Scope* scope,
   weights_array_2d.colwise() *= variance_array;
 }
 
-std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
@@ -187,7 +186,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
                             std::vector<std::string>({bn_out->Name()}));
 
       GraphSafeRemoveNodes(
-          graph.get(),
+          graph,
           {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
            bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
 
@@ -203,10 +202,9 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
       desc.SetAttr("axis", 1);
       auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
 
-      GraphSafeRemoveNodes(
-          graph.get(),
-          {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
-           bn_variance_out, bn_saved_mean, bn_saved_variance});
+      GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance,
+                                   batch_norm, bn_mean_out, bn_variance_out,
+                                   bn_saved_mean, bn_saved_variance});
 
       IR_NODE_LINK_TO(conv_out, eltwise_op);
       IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
@@ -215,16 +213,14 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
     }
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_conv_bn_count);
-  return graph;
 }
 
-std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
@@ -274,7 +270,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
     eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
 
     GraphSafeRemoveNodes(
-        graph.get(),
+        graph,
         {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
          bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
 
@@ -283,10 +279,9 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
     found_conv_bn_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_conv_bn_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index cf425a2730..837a48ed73 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -31,8 +31,7 @@ class ConvBNFusePass : public FusePassBase {
   virtual ~ConvBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   const std::string name_scope_{"conv_bn_fuse"};
 };
 
@@ -41,8 +40,7 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
index 6e9905b7ec..99bc5fe8c5 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
@@ -50,10 +50,9 @@ framework::proto::OpDesc PrepareOpDesc(
   return *desc.Proto();
 }
 
-std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "conv_elementwise_add_act_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
@@ -95,7 +94,6 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
                           elementwise_add_out});
   };
   gpd(graph.get(), handler);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
index c6121777e8..b4d6f683ce 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -51,10 +51,9 @@ framework::proto::OpDesc PrepareOpDesc(
   return *desc.Proto();
 }
 
-std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "conv_elementwise_add2_act_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
@@ -92,12 +91,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
 
     // Delete the unneeded nodes.
     GraphSafeRemoveNodes(
-        graph.get(),
-        {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
-         elementwise_add_out, elementwise_add_out_1, act_op});
+        graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
+                elementwise_add_out, elementwise_add_out_1, act_op});
   };
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
index 9259a4ac5c..ea9e465d8d 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -25,8 +25,7 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAdd2ActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index fe3b4fca79..ba0a2fb964 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -48,10 +48,9 @@ framework::proto::OpDesc PrepareOpDesc(
   return *desc.Proto();
 }
 
-std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "conv_elementwise_add_act_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
@@ -88,12 +87,11 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
     IR_NODE_LINK_TO(new_conv_op, act_out);               // Output
 
     // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
-                                       elementwise_add_out, act_op});
+    GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op,
+                                 elementwise_add_out, act_op});
   };
 
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
index 9c0b50f155..8b34c3551d 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -25,8 +25,7 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
index 476c9dbc35..8c491d4f58 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -30,10 +30,9 @@ namespace ir {
   GET_IR_NODE(elementwise_add_in_y); \
   GET_IR_NODE(elementwise_add_out);
 
-std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "conv_elementwise_add_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
@@ -76,11 +75,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
     IR_NODE_LINK_TO(new_conv_op, elementwise_add_out);   // Output
 
     // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
+    GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op});
   };
 
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
index bf43bd5ce2..66a562cdd1 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -25,8 +25,7 @@ class ConvElementwiseAddFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index ba11f19c92..3a6bbe65b3 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -15,6 +15,8 @@
 #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
 #include <algorithm>
 #include <string>
+#include <unordered_set>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 #include "paddle/fluid/operators/math/blas.h"
@@ -201,7 +203,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
       // Remove unneeded nodes.
       // TODO(jczaja): Proper removing of lookup table
       std::unordered_set<const Node*> marked_nodes(
-          //{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
+          // {lookup_table, mul, lstm, elementwise_add, fc_bias, W});
           {mul, lstm, elementwise_add, fc_bias});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
@@ -224,15 +226,13 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
-                                 true /*with_fc_bias*/);
+  int fusion_count =
+      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
 
   AddStatis(fusion_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
index fde2a0a4ee..65cb443972 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -32,8 +32,7 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
   virtual ~EmbeddingFCLSTMFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"embedding_fc_lstm_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 12b31da010..ca008763bf 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -22,10 +23,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("fc_fuse", graph.get());
+void FCFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("fc_fuse", graph);
 
   std::unordered_set<Node*> nodes2delete;
 
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
     desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
     desc.SetType("fc");
     auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
+    GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out});
 
     PADDLE_ENFORCE(subgraph.count(x));
     IR_NODE_LINK_TO(subgraph.at(x), fc_node);
@@ -72,10 +72,9 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
     found_fc_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_fc_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
index 783a052edc..0a0fcd2da8 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -31,8 +31,7 @@ class FCFusePass : public FusePassBase {
   virtual ~FCFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index 4e1e4e27f9..affe506910 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -73,7 +73,7 @@ TEST(FCFusePass, basic) {
 
   int pre_nodes = graph->Nodes().size();
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   int after_nodes = graph->Nodes().size();
 
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
index a902b0b50c..5f660c6d36 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
@@ -39,7 +40,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   // Create New OpDesc
   auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
                          Node* bias, Node* hidden, Node* fc_bias) {
-
     OpDesc op_desc;
     op_desc.SetType("fusion_gru");
 
@@ -155,26 +155,22 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
-                                 false /*with_fc_bias*/);
+  int fusion_count =
+      BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
 
   AddStatis(fusion_count);
-  return graph;
 }
 
-std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
-                                 true /*with_fc_bias*/);
+  int fusion_count =
+      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
 
   AddStatis(fusion_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
index e359a32894..e11cdac7ea 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -30,8 +30,7 @@ class FCGRUFusePass : public FusePassBase {
   virtual ~FCGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"fc_gru_fuse"};
 };
@@ -42,8 +41,7 @@ class MulGRUFusePass : public FusePassBase {
   virtual ~MulGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   const std::string name_scope_{"fc_nobias_gru_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index f5c2864865..babeba9614 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
@@ -157,26 +158,22 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
-                                 false /*with_fc_bias*/);
+  int fusion_count =
+      BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
 
   AddStatis(fusion_count);
-  return graph;
 }
 
-std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
-                                 true /*with_fc_bias*/);
+  int fusion_count =
+      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
 
   AddStatis(fusion_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 21482615a6..5dea7c91a8 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -32,8 +32,7 @@ class FCLstmFusePass : public FusePassBase {
   virtual ~FCLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"fc_lstm_fuse"};
 };
@@ -43,8 +42,7 @@ class MulLstmFusePass : public FusePassBase {
   virtual ~MulLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
index 648acc4a75..bd49673168 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -15,6 +15,8 @@
 #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
 #include <algorithm>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -23,29 +25,25 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> FuseElewiseAddActPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const {
   std::unordered_set<std::string> act_types = {"relu", "scale"};
-  graph = FuseActElewiseAdd(std::move(graph), act_types);
-  graph = FuseElewiseAddAct(std::move(graph), act_types);
+  graph = FuseActElewiseAdd(graph, act_types);
+  graph = FuseElewiseAddAct(graph, act_types);
   // backward
   {
     std::unordered_set<std::string> in_place_act_types = {"relu_grad"};
-    graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types);
+    graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types);
   }
 
   // Remove the removable intermediate_out.
-  RemoveIntermediateOut(graph.get());
-
-  return graph;
+  RemoveIntermediateOut(graph);
 }
 
 // ele_add(x, act(y))
-std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
-    std::unique_ptr<ir::Graph> graph,
-    const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("elewise_add_act", graph.get());
+ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("elewise_add_act", graph);
 
   GraphPatternDetector gpd;
   auto *x = gpd.mutable_pattern()
@@ -86,18 +84,17 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
     found_elewise_add_act_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_elewise_add_act_count);
   return graph;
 }
 
 // act(ele_add(x,y))
-std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
-    std::unique_ptr<ir::Graph> graph,
-    const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("act_elewise_add", graph.get());
+ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("act_elewise_add", graph);
 
   GraphPatternDetector gpd;
   auto *x = gpd.mutable_pattern()
@@ -137,7 +134,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
     found_elewise_add_act_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_elewise_add_act_count);
   return graph;
@@ -146,11 +143,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
 // the backward of act(ele_add(x,y))
 // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
 // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
-std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
-    std::unique_ptr<ir::Graph> graph,
-    const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("elewise_add_act_grad", graph.get());
+ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("elewise_add_act_grad", graph);
 
   GraphPatternDetector gpd;
   auto *d_act_out = gpd.mutable_pattern()
@@ -217,7 +213,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
     found_elewise_add_act_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_elewise_add_act_count);
   return graph;
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
index 0fee527447..dc73f1fda0 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
@@ -14,6 +14,8 @@
 #pragma once
 
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -32,20 +34,16 @@ class FuseElewiseAddActPass : public FusePassBase {
   virtual ~FuseElewiseAddActPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph *graph) const override;
 
-  std::unique_ptr<ir::Graph> FuseElewiseAddAct(
-      std::unique_ptr<ir::Graph> graph,
-      const std::unordered_set<std::string> &act_types) const;
+  ir::Graph *FuseElewiseAddAct(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
 
-  std::unique_ptr<ir::Graph> FuseActElewiseAdd(
-      std::unique_ptr<ir::Graph> graph,
-      const std::unordered_set<std::string> &act_types) const;
+  ir::Graph *FuseActElewiseAdd(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
 
-  std::unique_ptr<ir::Graph> FuseElewiseAddActInplaceGrad(
-      std::unique_ptr<ir::Graph> graph,
-      const std::unordered_set<std::string> &act_types) const;
+  ir::Graph *FuseElewiseAddActInplaceGrad(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
 
   /**
    * Remove the removable intermediate_out.
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index fe844caed2..c4e6b6e6a5 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h"
 #include <algorithm>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -23,20 +24,18 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  graph = FuseReluDepthwiseConv(std::move(graph), true);
-  graph = FuseReluDepthwiseConv(std::move(graph), false);
-  return graph;
+void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const {
+  graph = FuseReluDepthwiseConv(graph, true);
+  graph = FuseReluDepthwiseConv(graph, false);
 }
 
-std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
-    std::unique_ptr<ir::Graph> graph, bool only_forward) const {
-  PADDLE_ENFORCE(graph.get());
+ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
+    ir::Graph *graph, bool only_forward) const {
+  PADDLE_ENFORCE(graph);
   if (only_forward)
-    FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get());
+    FusePassBase::Init("relu_depthwise_conv_only_forward", graph);
   else
-    FusePassBase::Init("relu_depthwise_conv", graph.get());
+    FusePassBase::Init("relu_depthwise_conv", graph);
   /*
            x ---act--> y ---layer-> z
             +----------+
@@ -144,10 +143,9 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
     }
     count++;
   };
-  gpd(graph.get(), handler);
-  GraphSafeRemoveNodes(graph.get(), need_removed_nodes);
+  gpd(graph, handler);
+  GraphSafeRemoveNodes(graph, need_removed_nodes);
   AddStatis(count);
-
   return graph;
 }
 
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
index efb49b8300..d37c153dd2 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -32,10 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
   virtual ~FuseReluDepthwiseConvPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
-  std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
-      std::unique_ptr<ir::Graph> graph, bool only_forward) const;
+  void ApplyImpl(ir::Graph* graph) const override;
+  ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
index 3372dcd181..b0d056f2c0 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 
 #include <map>
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
@@ -26,8 +28,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
-    std::unique_ptr<Graph> graph) const {
+void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const {
   // Remove the unneeded variables after memory optimization.
   std::unordered_set<std::string> vars2remove;
   if (graph->Has(kGraphToProgramVarsToRemove)) {
@@ -73,7 +74,6 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
   }
 
   program.CopyFrom(*program_pb);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h
index 4c36c3a5da..52c8f4e0fc 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
@@ -26,7 +26,7 @@ const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
 
 class GraphToProgramPass : public Pass {
  protected:
-  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
index 5d51d9751a..5ee6b8a5f1 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -84,7 +86,7 @@ TEST(GraphToProgramPass, Basic) {
 
   ProgramDesc compiled_prog;
   pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog);
-  pass->Apply(std::move(g));
+  pass->Apply(g.get());
   std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps();
   EXPECT_EQ(ops[0]->Type(), "op1");
   EXPECT_EQ(ops[1]->Type(), "op2");
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 87a28a2a66..f4df4cfeba 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include <algorithm>
+#include <unordered_map>
 #include <unordered_set>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/fluid/string/printf.h"
@@ -38,8 +38,7 @@ std::string FormatName(const Node* node) {
 }
 }  // namespace
 
-std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
   const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
   VLOG(3) << "draw IR graph viz to " << graph_viz_path;
   std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
@@ -82,7 +81,7 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
       {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
        Dot::Attr("fillcolor", "yellow")});
 
-  auto marked_nodes = ConsumeMarkedNodes(graph.get());
+  auto marked_nodes = ConsumeMarkedNodes(graph);
   // Create nodes
   for (const Node* n : graph->Nodes()) {
     std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")";
@@ -115,8 +114,6 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
   }
 
   sout << dot.Build();
-
-  return graph;
 }
 
 GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
@@ -135,4 +132,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
 }  // namespace paddle
 
 REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
-    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
\ No newline at end of file
+    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index e64916a5bb..7091aa6a95 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
@@ -34,8 +35,7 @@ class GraphVizPass : public Pass {
   using marked_nodes_t = std::unordered_set<const Node*>;
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   // Tell whether there are any marked nodes in the graph. Consume the
   // corresponding attribute.
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index 5bdc0c5fae..a39901e63b 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -20,9 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init("identity_scale_op_clean", graph.get());
+void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init("identity_scale_op_clean", graph);
 
   // pre_op -> scale_in -> scale_op -> scale_out
   // ->
@@ -72,8 +71,7 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
     IR_NODE_LINK_TO(pre_op_var, scale_out_var);
   };
 
-  detector(graph.get(), handler);
-  return graph;
+  detector(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
index 6da592561d..d66b411257 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -22,8 +22,7 @@ namespace ir {
 
 class IdentityScaleOpCleanPass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
  private:
   virtual ~IdentityScaleOpCleanPass() = default;
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index 6607c026a7..d76924116f 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -26,9 +26,9 @@ class InferCleanGraphPass : public FusePassBase {
   virtual ~InferCleanGraphPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
-    FusePassBase::Init("original_graph", graph.get());
-    PADDLE_ENFORCE(graph.get());
+  void ApplyImpl(ir::Graph* graph) const {
+    FusePassBase::Init("original_graph", graph);
+    PADDLE_ENFORCE(graph);
 
     auto is_valid_node = [](Node* x) {
       return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
@@ -46,11 +46,9 @@ class InferCleanGraphPass : public FusePassBase {
       }
     }
 
-    GraphSafeRemoveNodes(graph.get(), invalid_nodes);
+    GraphSafeRemoveNodes(graph, invalid_nodes);
 
     AddStatis(valid_op);
-
-    return graph;
   }
 
   void CleanEdges(std::vector<Node*>* nodes,
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index 57cc98e2ca..bf6fe999c1 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -20,8 +20,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void IsTestPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
              "for activations and pooling.";
   auto op_list = {"pool2d",      "sigmoid",      "logsigmoid",
@@ -47,7 +46,6 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
       }
     }
   }
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h
index 99e76ca4a3..80cedbf9f8 100644
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ b/paddle/fluid/framework/ir/is_test_pass.h
@@ -22,8 +22,7 @@ namespace ir {
 
 class IsTestPass : public Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index 9696441a21..3fa543c622 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -97,7 +97,7 @@ TEST(IsTestPass, basic) {
 
   auto pass = PassRegistry::Instance().Get("is_test_pass");
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   for (auto* node : graph->Nodes()) {
     if (node->IsOp()) {
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
index 92e897ca9c..05d23961a8 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
@@ -32,9 +32,8 @@ const char kSumGradOpName[] = "sum";
 // other optimizers later.
 const char kOptimizerType[] = "sgd";
 
-std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
+void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
 
   // We could collect all weights' name from SGD, where
   // W1 <- SGD(W0, Grad0)
@@ -92,14 +91,14 @@ std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
 
             // find the forward op related to the backward op
             ir::Node* forward_op =
-                FindForwardOpViaBackwardOp(graph.get(), backward_op);
+                FindForwardOpViaBackwardOp(graph, backward_op);
 
             VLOG(3) << "Found forward_op " << forward_op->Name();
 
             PADDLE_ENFORCE(forward_op);
 
             Node* new_optimizer_node = CreateNewSGDNode(
-                graph.get(), forward_op, backward_op, node, opt_node);
+                graph, forward_op, backward_op, node, opt_node);
 
             PADDLE_ENFORCE(new_optimizer_node);
           }
@@ -140,8 +139,6 @@ std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
       }
     }
   }
-
-  return graph;
 }
 
 ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index f9157b10d9..d1718857a5 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -60,8 +60,7 @@ class LockFreeOptimizePass : public Pass {
   virtual ~LockFreeOptimizePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
  private:
   // Create a new sgd node via current optimizer node
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index 5d0b294f6f..8ef3993b06 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -38,10 +38,9 @@ LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
   return vec_y;
 }
 
-std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
@@ -99,7 +98,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
       conv->Op()->SetOutput("Output",
                             std::vector<std::string>({eltwise_out->Name()}));
 
-      GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out});
+      GraphSafeRemoveNodes(graph, {eltwise, conv_out});
 
       IR_NODE_LINK_TO(conv, eltwise_out);
     } else {
@@ -123,14 +122,13 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
       IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
       IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
 
-      GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});
+      GraphSafeRemoveNodes(graph, {conv, eltwise, conv_out});
     }
 
     found_conv_bias_count++;
   };
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
   AddStatis(found_conv_bias_count);
-  return graph;
 }
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index 0ef5c177bf..84106d0655 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -29,8 +29,7 @@ class ConvBiasFusePass : public FusePassBase {
   virtual bool is_conv3d() const { return false; }
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
 /*
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
index 38b7fe5203..ff7f9190fd 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
+#include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
@@ -103,7 +103,7 @@ void MainTest(bool convWithExistingBias) {
 
   int original_nodes_num = graph->Nodes().size();
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   int current_nodes_num = graph->Nodes().size();
 
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index fb3db81347..ef7874c1c0 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -16,8 +16,8 @@
 #include <functional>
 #include <list>
 #include <map>
+#include <memory>
 #include <tuple>
-
 #include "paddle/fluid/framework/ir/graph_traits.h"
 
 namespace paddle {
@@ -327,17 +327,15 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
       get_node_from_elementwise_add);
 }
 
-graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+  FusePassBase::Init(name_scope_, graph);
   auto fused_graph_with_stats = FuseConvAsY(
       name_scope_,
-      FuseConvAsX(
-          name_scope_,
-          FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0))));
+      FuseConvAsX(name_scope_,
+                  FuseProjectionConv(name_scope_, std::make_pair(graph, 0))));
 
   std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl;
   AddStatis(fused_graph_with_stats.second);
-  return graph;
 }
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
index 6629dae425..9bf1ae6079 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -27,7 +28,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-using graph_ptr = std::unique_ptr<ir::Graph>;
+using graph_ptr = ir::Graph*;
 using GraphWithStats = std::pair<ir::Graph*, int>;
 
 void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
@@ -124,7 +125,7 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase {
   virtual ~ResidualConnectionMKLDNNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
+  void ApplyImpl(graph_ptr graph) const;
 
   const std::string name_scope_{"residual_connection_fuse_pass"};
 };
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
index 433d89d8d3..8a13596cd5 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -148,7 +148,7 @@ void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
   auto pass =
       PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
   int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
   int current_nodes_num = graph->Nodes().size();
 
   EXPECT_TRUE(is_reachable(graph)(from, to));
@@ -258,7 +258,7 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
   auto pass =
       PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
   int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
   int current_nodes_num = graph->Nodes().size();
 
   EXPECT_TRUE(is_reachable(graph)("a", "g"));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
index 4f4605398a..dd0fb45604 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
@@ -21,10 +21,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
+void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("conv_relu_mkldnn_fuse", graph);
 
   GraphPatternDetector gpd;
   auto* conv_input = gpd.mutable_pattern()
@@ -56,7 +55,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
     OpDesc* desc = conv->Op();
     desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
     desc->SetAttr("fuse_relu", true);
-    GraphSafeRemoveNodes(graph.get(), {relu, conv_out});
+    GraphSafeRemoveNodes(graph, {relu, conv_out});
 
     PADDLE_ENFORCE(subgraph.count(conv_input));
     IR_NODE_LINK_TO(conv, relu_out);
@@ -64,10 +63,9 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
     found_conv_relu_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
 
   AddStatis(found_conv_relu_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
index fe585bd7c4..2174c22dbf 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
@@ -31,8 +31,7 @@ class ConvReLUFusePass : public FusePassBase {
   virtual ~ConvReLUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
index 06d56f6222..67a9957059 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -88,7 +88,7 @@ TEST(ConvReLUFusePass, basic) {
 
   int original_nodes_num = graph->Nodes().size();
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   int current_nodes_num = graph->Nodes().size();
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index b3a8c20891..dff98e523a 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -216,19 +216,16 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
   PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
 }
 
-std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init(name_scope_, graph.get());
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init(name_scope_, graph);
 
   PADDLE_ENFORCE(param_scope());
 
-  QuantizeConv(graph.get(), false /* with_residual_data */);
-  QuantizeConv(graph.get(), true /* with_residual_data */);
-  QuantizePool(graph.get());
-
-  return graph;
+  QuantizeConv(graph, false /* with_residual_data */);
+  QuantizeConv(graph, true /* with_residual_data */);
+  QuantizePool(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index 9873bb04e1..a178c4dc36 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -42,8 +42,7 @@ class CPUQuantizePass : public FusePassBase {
   virtual ~CPUQuantizePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 0d0ed98901..8716a412e4 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -139,7 +139,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
 
   int original_nodes_num = graph->Nodes().size();
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   int current_nodes_num = graph->Nodes().size();
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index 511003dce5..79a8ac68b8 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -20,8 +20,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Marks operators which are to be quantized.";
   const auto& excluded_ids_list =
       Get<std::unordered_set<int>>("quantize_excluded_op_ids");
@@ -43,7 +42,6 @@ std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
       }
     }
   }
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
index ef3861b249..008a462dc4 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
@@ -25,8 +25,7 @@ namespace ir {
  */
 class CPUQuantizePlacementPass : public Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 11d72a56bd..ba4d281f81 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -94,7 +94,7 @@ void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
   pass->Set("quantize_excluded_op_ids",
             new std::unordered_set<int>(quantize_excluded_op_ids));
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   unsigned use_quantizer_true_count = 0;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index 6e74cc7787..debbbd6440 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -126,16 +126,13 @@ void CPUQuantizeSquashPass::Squash(
                   found_squash_count);
 }
 
-std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("cpu_quantize_squash_pass", graph.get());
+void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("cpu_quantize_squash_pass", graph);
 
   std::unordered_map<const Node*, int> nodes_keep_counter;
-  FindNodesToKeep(graph.get(), &nodes_keep_counter);
-  Squash(graph.get(), &nodes_keep_counter);
-
-  return graph;
+  FindNodesToKeep(graph, &nodes_keep_counter);
+  Squash(graph, &nodes_keep_counter);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
index b823a2cef3..e873994c57 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
@@ -34,8 +34,7 @@ class CPUQuantizeSquashPass : public FusePassBase {
   virtual ~CPUQuantizeSquashPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   /*
    * For each dequantize's output find the number of operators it is an input to
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 3cf51d97aa..fda337066f 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -125,7 +125,7 @@ void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
 
   int original_nodes_num = graph->Nodes().size();
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   int current_nodes_num = graph->Nodes().size();
 
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index 7851e8c84b..e854559ae7 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -25,10 +25,9 @@ namespace ir {
   auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
   PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
 
-std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
+void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph);
   GraphPatternDetector gpd;
 
   auto* pattern = gpd.mutable_pattern();
@@ -45,9 +44,8 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
     found_depthwise_conv_mkldnn_count++;
   };
 
-  gpd(graph.get(), handler);
+  gpd(graph, handler);
   AddStatis(found_depthwise_conv_mkldnn_count);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
index 8ca6a73251..ca314afde5 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
@@ -25,8 +25,7 @@ class DepthwiseConvMKLDNNPass : public FusePassBase {
   virtual ~DepthwiseConvMKLDNNPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
index 1783e3322b..f2dfbc84a5 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -86,7 +86,7 @@ TEST(DepthwiseConvMKLDNNPass, basic) {
 
   counters before{1, 1, 1, 1};
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   // initialize counters before loop
   counters after{0, 0, 0, 0};
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
index ccac65f3b3..500419e4b7 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
 #include <string>
+#include <unordered_set>
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Applies MKL-DNN placement strategy.";
   const auto& op_types_list =
       Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
@@ -37,7 +37,6 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
       }
     }
   }
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
index c071d9aed2..ffa62273ec 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -26,8 +26,7 @@ namespace ir {
  */
 class MKLDNNPlacementPass : public Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
index b6ec7e4d68..5885f327e6 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
@@ -97,7 +97,7 @@ void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types,
   pass->Set("mkldnn_enabled_op_types",
             new std::unordered_set<std::string>(mkldnn_enabled_op_types));
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   unsigned use_mkldnn_true_count = 0;
 
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index 9e77f98e9e..dcc48fb934 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -16,8 +16,9 @@
 
 #include <map>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
-
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
@@ -68,8 +69,7 @@ VarDesc UpdateGradVarDesc(
   return *var_desc;
 }
 
-std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
-    std::unique_ptr<Graph> graph) const {
+void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
   int num_repeats = Get<const int>(kNumRepeats);
   std::vector<Node*> forward_backward_ops;
   std::vector<Node*> optimize_ops;
@@ -325,7 +325,6 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
   }
 
   result.ResolveHazard(created);
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
index c1e5aef20d..a89616683d 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
@@ -36,7 +36,7 @@ class BatchMergePass : public Pass {
   virtual ~BatchMergePass() {}
 
  protected:
-  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
+  void ApplyImpl(Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index 33ccee6aa0..c0ed0519b1 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -18,8 +18,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
+Graph* Pass::Apply(Graph* graph) const {
+  PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty.");
   for (const std::string& attr : required_pass_attrs_) {
     PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
                    "Required pass atrribute %s not set.", attr);
@@ -28,16 +28,16 @@ std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
     PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
                    attr);
   }
-  auto* native_graph = graph.get();
-  auto applied_graph = ApplyImpl(std::move(graph));
+  auto* native_graph = graph;
+  ApplyImpl(graph);
   // TODO(panyx0718): Add more verifications.
-  PADDLE_ENFORCE(!HasCircle(*applied_graph),
+  PADDLE_ENFORCE(!HasCircle(*graph),
                  "Illegal Pass. Generated graph shouldn't has cycle.");
-  PADDLE_ENFORCE(applied_graph.get() == native_graph,
+  PADDLE_ENFORCE(graph == native_graph,
                  "Pass::Apply() cannot delete the passed graph and shouldn't "
                  "return a new graph.(For the need of pybind11)");
   applied_ = true;
-  return applied_graph;
+  return graph;
 }
 
 PassRegistry& PassRegistry::Instance() {
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 27746ff145..6cbe9a8212 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 #include <functional>
 #include <map>
+#include <memory>
 #include <string>
-
+#include <unordered_map>
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -44,7 +46,7 @@ class Pass {
 
   std::string Type() const { return type_; }
 
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
+  Graph *Apply(Graph *graph) const;
 
   // Get a reference to the attributed previously set.
   template <typename AttrType>
@@ -98,9 +100,8 @@ class Pass {
   }
 
  protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+  virtual void ApplyImpl(Graph *graph) const {
     LOG(FATAL) << "Calling virtual Pass not implemented.";
-    return graph;
   }
 
  private:
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 6ad7d1df8b..87e3c96416 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/pass.h"
+#include <memory>
 #include <string>
+#include <utility>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -39,7 +41,7 @@ void BuildCircleGraph(Graph* g) {
 
 class TestPass : public Pass {
  protected:
-  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+  void ApplyImpl(ir::Graph* graph) const {
     graph->Set<int>("copy_test_pass_attr", new int);
     graph->Set<int>("copy_test_graph_attr", new int);
 
@@ -48,7 +50,6 @@ class TestPass : public Pass {
 
     int test_graph_attr = graph->Get<int>("test_graph_attr");
     graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
-    return graph;
   }
 };
 
@@ -58,7 +59,7 @@ TEST(PassTest, TestPassAttrCheck) {
   std::unique_ptr<Graph> graph(new Graph(prog));
   std::string exception;
   try {
-    graph = pass->Apply(std::move(graph));
+    graph.reset(pass->Apply(graph.release()));
   } catch (paddle::platform::EnforceNotMet e) {
     exception = std::string(e.what());
   }
@@ -69,7 +70,7 @@ TEST(PassTest, TestPassAttrCheck) {
   pass->SetNotOwned<int>("test_pass_attr", &val);
 
   try {
-    graph = pass->Apply(std::move(graph));
+    graph.reset(pass->Apply(graph.release()));
   } catch (paddle::platform::EnforceNotMet e) {
     exception = std::string(e.what());
   }
@@ -78,14 +79,14 @@ TEST(PassTest, TestPassAttrCheck) {
   graph.reset(new Graph(prog));
   graph->Set<int>("test_graph_attr", new int);
   graph->Get<int>("test_graph_attr") = 1;
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
   ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
   ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
 
   // Allow apply more than once.
   graph.reset(new Graph(prog));
   graph->Set<int>("test_graph_attr", new int);
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   pass = PassRegistry::Instance().Get("test_pass");
   pass->SetNotOwned<int>("test_pass_attr", &val);
@@ -94,7 +95,7 @@ TEST(PassTest, TestPassAttrCheck) {
   graph->Set<int>("test_graph_attr", new int);
   graph->Get<int>("test_graph_attr") = 2;
   try {
-    auto tmp = pass->Apply(std::move(graph));
+    pass->Apply(graph.release());
   } catch (paddle::platform::EnforceNotMet e) {
     exception = std::string(e.what());
   }
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index 84a4ff2de1..00263b8a34 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
 #include <algorithm>  // for max
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
@@ -365,17 +366,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> RepeatedFCReluFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
   int fusion_count = 0;
   for (int i = MAX_NUM_FC; i > 1; --i) {
     fusion_count +=
-        BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
+        BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i);
   }
   AddStatis(fusion_count);
-
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
index ede0bea07f..ae777bcceb 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
@@ -31,8 +31,7 @@ class RepeatedFCReluFusePass : public FusePassBase {
   virtual ~RepeatedFCReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"repeated_fc_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
index 67b29512c4..c7cf9b0dc3 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -20,15 +20,13 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Applies Runtime Context Cache strategy.";
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
     }
   }
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
index a6cf1a9ae5..e4783166e0 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
@@ -23,8 +23,7 @@ namespace ir {
 
 class RuntimeContextCachePass : public Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index 012e68036c..b230c50167 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
 #include <set>
 #include <string>
-
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
@@ -178,9 +178,8 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
   return fc_out;
 }
 
-std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init("seq_concat_fc_fuse", graph.get());
+void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init("seq_concat_fc_fuse", graph);
   GraphPatternDetector detector;
   auto* pattern = detector.mutable_pattern();
   auto* concat_out = BuildSeqExpandConcatPattern(pattern);
@@ -194,8 +193,8 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
 
   int fuse_count{0};
 
-  detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
-                            Graph* graph) {
+  detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* graph) {
     VLOG(4) << "get one concat pattern";
     // fc
     GET_NODE(fc_w, detector.pattern());
@@ -246,8 +245,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
   });
 
   AddStatis(fuse_count);
-
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
index 06e18f9dc3..d68840a554 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
@@ -27,8 +27,7 @@ class SeqConcatFcFusePass : public FusePassBase {
   virtual ~SeqConcatFcFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 0a1f65d274..3fd368741f 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
@@ -83,14 +84,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
 
-  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope());
+  int fusion_count = BuildFusion(graph, name_scope_, param_scope());
   AddStatis(fusion_count);
-
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
index c36c6b76a2..fde9b586c8 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -28,8 +28,7 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
   virtual ~SeqConvEltAddReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
index 63a0c24f2a..4ac379eb04 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
@@ -194,17 +195,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+void SeqPoolConcatFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
   int fusion_count = 0;
   for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
     fusion_count +=
-        BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
+        BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i);
   }
   AddStatis(fusion_count);
-
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
index a5db3528da..40a9edc5e6 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
@@ -42,8 +42,7 @@ class SeqPoolConcatFusePass : public FusePassBase {
   virtual ~SeqPoolConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"seqpool_concat_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index 35d1d5129b..d366803851 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -59,7 +59,7 @@ std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
     const std::string& pass_type = "seqpool_concat_fuse_pass") {
   auto pass = PassRegistry::Instance().Get(pass_type);
   *before = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
   *after = graph->Nodes().size();
   return graph;
 }
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
index 84fb8063e6..e1ddc44470 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
@@ -24,11 +24,11 @@ namespace framework {
 namespace ir {
 
 template <int times>
-std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
+    ir::Graph *graph) const {
   const std::string pattern_name =
       "simplify_anakin_detection_pattern_pass" + std::to_string(times);
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   std::vector<PDNode *> input_nodes;
@@ -207,11 +207,10 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
     multiclass_nms_out->inputs.push_back(detection_out_op);
 
     // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(), delete_nodes);
+    GraphSafeRemoveNodes(graph, delete_nodes);
   };
 
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 template class SimplifyAnakinDetectionPatternPass<1>;
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
index 2338e4c38b..e4a266cbe8 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
@@ -32,8 +32,7 @@ class SimplifyAnakinDetectionPatternPass : public FusePassBase {
   virtual ~SimplifyAnakinDetectionPatternPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index 78c8cabb10..42f4a91a6f 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h"
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
@@ -362,13 +363,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
   return fusion_count;
 }
 
-std::unique_ptr<ir::Graph> SquaredMatSubFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
-  int fusion_count = BuildFusion(graph.get(), name_scope_);
+void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  int fusion_count = BuildFusion(graph, name_scope_);
   AddStatis(fusion_count);
-
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index c21ba65c40..b6165a512a 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -31,8 +31,7 @@ class SquaredMatSubFusePass : public FusePassBase {
   virtual ~SquaredMatSubFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 
   const std::string name_scope_{"squared_mat_sub_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
index b370039915..f4f924a604 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
@@ -21,8 +21,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void SyncBatchNormPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Use synchronous batch norm";
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
@@ -35,7 +34,6 @@ std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
       }
     }
   }
-  return graph;
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
index 51cce3dca6..694fae7494 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass.h
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
@@ -23,8 +23,7 @@ namespace ir {
 
 class SyncBatchNormPass : public Pass {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
index 9c94c1746a..894f96050e 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
@@ -60,7 +60,7 @@ TEST(IsTestPass, basic) {
 
   auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
 
-  graph = pass->Apply(std::move(graph));
+  graph.reset(pass->Apply(graph.release()));
 
   for (auto* node : graph->Nodes()) {
     if (node->IsOp()) {
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index cab69c408d..61c12d4b6e 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -26,11 +26,10 @@ namespace framework {
 namespace ir {
 
 template <int times>
-std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
   const std::string pattern_name =
       "transpose_flatten" + std::to_string(times) + "_concat_fuse";
-  FusePassBase::Init(pattern_name, graph.get());
+  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   std::vector<PDNode *> input_nodes;
@@ -117,11 +116,10 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
     concat_out->inputs.push_back(new_conv_op);
 
     // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(), delete_nodes);
+    GraphSafeRemoveNodes(graph, delete_nodes);
   };
 
-  gpd(graph.get(), handler);
-  return graph;
+  gpd(graph, handler);
 }
 
 template class TransposeFlattenConcatFusePass<1>;
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
index a7d18ec86d..366d26d800 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
@@ -30,8 +30,7 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
   virtual ~TransposeFlattenConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 20a8c47d5d..ab0947c631 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -77,8 +77,7 @@ class ParallelExecutorPrivate {
     }
   }
 
-  std::unique_ptr<ir::Graph> PrepareGCAndRefCnts(
-      std::unique_ptr<ir::Graph> graph, size_t max_memory_size);
+  ir::Graph *PrepareGCAndRefCnts(ir::Graph *graph, size_t max_memory_size);
 
   inline bool HasGarbageCollectors() const { return !gcs_.empty(); }
 
@@ -118,8 +117,8 @@ class ParallelExecutorPrivate {
   details::GarbageCollectorMap gcs_;
 };
 
-std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
-    std::unique_ptr<ir::Graph> graph, size_t max_memory_size) {
+ir::Graph *ParallelExecutorPrivate::PrepareGCAndRefCnts(
+    ir::Graph *graph, size_t max_memory_size) {
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &place = places_[i];
     if (gcs_.count(place) > 0) {
@@ -161,7 +160,7 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
                               &global_ref_cnts_);
     ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars,
                               &last_live_ops_of_vars);
-    graph = ref_cnt_pass->Apply(std::move(graph));
+    graph = ref_cnt_pass->Apply(graph);
     VLOG(10) << "ReferenceCountPass Applied";
 
     auto eager_deletion_pass =
@@ -172,10 +171,9 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars,
                                      &last_live_ops_of_vars);
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
-    graph = eager_deletion_pass->Apply(std::move(graph));
+    graph = eager_deletion_pass->Apply(graph);
     VLOG(10) << "EagerDeletionPass Applied";
   }
-
   return graph;
 }
 
@@ -220,13 +218,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     }
   }
 
-  std::unique_ptr<ir::Graph> temp_owned_graph(graph);
-
   // FIXME(Yancey1989): parallel graph mode get better performance
   // in GPU allreduce distributed training. Need an elegant way to
   // choice the execution strategy.
-  build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(
-      *temp_owned_graph, exec_strategy, build_strategy);
+  build_strategy.enable_parallel_graph_ =
+      EnableParallelGraphExecution(*graph, exec_strategy, build_strategy);
   if (build_strategy.enable_parallel_graph_)
     VLOG(0) << "The Executor would execute the graph by ParallelGraph "
                "Execution which can get better performance,"
@@ -304,27 +300,21 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-
-  temp_owned_graph = build_strategy.Apply(
-      std::move(temp_owned_graph), member_->places_, loss_var_name,
-      member_->local_scopes_, member_->nranks_, member_->use_cuda_,
-      member_->nccl_ctxs_.get());
+  graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_, member_->nccl_ctxs_.get());
 #else
-  temp_owned_graph = build_strategy.Apply(
-      std::move(temp_owned_graph), member_->places_, loss_var_name,
-      member_->local_scopes_, member_->nranks_, member_->use_cuda_);
+  graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_);
 
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
   VLOG(10) << "Eager Deletion Threshold "
            << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
-    graph = member_
-                ->PrepareGCAndRefCnts(std::move(temp_owned_graph),
-                                      static_cast<size_t>(max_memory_size))
-                .release();
-  } else {
-    graph = temp_owned_graph.release();
+    graph = member_->PrepareGCAndRefCnts(graph,
+                                         static_cast<size_t>(max_memory_size));
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 7a96ac11d8..78e502c670 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -140,7 +140,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
     if (pass->Type() != "graph_viz_pass") {
       PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
     }
-    graph = pass->Apply(std::move(graph));
+    graph.reset(pass->Apply(graph.release()));
   }
   return graph;
 }
@@ -156,7 +156,7 @@ framework::proto::ProgramDesc IRPassManager::AcquireProgram(
   desc.CopyFrom(*program->Proto());
   pass->SetNotOwned("program", &desc);
   auto *the_graph = graph->release();
-  *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
+  graph->reset(pass->Apply(the_graph));
   return *desc.Proto();
 }
 
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index 12deed2533..9e05aa5c16 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -35,8 +35,8 @@ namespace analysis {
 
 using framework::ir::Node;
 
-std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
-    std::unique_ptr<framework::ir::Graph> graph) const {
+void analysis::AnakinSubgraphPass::ApplyImpl(
+    framework::ir::Graph *graph) const {
   framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
 
   auto teller = [](const framework::ir::Node *node) {
@@ -72,8 +72,6 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
   graph->Set(framework::ir::kRepetitiveParamAttr,
              new std::vector<std::string>(repetitive_params));
-
-  return graph;
 }
 
 std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
index c13b9ecda4..e80b8bb612 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
@@ -29,8 +29,7 @@ namespace analysis {
 
 class AnakinSubgraphPass : public framework::ir::FusePassBase {
  public:
-  std::unique_ptr<framework::ir::Graph> ApplyImpl(
-      std::unique_ptr<framework::ir::Graph> graph) const override;
+  void ApplyImpl(framework::ir::Graph *graph) const override;
 
  private:
   void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 5939940327..ef5872c52c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -31,16 +31,16 @@ namespace analysis {
 
 using framework::ir::Node;
 
-std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
-    std::unique_ptr<framework::ir::Graph> graph) const {
-  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
+void analysis::TensorRtSubgraphPass::ApplyImpl(
+    framework::ir::Graph *graph) const {
+  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph);
 
   auto teller = [](const framework::ir::Node *node) {
     if (!node->IsOp() || !node->Op()) return false;
     return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };
 
-  SubGraphFuser fuser(graph.get(), teller,
+  SubGraphFuser fuser(graph, teller,
                       Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
 
@@ -52,12 +52,11 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
 
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get(), graph_param_names,
-                       &repetitive_params);
+      CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
 
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
-      framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+      framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
     }
   }
 
@@ -67,11 +66,9 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
       nodes2remove.insert(node);
     }
   }
-  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
   graph->Set(framework::ir::kRepetitiveParamAttr,
              new std::vector<std::string>(repetitive_params));
-
-  return graph;
 }
 
 std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index f043670c5a..f530a5a0b3 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -28,8 +28,7 @@ namespace analysis {
 
 class TensorRtSubgraphPass : public framework::ir::FusePassBase {
  public:
-  std::unique_ptr<framework::ir::Graph> ApplyImpl(
-      std::unique_ptr<framework::ir::Graph> graph) const override;
+  void ApplyImpl(framework::ir::Graph *graph) const override;
 
  private:
   void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
index 6b3d80fcef..35df396fe8 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
+#include <memory>
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -37,8 +38,7 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
   framework::ProgramDesc desc;
   desc.CopyFrom(*argument->main_program().Proto());
   pass->SetNotOwned("program", &desc);
-  auto thegraph = pass->Apply(std::move(graph));
-  thegraph.release();  // the argument still own the graph.
+  pass->Apply(graph.release());  // the argument still own the graph.
 
   argument->SetIrAnalyzedProgram(
       new framework::proto::ProgramDesc(*desc.Proto()));
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3b0939ef82..d4c85fd0c6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1046,9 +1046,7 @@ All parameter, weight, gradient are variables in Paddle.
                      int val) { self.Set<const int>(name, new int(val)); })
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
-        std::unique_ptr<ir::Graph> origin_graph(graph.get());
-        auto optim_graph = self.Apply(std::move(origin_graph));
-        optim_graph.release();
+        self.Apply(graph.get());
       });
 
   py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(

From 2265d091e6a0e18b48f801e73f048112ecc24904 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Thu, 28 Mar 2019 07:42:52 -0500
Subject: [PATCH 17/19] Fix threaded executor bug (#16508)

* fix threaded executor bug
test=develop

* change the order of class member
test=develop

* Fix Travis CI
test=develop
---
 .../fast_threaded_ssa_graph_executor.cc       |  5 +++--
 .../fast_threaded_ssa_graph_executor.h        | 19 ++++++++++++-------
 .../details/threaded_ssa_graph_executor.cc    |  8 ++++----
 .../details/threaded_ssa_graph_executor.h     | 19 +++++++++----------
 4 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index d4fbea9d95..297ee92fc3 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       local_scopes_(local_scopes),
       places_(places),
       graph_(graph),
+      fetch_ctxs_(places),
       pool_(strategy.num_threads_),
-      prepare_pool_(1),  // add one more thread for generate op_deps
-      fetch_ctxs_(places) {
+      // add one more thread for generate op_deps
+      prepare_pool_(1) {
   for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
     int dep = static_cast<int>(op->NotReadyInputSize());
     op_deps_.emplace(op, dep);
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 970298950c..f6d5160e75 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -14,7 +14,9 @@
 
 #pragma once
 #include <ThreadPool.h>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   const ir::Graph &Graph() const override;
 
  private:
+  // Note(zcd): the ThreadPool should be placed last so that ThreadPool should
+  // be destroyed first.
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unordered_map<OpHandleBase *, int> op_deps_;
   std::vector<OpHandleBase *> bootstrap_ops_;
 
-  ::ThreadPool pool_;
-  ::ThreadPool prepare_pool_;
   platform::DeviceContextPool fetch_ctxs_;
   std::atomic<int> remaining_;
 
+  std::future<
+      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
+      atomic_op_deps_;
+  ExceptionHolder exception_;
+
+  ::ThreadPool pool_;
+  ::ThreadPool prepare_pool_;
+
   void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
                   OpHandleBase *op,
                   const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
 
   void PrepareAtomicOpDeps();
-
-  std::future<
-      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
-      atomic_op_deps_;
-  ExceptionHolder exception_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c4254bbadf..c00932a7bd 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, ir::Graph *graph)
     : graph_(graph),
-      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
-                                       : nullptr),
-      prepare_pool_(1),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      strategy_(strategy) {
+      strategy_(strategy),
+      prepare_pool_(1),
+      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
+                                       : nullptr) {
   PrepareOpDeps();
   CopyOpDeps();
 }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index b9bccba8fa..1fa5196970 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
+  // Note(zcd): the ThreadPool should be placed last so that ThreadPool should
+  // be destroyed first.
   ir::Graph *graph_;
-  std::unique_ptr<::ThreadPool> pool_;
-  ::ThreadPool prepare_pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
+  std::unique_ptr<OpDependentData> op_deps_;
+  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
+  ExecutionStrategy strategy_;
+  // use std::list because clear(), push_back, and for_each are O(1)
+  std::list<std::future<void>> run_op_futures_;
+  ::ThreadPool prepare_pool_;
+  std::unique_ptr<::ThreadPool> pool_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                        OpHandleBase *op_instance) const;
@@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 
   void PrepareOpDeps();
   void CopyOpDeps();
-
- private:
-  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
-
-  ExecutionStrategy strategy_;
-  std::unique_ptr<OpDependentData> op_deps_;
-  // use std::list because clear(), push_back, and for_each are O(1)
-  std::list<std::future<void>> run_op_futures_;
 };
 
 }  // namespace details

From 2632327429ed823cf1a0c2593cfa411fb2f111b9 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Thu, 28 Mar 2019 14:27:17 +0100
Subject: [PATCH 18/19] [MKL-DNN] Tensor modifications revert  (#16462)

* Revert "[MKL-DNN] Fix to crash of Transformer when mkldnn is to be used (#16233)"

This reverts commit 13816dd4acdabd21a715b3b1c63fb43cdbac7622.
Apart from enabling transformer for MKL-DNN

* Revert "- MKL-DNN pooling updated to set_prim_desc"

This reverts commit c63f6b20393d8b21b540e2b6091419e584ea5155.

Conflicts:
	paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc

* Revert "[MKL-DNN] MKL-DNN specific Tensor modification (#15429)"

test=develop

This reverts commit dec9cf53c89e0acc605a053b436ba24be68f62c7.

* - concat compilation fix

- lint

test=develop

- Lint fixes

test=develop

- Lint fixes

test=develop

- Fix Transpose MKLDNN op

test=develop
---
 .../fluid/framework/data_layout_transform.cc  | 23 ++++--
 paddle/fluid/framework/data_transform.cc      | 30 ++------
 paddle/fluid/framework/tensor.h               | 42 +++--------
 paddle/fluid/framework/tensor_util.cc         |  5 --
 .../mkldnn/elementwise_add_mkldnn_op.cc       | 19 +++--
 .../operators/mkldnn/activation_mkldnn_op.cc  | 24 ++++--
 .../operators/mkldnn/batch_norm_mkldnn_op.cc  | 36 ++++++---
 .../operators/mkldnn/concat_mkldnn_op.cc      |  3 +-
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  | 47 ++++++------
 .../mkldnn/conv_transpose_mkldnn_op.cc        |  3 +-
 .../mkldnn/gaussian_random_mkldnn_op.cc       |  8 +-
 .../fluid/operators/mkldnn/lrn_mkldnn_op.cc   | 21 ++++--
 .../operators/mkldnn/softmax_mkldnn_op.cc     |  8 --
 .../fluid/operators/mkldnn/sum_mkldnn_op.cc   |  9 ++-
 .../operators/mkldnn/transpose_mkldnn_op.cc   | 26 ++-----
 paddle/fluid/platform/mkldnn_reuse.h          | 73 +++++++++----------
 paddle/fluid/platform/mkldnn_utils.h          | 69 ------------------
 17 files changed, 172 insertions(+), 274 deletions(-)
 delete mode 100644 paddle/fluid/platform/mkldnn_utils.h

diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 10aa7a5942..72c50518af 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   out_layout =
       out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
 
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
   std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
   std::vector<int> out_tz = in_tz;
 
@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: %s", in.type());
   memory::data_type out_type = in_type;
 
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto out_format =
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  // tempory mem pd fr out , to make reorder
-  auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
-      paddle::framework::vectorize2int(out->dims()),
-      mkldnn::memory::format::blocked, out_type);
-  if (in.get_mkldnn_prim_desc() != out_mem_pd) {
+  if (in_format != out_format) {
     void* in_data = GetDataFromTensor(in, in_type);
     auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-    auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
-    auto out_memory = memory(out_mem_pd, out_data);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
 
     platform::Reorder(in_memory, out_memory);
   } else {
     out->ShareDataWith(in);
   }
   out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
 #endif
 }
 
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index f0203edf05..8287222450 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
         // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
         // Just set layout/format. No real transform occur
+
+        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                        ToMKLDNNFormat(lin));
+
         out.ShareDataWith(input_tensor);
-        // TODO(jczaja): Remove that once all mkldnn ops
-        // are modified to work with mkldnn_blocked
-        auto mkldnn_fmt = [&](int rank) {
-          switch (rank) {
-            case 5:
-              return mkldnn::memory::format::ncdhw;
-            case 4:
-              return mkldnn::memory::format::nchw;
-            case 3:
-              return mkldnn::memory::format::ncw;
-            case 2:
-              return mkldnn::memory::format::nc;
-            case 1:
-              return mkldnn::memory::format::x;
-            default:
-              return mkldnn::memory::format::blocked;
-          }
-        };
-
-        auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
-            paddle::framework::vectorize2int(out.dims()),
-            mkldnn_fmt(out.dims().size()));
-
-        out.set_mkldnn_prim_desc(out_mem_pd);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(out_format);
 #endif
       } else {
         // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 88f5b757a8..a3c1063ce9 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cstring>
 #include <memory>
 #include <typeindex>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
@@ -27,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_utils.h"
-#endif
-
 namespace paddle {
 
 namespace framework {
@@ -41,34 +38,10 @@ class Tensor {
 #ifdef PADDLE_WITH_MKLDNN
 
  public:
-  // TODO(jczaja): This is depracted and will be removed
-  inline mkldnn::memory::format format() const {
-    if (layout_ == DataLayout::kMKLDNN) {
-      return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
-    } else {
-      return mkldnn::memory::format::format_undef;
-    }
-  }
+  inline mkldnn::memory::format format() const { return format_; }
 
-  // TODO(jczaja): This is depracted and will be removed
-  inline void set_format(
-      const mkldnn::memory::format fmt,
-      mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
-    mem_pd_ = paddle::platform::create_prim_desc_from_format(
-        paddle::framework::vectorize2int(dims()), fmt, data_type);
-    layout_ = DataLayout::kMKLDNN;
-  }
-
-  inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
-    return mem_pd_;
-  }
-
-  inline void set_mkldnn_prim_desc(
-      const mkldnn::memory::primitive_desc& mem_pd) {
-    // Internally MKL-DNN is just copying (increasing reference counter)
-    // to shared_ptr. So asignment should be quite cheap
-    mem_pd_ = mem_pd;
-    layout_ = DataLayout::kMKLDNN;
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
   }
 
  protected:
@@ -76,9 +49,12 @@ class Tensor {
    * @brief the detail format of memory block which have layout as kMKLDNN
    *
    * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
    */
-  mutable mkldnn::memory::primitive_desc mem_pd_;
+
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
 #endif
 
  public:
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 5f21dae605..a7f09df491 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
               << dst_place;
       return;
     }
-#ifdef PADDLE_WITH_MKLDNN
-    if (src.layout() == DataLayout::kMKLDNN) {
-      dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
-    }
-#endif
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 7aaa607f15..6a6741d8fc 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       } else {
         functor.RunMidWise(n, pre, post);
       }
-      z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc());
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(x->format());
     } else {
       PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
                          x->format() != memory::format::format_undef,
@@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
 
       // create mkldnn memory for dst
-      auto dst_mem_pd = sum_pd.dst_primitive_desc();
-      memory dst_memory = memory(dst_mem_pd, z_data);
+      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
 
       std::vector<primitive::at> inputs;
       inputs.push_back(srcs[0]);
@@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       pipeline.push_back(sum_prim);
       stream(stream::kind::eager).submit(pipeline).wait();
 
-      z->set_mkldnn_prim_desc(dst_mem_pd);
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(
+          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
     }
   }
 };
@@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* out = dout;
     auto *x = dout, *y = dout;
 
+    auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
+      in->set_layout(DataLayout::kMKLDNN);
+      in->set_format(out->format());
+    };
+
     if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
       if (dx->dims() == dy->dims()) {
         auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
         if (dx) {
           blas.VCOPY(dout->numel(), dout->data<T>(),
                      dx->mutable_data<T>(ctx.GetPlace()));
-          dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
+          set_mkldnn_format(dx, dout);
         }
 
         if (dy) {
           blas.VCOPY(dout->numel(), dout->data<T>(),
                      dy->mutable_data<T>(ctx.GetPlace()));
-          dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
+          set_mkldnn_format(dy, dout);
         }
       }
     } else {
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 43559940d9..5b7505f3c4 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
 
   std::vector<int> src_tz = framework::vectorize2int(x->dims());
 
-  auto src_format = x->format();
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
 
   const std::string key = gethash(src_tz, algorithm);
   const std::string key_src_data =
@@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
 
   if (p_fwd == nullptr) {
     // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
     auto src_memory = std::shared_ptr<memory>(
-        new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data)));
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
     // save src_memory to be referred in backward path
     dev_ctx.SetBlob(key_src_mem, src_memory);
 
@@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   pipeline.push_back(*p_fwd);
   stream(stream::kind::eager).submit(pipeline).wait();
 
-  y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
 }
 
 template <typename T>
@@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
 
   std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
 
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
+
   const std::string key = gethash(diff_dst_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
@@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
       key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
   const std::string key_fwd_pd =
       key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
-  const std::string key_with_layouts = key + std::to_string(*p_src_layout) +
-                                       "-" + std::to_string(diff_y->format());
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
   const std::string key_diff_src_mem =
       key_with_layouts + "@eltwise_diff_src_mem";
   const std::string key_diff_dst_mem =
@@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
 
   if (p_grad == nullptr) {
     // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
     auto diff_dst_memory = std::shared_ptr<memory>(
-        new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)));
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
     dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
 
     // retrieve eltwise primitive desc from device context
@@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
   pipeline.push_back(*p_grad);
   stream(stream::kind::eager).submit(pipeline).wait();
 
-  diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
 }
 
 template <typename T, mkldnn::algorithm algorithm>
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index 04e45d4853..bddca232e6 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
     // keys for backward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, global_stats, x->format(),
+        src_tz, epsilon, flags, global_stats, input_format,
         ctx.op().Output("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
-    auto user_src_md = x->get_mkldnn_prim_desc().desc();
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
 
     // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
@@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
                                    key);
 
-    auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(),
-                                               to_void_cast(x_data));
+    auto src_memory =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
 
     // crate mkldnn memory for weights(scale/shift)
     auto scaleshift_memory =
@@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           variance_memory, false);
     }
 
-    y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(platform::GetMKLDNNFormat(*dst_memory));
 
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*batch_norm_p);
@@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
+    mkldnn::memory::format dst_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
+
     mkldnn::memory::format input_format =
         platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
@@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     // keys from forward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, false, x->format(),
+        src_tz, epsilon, flags, false, input_format,
         ctx.op().Input("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
     // keys for primitives reuse
     const std::string key_with_hash =
         key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
-                                              x->format());
+                                              input_format);
     const std::string key_batch_norm_bwd_p =
         key_with_hash + "@batch_norm_bwd_p";
     const std::string key_batch_norm_src_mem_p =
@@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     primitive reorder_diff_dst;
     bool is_diff_dst_reordered = false;
-    auto user_diff_dst_memory =
-        memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data));
+    auto user_diff_dst_memory = memory(
+        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
+        to_void_cast(diff_y_data));
 
     // MKLDNN requires a single piece of memory for scale and shift/bias data
     const size_t scaleshift_size = 2 * ic;
@@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
 
       // set layout/format of output tensors
-      diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
+      diff_x->set_layout(DataLayout::kMKLDNN);
+      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
+                             .desc()
+                             .data.format);
     } else {
       // primitives already exist
       UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
@@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       // set layout/format of output tensors
-      diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
+      diff_x->set_layout(DataLayout::kMKLDNN);
+      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
+                             .desc()
+                             .data.format);
     }
 
     // execute optional reorder and batch_norm backward primitive
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 97387af92f..50fe2e6e4c 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     stream(stream::kind::eager).submit({*concat_p}).wait();
 
-    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetDstMemFormat(*concat_pd));
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 8d96ae7e42..5e4d79f1c3 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
-    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
     PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
                    "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
     PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
@@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     std::vector<primitive> pipeline;
 
-    // For convolution with groups we need to recreate primitive descriptor
-    // as Paddle tensor is not having group dims while mkldnn treats
-    // group as another dimensions
-    mkldnn::memory::primitive_desc user_weights_mpd =
-        filter->get_mkldnn_prim_desc();
-    if (g > 1) {
-      mkldnn::memory::format weights_format =
-          GetWeightsFormat(filter->format(), g, is_conv3d);
-      auto user_weights_md = platform::MKLDNNMemDesc(
-          {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
-      user_weights_mpd =
-          mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
-    }
+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    mkldnn::memory::format weights_format = mkldnn::memory::format::any;
+    weights_format = mkldnn::memory::format::any;
     // Check the format for user's special output
     if (chosen_memory_format != mkldnn::memory::format::any) {
       if (is_conv3d) {
@@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p = handler.AcquireSrcMemory(
-        input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data));
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_mpd, to_void_cast<T>(filter_data));
+        user_weights_md, to_void_cast<T>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
@@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
   void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
     const bool is_test = ctx.Attr<bool>("is_test");
@@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       // push primitive to stream and wait until it's executed
       pipeline.push_back(*conv_bwd_weights_p);
 
-      auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
-      filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
+      filter_grad->set_layout(DataLayout::kMKLDNN);
+      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
     }
 
     if (input_grad) {
@@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
       pipeline.push_back(*conv_bwd_data_p);
 
-      input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc());
+      input_grad->set_layout(DataLayout::kMKLDNN);
+      input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
     }
     stream(stream::kind::eager).submit(pipeline).wait();
   }
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 79a0c5c768..317d4cebe2 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*conv_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index d01e8dbf4c..76b00b396c 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
 
     // The format of output is set as the mkldnn's format
     // TODO(@mozga-intel) The format of matrix sets inside the another layers.
-    // TODO(jczaja): Remove this hack after checking performance on block layout
-
-    auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
-        paddle::framework::vectorize2int(tensor->dims()),
-        mkldnn::memory::format::oihw);
-    tensor->set_mkldnn_prim_desc(tensor_mem_pd);
+    tensor->set_layout(DataLayout::kMKLDNN);
+    tensor->set_format(mkldnn::memory::format::oihw);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
index 4ff27ab122..097ba01d40 100644
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
@@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
     e_mid = e_mid.constant(k);
 
-    auto src_md = x->get_mkldnn_prim_desc().desc();
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, x->format());
 
     auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
                                                   mkldnn::lrn_across_channels,
@@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                                   beta,
                                                   k};
 
-    auto src_memory_pd = x->get_mkldnn_prim_desc();
+    auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
 
     if (!is_test) {
       const std::string key = ctx.op().Output("Out");
@@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       src_memory->set_data_handle(
           static_cast<void*>(const_cast<T*>(input_data)));
 
-      auto dst_memory_pd = forward_pd->dst_primitive_desc();
-      auto dst_memory =
-          mkldnn::memory(dst_memory_pd, static_cast<void*>(output_data));
+      auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
+                                       static_cast<void*>(output_data));
       auto workspace_memory = insert_to_context<mkldnn::memory>(
           key_workspace_memory, dev_ctx,
           forward_pd->workspace_primitive_desc());
 
       run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
-      out->set_mkldnn_prim_desc(dst_memory_pd);
+
+      out->set_layout(framework::DataLayout::kMKLDNN);
+      out->set_format(platform::GetMKLDNNFormat(dst_memory));
     } else {
       auto forward_pd =
           mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
@@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
       auto workspace_memory =
           mkldnn::memory{forward_pd.workspace_primitive_desc()};
-      auto dst_memory_pd = forward_pd.dst_primitive_desc();
       auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
                                        static_cast<void*>(output_data));
 
       run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
-      out->set_mkldnn_prim_desc(dst_memory_pd);
+
+      out->set_layout(framework::DataLayout::kMKLDNN);
+      out->set_format(platform::GetMKLDNNFormat(dst_memory));
     }
   }
 };
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index 0ce5522194..dc1176f084 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     auto softmax_p =
         handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
 
-    // We cannot use softmax_dst_memory_p to get prim desc as
-    // it contains flattened dims (2D) while output tensor can
-    // have 2,3,4+ dims
-    auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
-        paddle::framework::vectorize2int(output->dims()),
-        mkldnn::memory::format::blocked);
-    output->set_mkldnn_prim_desc(output_mem_pd);
-
     std::vector<primitive> pipeline{
         *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
     stream(stream::kind::eager).submit(pipeline).wait();
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index aef5b7d431..6f64157b64 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
 
       auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
-      auto dst_mem_pd = sum_pd.dst_primitive_desc();
+
       std::shared_ptr<memory> dst_mem;
       if (in_place) {
-        dst_mem.reset(new memory(dst_mem_pd));
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
       } else {
-        dst_mem.reset(new memory(dst_mem_pd, output_data));
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
       }
       std::vector<mkldnn::primitive::at> inputs;
       for (size_t i = 0; i < srcs_mem.size(); ++i) {
@@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       if (in_place) pipeline.push_back(reorder_prim);
       stream(stream::kind::eager).submit(pipeline).wait();
 
-      output->set_mkldnn_prim_desc(dst_mem_pd);
+      output->set_layout(DataLayout::kMKLDNN);
+      output->set_format(output_format);
     } else {  // Fallback to naive version
       // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
       SumKernel<CPUDeviceContext, T> reference_kernel;
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index 4debc7ca5e..95cee806ac 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                              mkldnn_engine, key);
 
     auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data));
+        input->format(), platform::to_void_cast<T>(input_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(output, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
@@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    // Transpose did change logical dimensions of Tensor, but reorder does not.
-    // Reorder does change only physical layout eg. format , strides
-    // so we need to create new primitive descriptor with changed logical layout
-    // so it match output shape
-    auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
-        paddle::framework::vectorize2int(output->dims()),
-        mkldnn::memory::format::blocked);
-    output->set_mkldnn_prim_desc(output_mem_pd);
+    output->set_layout(DataLayout::kNCHW);
+    output->set_format(mkldnn::memory::format::format_undef);
   }
 };
 
@@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
                                              mkldnn_engine, key);
 
-    auto transpose_src_memory_p =
-        handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(),
-                                 platform::to_void_cast<T>(out_grad_data));
+    auto transpose_src_memory_p = handler.AcquireSrcMemory(
+        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(x_grad, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
@@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    // Transpose did change logical dimensions of Tensor, but reorder does not.
-    // Reorder does change only physical layout eg. format , strides
-    // so we need to create new primitive descriptor with changed logical layout
-    // so it match output shape
-    auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
-        paddle::framework::vectorize2int(x_grad->dims()),
-        mkldnn::memory::format::blocked);
-    x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
   }
 };
 
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 4fa6774f02..ecaad4ec07 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout_transform.h"
@@ -39,45 +40,6 @@ class MKLDNNHandler {
     return this->AcquireMemory(md, ptr, "@user_src_mem_p");
   }
 
-  // TODO(jczaja): extract common part and make AcquireMemory
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
-    auto local_key = key_ + "@user_src_mem_p";
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   " find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
-      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
-    auto local_key = key_ + "@user_weights_mem_p";
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   " find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
       const mkldnn::memory::desc& md, void* ptr,
       user_function custom_func = {}) {
@@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
                          mkldnn::engine engine, const std::string& base_key)
       : platform::MKLDNNHandler(dev_ctx, engine, base_key),
         dims_(dims),
-        axis_(axis) {}
+        axis_(axis),
+        logical_axis_(dims.size(), 0) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::format& fmt, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      // Make memory descriptor using input format, unless it
+      // cannot be trusted (nchw) then make up memory fmt manually
+      for (size_t i = 0; i < logical_axis_.size(); ++i) {
+        logical_axis_[i] = i;
+      }
+      auto src_md = fmt != mkldnn::memory::format::nchw
+                        ? platform::MKLDNNMemDesc(
+                              dims_, platform::MKLDNNGetDataType<float>(), fmt)
+                        : Axis2MemoryDesc(dims_, logical_axis_);
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
 
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
                                                    platform::Place place) {
@@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
  private:
   std::vector<int> dims_;
   std::vector<int> axis_;
+  std::vector<int> logical_axis_;
 };
 
 template <class forward_t, class backward_data_t, class backward_weights_t>
diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h
deleted file mode 100644
index 8c511f97d1..0000000000
--- a/paddle/fluid/platform/mkldnn_utils.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <mkldnn.h>
-#include <string>
-
-namespace paddle {
-namespace platform {
-
-inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
-    const std::vector<int>& ltz, mkldnn::memory::format fmt,
-    mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
-  mkldnn_memory_desc_t mem_fmt;
-
-  mem_fmt.primitive_kind = mkldnn_memory;
-  mem_fmt.ndims = ltz.size();
-  for (unsigned int i = 0; i < ltz.size(); ++i) {
-    mem_fmt.dims[i] = ltz[i];  // logical dimensions (nchw format,
-                               // regardless physical layout)
-  }
-  mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
-  mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);
-
-  unsigned int total_stride = 1;
-  for (int i = ltz.size() - 1; i >= 0; --i) {
-    mem_fmt.layout_desc.blocking.padding_dims[i] =
-        ltz[i];  // logical dimensions (nchw format, regardless physical
-                 // layout)
-    mem_fmt.layout_desc.blocking.block_dims[i] = 1;
-    mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
-    mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
-    mem_fmt.layout_desc.blocking.strides[1][i] = 1;
-    total_stride *= ltz[i];
-  }
-  mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
-
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto place = paddle::platform::CPUPlace();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
-  auto& cpu_engine = dev_ctx->GetEngine();
-  return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
-}
-
-inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
-    const std::vector<int>& ltz, const mkldnn::memory::format format,
-    const mkldnn::memory::data_type data_type) {
-  auto md = mkldnn::memory::desc({ltz}, data_type, format);
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto place = paddle::platform::CPUPlace();
-  auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
-  PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
-  auto& cpu_engine = dev_ctx->GetEngine();
-  return mkldnn::memory::primitive_desc(md, cpu_engine);
-}
-
-}  // namespace platform
-}  // namespace paddle

From 1096746cbfbb2a5cba835e284b3054f66db0ea85 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Thu, 28 Mar 2019 09:59:54 -0500
Subject: [PATCH 19/19] Fuse Adam And SGD ops (#15933)

* fuse optimizer
---
 paddle/fluid/framework/details/CMakeLists.txt |   9 +-
 .../alloc_continuous_space_for_grad_pass.cc   |  48 ++--
 .../framework/details/broadcast_op_handle.cc  |  13 +-
 .../fluid/framework/details/build_strategy.cc |  52 +++-
 .../fluid/framework/details/build_strategy.h  |   3 +-
 .../framework/details/fuse_adam_op_pass.cc    | 199 +++++++++++++++
 .../framework/details/fuse_adam_op_pass.h     |  55 ++++
 .../details/fuse_optimizer_op_pass.cc         | 240 ++++++++++++++++++
 .../details/fuse_optimizer_op_pass.h          |  75 ++++++
 .../framework/details/fuse_sgd_op_pass.cc     |  74 ++++++
 .../framework/details/fuse_sgd_op_pass.h      |  50 ++++
 .../details/fused_all_reduce_op_handle.cc     |  29 ++-
 .../details/multi_devices_graph_pass.h        |   5 +-
 .../framework/details/multi_devices_helper.h  |  26 +-
 paddle/fluid/framework/tensor.cc              |   2 +-
 paddle/fluid/framework/tensor.h               |   2 +-
 .../operators/alloc_continuous_space_op.cc    |  45 +++-
 paddle/fluid/pybind/pybind.cc                 |   9 +
 .../unittests/parallel_executor_test_base.py  |   2 +
 .../test_alloc_continuous_space_op.py         |  43 +++-
 .../unittests/test_fuse_optimizer_pass.py     | 135 ++++++++++
 .../unittests/test_parallel_executor_crf.py   | 115 +++++----
 .../test_parallel_executor_dry_run.py         |  17 +-
 23 files changed, 1101 insertions(+), 147 deletions(-)
 create mode 100644 paddle/fluid/framework/details/fuse_adam_op_pass.cc
 create mode 100644 paddle/fluid/framework/details/fuse_adam_op_pass.h
 create mode 100644 paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
 create mode 100644 paddle/fluid/framework/details/fuse_optimizer_op_pass.h
 create mode 100644 paddle/fluid/framework/details/fuse_sgd_op_pass.cc
 create mode 100644 paddle/fluid/framework/details/fuse_sgd_op_pass.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 046ec6978a..d4939779a2 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
 cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
+
 cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
+cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
+cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
@@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
         fuse_elewise_add_act_pass multi_batch_merge_pass 
-        fuse_relu_depthwise_conv_pass 
-        memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
+        fuse_relu_depthwise_conv_pass
+        memory_optimize_pass lock_free_optimize_pass
+        alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
+        fuse_adam_op_pass fuse_sgd_op_pass)
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
index e195e93fb8..8e8258ffb1 100644
--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
+
 DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0 KB
               "fuse_parameter_memory_size is up limited memory size "
               "of one group parameters' gradient which is the input "
@@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       auto ele_dtype = iter->second->Var()->GetDataType();
       if (dtype == kDefaultDtype) {
         dtype = ele_dtype;
-        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
+        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
+                          "The data type should not be bool.");
       }
-      PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+      PADDLE_ENFORCE_EQ(ele_dtype, dtype,
+                        "The data type of input is not consistent.");
     }
 
-    // Create the fused variable name.
+    // Create a FusedVarsSet to avoid duplicating names for fused_var in other
+    // pass.
     if (!result.Has(kFusedVars)) {
       result.Set(kFusedVars, new FusedVars);
     }
-    const std::string prefix(kFusedVarNamePrefix);
-    // The fused_var_name should be unique.
-    auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
+    // the kFusedGrads is used be fuse_optimizer_op_pass.
+    result.Set(kFusedGrads, new FusedGrads);
+
+    // the fused_var_name should be unique, so it appends
+    // params_grads.begin()->second.
+    auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
+                          params_grads.begin()->second;
+    result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
     auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
-    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
+    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
+                      "%s is duplicate in FusedVars.", fused_var_name);
     fused_var_set.insert(fused_var_name);
 
     InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
@@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     return type == proto::VarType::LOD_TENSOR;
   }
 
-  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
-                                 const std::vector<std::string> &grads_name,
-                                 const std::string &fused_var_name,
-                                 BlockDesc *global_block) const {
-    auto op_desc = global_block->AppendOp();
-    op_desc->SetType("alloc_continuous_space");
-    op_desc->SetInput("Input", params_name);
-    op_desc->SetOutput("Output", grads_name);
-    op_desc->SetOutput("FusedOutput", {fused_var_name});
-  }
-
   void RecordParamsAndGrads(ir::Node *node,
                             ParamsAndGrads *params_grads) const {
     try {
@@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
     }
 
+    // Alloc continuous space for vars.
     std::vector<std::string> grads_name;
     std::vector<std::string> params_name;
     grads_name.reserve(params_grads.size());
@@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
                               program_desc.MutableBlock(0));
 
-    // Run Only Once Programs
     for (size_t i = 0; i < local_scopes.size(); ++i) {
       for (auto &op_desc : program_desc.Block(0).AllOps()) {
         auto op = OpRegistry::CreateOp(*op_desc);
@@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
     }
   }
+
+  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
+                                 const std::vector<std::string> &grads_name,
+                                 const std::string &fused_var_name,
+                                 BlockDesc *global_block) const {
+    auto op_desc = global_block->AppendOp();
+    op_desc->SetType("alloc_continuous_space");
+    op_desc->SetInput("Input", params_name);
+    op_desc->SetOutput("Output", grads_name);
+    op_desc->SetOutput("FusedOutput", {fused_var_name});
+  }
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index fdff83b928..752c932a21 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() {
   if (places_.size() == 1) return;
 
   // The input and output may have dummy vars.
-  VarHandle *in_var_handle;
-  {
-    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
-                      "The number of input should be one.");
-    in_var_handle = in_var_handles[0];
-  }
-
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
   auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
+                    "The number of input should be one.");
   PADDLE_ENFORCE_EQ(
       out_var_handles.size(), places_.size(),
       "The number of output should equal to the number of places.");
 
+  VarHandle *in_var_handle = in_var_handles[0];
+
   WaitInputVarGenerated();
 
   std::vector<const Scope *> var_scopes;
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 078403f30f..df69b11ec6 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <memory>
 #include <utility>
-
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
@@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("inplace_pass");
     }
 
-    if (strategy.fuse_elewise_add_act_ops_) {
+    if (strategy_.fuse_elewise_add_act_ops_) {
       VLOG(10) << "Add fuse_elewise_add_act_pass";
       AppendPass("fuse_elewise_add_act_pass");
     }
 
     // for single card training, fuse_all_reduce_ops is unnecessary.
     // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
-    if (strategy.fuse_all_reduce_ops_) {
+    if (strategy_.fuse_all_reduce_ops_) {
       VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
       AppendPass("alloc_continuous_space_for_grad_pass");
     }
 
+    if (strategy_.fuse_all_optimizer_ops_) {
+      if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
+          strategy_.is_distribution_) {
+        VLOG(3)
+            << "Currently, fuse_all_optimizer_ops only works under AllReduce "
+               "mode.";
+        strategy_.fuse_all_optimizer_ops_ = false;
+      } else {
+        VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
+        AppendPass("alloc_continuous_space_for_grad_pass");
+        // NOTE: fuse_all_xx_ops will count the number of xx operator first,
+        // if the number is zero, fuse_all_reduce_ops will do nothing.
+        // Currently, only one type of optimization algorithm can be fused.
+        VLOG(10) << "Add fuse_adam_op_pass";
+        AppendPass("fuse_adam_op_pass");
+        VLOG(10) << "Add fuse_sgd_op_pass";
+        AppendPass("fuse_sgd_op_pass");
+      }
+    }
+
     // Add a graph viz pass to record a graph.
     if (!strategy.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
       const std::string graph_path = string::Sprintf(
-          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph");
       viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
     }
 
@@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // the de-fact IR, any reuse on Graph is meaningless.
     // A side-effect of that, memory optimize cannot forsee the fetched vars
     // , so fetchlist should be set persistable before call the Run interface.
-    if (strategy.memory_optimize_) {
+    if (strategy_.memory_optimize_) {
       VLOG(10) << "Add memory_optimize_pass";
       AppendPass("memory_optimize_pass");
     }
 
-    AppendMultiDevPass(strategy);
+    AppendMultiDevPass(strategy_);
 
-    if (strategy.fuse_all_reduce_ops_) {
+    if (strategy_.fuse_all_reduce_ops_) {
       // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
       // first, if the number is zero, fuse_all_reduce_ops will do nothing.
       VLOG(10) << "Add fuse_all_reduce_op_pass";
@@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("all_reduce_deps_pass");
     }
 
-    if (SeqOnlyAllReduceOps(strategy)) {
+    if (SeqOnlyAllReduceOps(strategy_)) {
       VLOG(10) << "Add all_reduce_deps_pass";
       AppendPass("all_reduce_deps_pass");
     }
@@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   // Convert graph to run on multi-devices.
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass = nullptr;
-    if (strategy_.is_distribution_) {
+    if (strategy.is_distribution_) {
       VLOG(10) << "Add dist_multi_devices_pass";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
@@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
 #endif
-    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
+    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
+               pass->Type() == "fuse_adam_op_pass" ||
+               pass->Type() == "fuse_sgd_op_pass" ||
+               pass->Type() == "fuse_all_reduce_op_pass") {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
       pass->Erase(kLocalScopes);
       pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                     &local_scopes);
+      if (pass->Type() == "fuse_all_reduce_op_pass") {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+        platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+        pass->Erase(kNCCLCtxs);
+        pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
 #endif
+      }
     } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
@@ -294,4 +318,6 @@ USE_PASS(inplace_pass);
 USE_PASS(lock_free_optimize_pass);
 USE_PASS(alloc_continuous_space_for_grad_pass);
 USE_PASS(graph_to_program_pass);
+USE_PASS(fuse_adam_op_pass);
+USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 9587a6f0f9..85f328b7c4 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -18,7 +18,6 @@
 #include <string>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -76,6 +75,8 @@ struct BuildStrategy {
 
   bool fuse_elewise_add_act_ops_{false};
 
+  bool fuse_all_optimizer_ops_{false};
+
   bool fuse_all_reduce_ops_{false};
 
   bool fuse_relu_depthwise_conv_{false};
diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
new file mode 100644
index 0000000000..0ef75e3192
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
@@ -0,0 +1,199 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_adam_op_pass.h"
+#include <algorithm>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
+
+const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
+  return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
+}
+
+void FuseAdamOpPass::FuseOptimizerOps(
+    const std::unordered_map<std::string, std::vector<std::string>>
+        &aux_var_set,
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
+  FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
+  FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
+               adam_ops, graph);
+  FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
+               adam_ops, graph);
+}
+
+void FuseAdamOpPass::FuseAdamOps(
+    const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
+  PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
+
+  // Check attributions
+  // NOTE: If new attribution is added, the following code maybe need change.
+  int op_role = boost::get<int>(
+      adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+  float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1"));
+  float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2"));
+  float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon"));
+  bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode"));
+  int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
+      adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
+  for (auto &adam_op : adam_ops) {
+    PADDLE_ENFORCE_EQ(beta1,
+                      boost::get<float>(adam_op->Op()->GetAttr("beta1")));
+    PADDLE_ENFORCE_EQ(beta2,
+                      boost::get<float>(adam_op->Op()->GetAttr("beta2")));
+    PADDLE_ENFORCE_EQ(epsilon,
+                      boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
+    PADDLE_ENFORCE_EQ(lazy_mode,
+                      boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
+    PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
+                      boost::get<int64_t>(adam_op->Op()->GetAttr(
+                          "min_row_size_to_use_multithread")));
+    PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr(
+                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
+  }
+
+  // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
+  // node.
+
+  VLOG(10) << "Insert adam to graph ";
+  OpDesc adam_desc(adam_ops[0]->Op()->Block());
+  adam_desc.SetType("adam");
+  adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
+  adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
+  adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
+  adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
+  // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
+  adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
+  adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
+  adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
+
+  adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
+  adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
+  adam_desc.SetAttr("beta1", beta1);
+  adam_desc.SetAttr("beta2", beta2);
+  adam_desc.SetAttr("epsilon", epsilon);
+  adam_desc.SetAttr("lazy_mode", lazy_mode);
+  adam_desc.SetAttr("min_row_size_to_use_multithread",
+                    min_row_size_to_use_multithread);
+  adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
+
+  auto adam_node = graph->CreateOpNode(&adam_desc);
+
+  InserInputAndOutputForOptOps(adam_ops, adam_node);
+}
+
+void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name,
+                                  const std::string &fused_var_name,
+                                  const std::vector<ir::Node *> &adam_ops,
+                                  ir::Graph *graph) const {
+  PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
+  const std::string scale_op_name = "scale";
+
+  // Get the scale_ops of dealing the adam's beta var.
+  std::vector<ir::Node *> scale_ops;
+  scale_ops.reserve(beta_name.size());
+  for (size_t i = 0; i < adam_ops.size(); ++i) {
+    auto &beta_1_pow_name = beta_name[i];
+    auto beta_pow_iter = std::find_if(
+        adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
+        [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool {
+          return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name;
+        });
+    PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
+
+    auto beta_pow_node = *beta_pow_iter;
+    auto scale_op_iter = std::find_if(
+        beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
+        [&scale_op_name](ir::Node *op_node) -> bool {
+          return op_node->Op() && op_node->Op()->Type() == scale_op_name;
+        });
+    PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
+
+    scale_ops.emplace_back(*scale_op_iter);
+  }
+  PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
+
+  // Check attributions
+  // NOTE: If new attribution is added, the following code maybe need change.
+  int op_role = boost::get<int>(
+      scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+  float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
+  float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
+  bool bias_after_scale =
+      boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
+  for (auto &scale_op : scale_ops) {
+    PADDLE_ENFORCE_EQ(scale,
+                      boost::get<float>(scale_op->Op()->GetAttr("scale")));
+    PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias")));
+    PADDLE_ENFORCE_EQ(
+        bias_after_scale,
+        boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
+    PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr(
+                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
+  }
+
+  // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
+  // node.
+
+  VLOG(10) << "Insert fused scale to graph.";
+  OpDesc scale_desc(scale_ops[0]->Op()->Block());
+  scale_desc.SetType("scale");
+  scale_desc.SetInput("X", {fused_var_name});
+  scale_desc.SetOutput("Out", {fused_var_name});
+  scale_desc.SetAttr("scale", scale);
+  scale_desc.SetAttr("bias", bias);
+  scale_desc.SetAttr("bias_after_scale", bias_after_scale);
+  scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
+  auto scale_node = graph->CreateOpNode(&scale_desc);
+
+  for (auto scale_op : scale_ops) {
+    // set inputs
+    scale_node->inputs.insert(scale_node->inputs.begin(),
+                              scale_op->inputs.begin(), scale_op->inputs.end());
+    for (auto &input : scale_op->inputs) {
+      std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
+                   scale_node);
+    }
+    // set outputs
+    scale_node->outputs.insert(scale_node->outputs.begin(),
+                               scale_op->outputs.begin(),
+                               scale_op->outputs.end());
+    for (auto &output : scale_op->outputs) {
+      std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
+                   scale_node);
+    }
+  }
+
+  // Delete scale_ops
+  for (auto &scale_op : scale_ops) {
+    graph->RemoveNode(scale_op);
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes);
diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h
new file mode 100644
index 0000000000..5866c37552
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_adam_op_pass.h
@@ -0,0 +1,55 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class FuseAdamOpPass : public FuseOptimizerOpPass {
+ private:
+  virtual const std::string GetOpType() const;
+
+  virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
+
+  // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow"
+  virtual void FuseOptimizerOps(
+      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
+
+  void FuseAdamOps(
+      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
+
+  void FuseScaleOps(const std::vector<std::string> &aux_var_set,
+                    const std::string &fused_var_name,
+                    const std::vector<ir::Node *> &adam_ops,
+                    ir::Graph *graph) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
new file mode 100644
index 0000000000..b49f095d42
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
@@ -0,0 +1,240 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
+#include <algorithm>
+#include <unordered_set>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
+  ir::Graph &result = *graph;
+
+  auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+  auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+
+  const std::string fuse_op_type = GetOpType();
+  const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+
+  // Step 1: Get the specified op and auxiliary variables.
+  std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
+  std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
+  std::vector<ir::Node *> opt_ops;
+  for (auto &node : topo_nodes) {
+    GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
+                           &aux_var_set);
+  }
+
+  VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
+  if (opt_ops.size() == 0) {
+    return;
+  }
+
+  if (result.Has(kFusedOptType)) {
+    VLOG(10)
+        << "Currently only support fusing one type optimizer op. Has fused "
+        << result.Get<FusedOptType>(kFusedOptType);
+    return;
+  } else {
+    result.Set(kFusedOptType, new FusedOptType);
+  }
+  result.Get<FusedOptType>(kFusedOptType) = fuse_op_type;
+
+  // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
+  // initialized in scopes before execution.
+  if (!result.Has(kFusedVars)) {
+    result.Set(kFusedVars, new FusedVars);
+  }
+  std::unordered_map<std::string, std::string> fused_vars_name;
+  fused_vars_name.reserve(aux_var_names.size() + 1);
+  auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
+  const std::string prefix(kFusedVarNamePrefix);
+  // NOTE: the fused_var_name should be unique.
+  for (auto &var_name : aux_var_names) {
+    auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
+                          aux_var_set[var_name][0];
+    VLOG(10) << fused_var_name;
+    fused_vars_name.emplace(var_name, fused_var_name);
+    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
+    fused_var_set.insert(fused_var_name);
+  }
+
+  // Step 3: Get the fused Gradient's name
+  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+  if (!result.Has(kFusedGrads)) {
+    PADDLE_THROW(
+        "The alloc_continuous_space_for_grad_pass should be called before this "
+        "pass.");
+  }
+  auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
+  auto &fused_vars = result.Get<FusedVars>(kFusedVars);
+  auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
+  PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
+  fused_vars_name.emplace("Grad", fused_grad);
+
+  // Step 4: Sort the parameters and auxiliary variables according
+  // to parameters' name to make variables' name correspond correctly.
+  PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
+  PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
+                    "The size of params_grads and aux_var_set are not equal.");
+  SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
+
+  // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
+  // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
+  InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
+                                    aux_var_set, fused_vars_name);
+
+  // Step 6: Fuse optimizer Ops and Scale Ops
+  FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
+
+  // Step 7: Remove optimizer Ops
+  for (auto &opt_op : opt_ops) {
+    graph->RemoveNode(opt_op);
+  }
+}
+
+void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
+    const std::vector<platform::Place> &places,
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<std::string> &aux_var_names,
+    const std::unordered_map<std::string, std::vector<std::string>>
+        &aux_var_set,
+    const std::unordered_map<std::string, std::string> &fused_vars_name) const {
+  VLOG(10) << "Init FusedVars.";
+  // Alloc parameters and auxiliary vars in the respective scope.
+  size_t idx = local_scopes.size();
+  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
+       ++iter, --idx) {
+    auto &scope = *iter;
+    for (auto &var_name : aux_var_names) {
+      auto fused_var_name = fused_vars_name.at(var_name);
+      VLOG(10) << "Init " << fused_var_name;
+      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                     "%s has exist in scope[%d]", fused_var_name, idx);
+      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+    }
+  }
+
+  ProgramDesc program_desc;
+  auto *global_block = program_desc.MutableBlock(0);
+  for (auto &var_name : aux_var_names) {
+    AppendAllocContinuousSpace(aux_var_set.at(var_name),
+                               fused_vars_name.at(var_name), true,
+                               global_block);
+  }
+
+  for (size_t i = 0; i < local_scopes.size(); ++i) {
+    for (auto &op_desc : global_block->AllOps()) {
+      auto op = OpRegistry::CreateOp(*op_desc);
+      op->Run(*local_scopes[i], places[i]);
+    }
+  }
+}
+
+void FuseOptimizerOpPass::SortParametersAndAuxVars(
+    const std::vector<std::pair<std::string, std::string>> &params_grads,
+    std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
+    std::vector<ir::Node *> *ops) const {
+  PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
+  auto &param_vec = aux_vars_set->at("Param");
+
+  std::vector<size_t> param_sort_idx;
+  param_sort_idx.reserve(param_vec.size());
+
+  for (auto &p_g : params_grads) {
+    auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
+    PADDLE_ENFORCE(iter != param_vec.end());
+    auto idx = std::distance(param_vec.begin(), iter);
+    param_sort_idx.emplace_back(idx);
+  }
+
+  for (auto &aux_vars : *aux_vars_set) {
+    std::vector<std::string> sorted_vars;
+    sorted_vars.reserve(aux_vars.second.size());
+    for (size_t i = 0; i < aux_vars.second.size(); ++i) {
+      sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
+    }
+    std::swap(aux_vars.second, sorted_vars);
+
+    std::stringstream out;
+    for (auto &var_name : aux_vars.second) {
+      out << var_name << " ";
+    }
+    VLOG(10) << aux_vars.first << ": " << out.str();
+  }
+
+  std::vector<ir::Node *> sorted_ops;
+  sorted_ops.reserve(ops->size());
+  for (size_t i = 0; i < ops->size(); ++i) {
+    sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
+  }
+  std::swap(*ops, sorted_ops);
+}
+
+void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
+    const std::string &op_type, const std::vector<std::string> &aux_vars_name,
+    ir::Node *node, std::vector<ir::Node *> *ops,
+    std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
+    const {
+  if (node->Op()->Type() != op_type) return;
+
+  for (auto &var_n : aux_vars_name) {
+    auto arg_names = node->Op()->Input(var_n);
+    PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
+    (*aux_args_name)[var_n].emplace_back(arg_names[0]);
+    VLOG(10) << var_n << ", " << arg_names[0];
+  }
+  ops->emplace_back(node);
+}
+
+void FuseOptimizerOpPass::AppendAllocContinuousSpace(
+    const std::vector<std::string> &args, const std::string &out_arg,
+    bool copy_data, BlockDesc *global_block) const {
+  auto op_desc = global_block->AppendOp();
+  op_desc->SetType("alloc_continuous_space");
+  op_desc->SetInput("Input", args);
+  op_desc->SetOutput("Output", args);
+  op_desc->SetOutput("FusedOutput", {out_arg});
+  op_desc->SetAttr("copy_data", copy_data);
+  op_desc->SetAttr("check_name", true);
+}
+
+void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
+    const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
+  std::unordered_set<ir::Node *> inputs;
+  std::unordered_set<ir::Node *> outputs;
+  for (auto opt_op : opt_ops) {
+    // set inputs
+    inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
+    for (auto &input : opt_op->inputs) {
+      replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node);
+    }
+    // set outputs
+    outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
+    for (auto &output : opt_op->outputs) {
+      replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node);
+    }
+  }
+  opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
+                          inputs.end());
+  opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
+                           outputs.end());
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
new file mode 100644
index 0000000000..0240f1594d
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
@@ -0,0 +1,75 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class FuseOptimizerOpPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+ protected:
+  virtual void SortParametersAndAuxVars(
+      const std::vector<std::pair<std::string, std::string>> &params_grads,
+      std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
+      std::vector<ir::Node *> *ops) const;
+
+  void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
+                                    ir::Node *opt_node) const;
+
+ private:
+  virtual const std::string GetOpType() const = 0;
+
+  virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
+
+  virtual void FuseOptimizerOps(
+      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
+
+  void GetSpecifiedOpsAndVars(
+      const std::string &op_type, const std::vector<std::string> &aux_vars_name,
+      ir::Node *node, std::vector<ir::Node *> *ops,
+      std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
+      const;
+
+  void AppendAllocContinuousSpace(const std::vector<std::string> &args,
+                                  const std::string &out_arg, bool copy_data,
+                                  BlockDesc *global_block) const;
+
+  void InitFusedVarsAndAllocSpaceForVars(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::vector<std::string> &aux_var_names,
+      const std::unordered_map<std::string, std::vector<std::string>>
+          &aux_var_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name)
+      const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
new file mode 100644
index 0000000000..f91c21e3cc
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
@@ -0,0 +1,74 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h"
+#include <algorithm>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
+
+const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
+  return {"Param"};
+}
+
+void FuseSgdOpPass::FuseOptimizerOps(
+    const std::unordered_map<std::string, std::vector<std::string>>
+        &aux_var_set,
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
+  FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph);
+}
+
+void FuseSgdOpPass::FuseSgdOps(
+    const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
+  PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
+
+  // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
+  // node.
+
+  int op_role = boost::get<int>(
+      sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+  VLOG(10) << "Insert sgd to graph ";
+  // Add fused scale
+  OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
+  Sgd_desc.SetType("sgd");
+  Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
+  Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
+  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+
+  // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
+  Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
+
+  // NOTE: multi_devices_pass requires that every op should have a role.
+  Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
+
+  auto sgd_node = graph->CreateOpNode(&Sgd_desc);
+
+  InserInputAndOutputForOptOps(sgd_ops, sgd_node);
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes);
diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h
new file mode 100644
index 0000000000..b3aa6a203b
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class FuseSgdOpPass : public FuseOptimizerOpPass {
+ private:
+  virtual const std::string GetOpType() const;
+
+  virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
+
+  // Fuse Sgd Ops
+  virtual void FuseOptimizerOps(
+      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
+
+  void FuseSgdOps(
+      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index 644cd4e150..a57d670f11 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -24,6 +24,19 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+// Note(zcd): Addresses should be aligned, otherwise, the results may have
+// diff.
+static size_t Alignment(size_t size, const platform::Place &place) {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  size_t alignment = 1 << 12;
+  if (platform::is_gpu_place(place)) {
+    // Allow to allocate the minimum chunk size is 256 B.
+    alignment = 1 << 8;
+  }
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
 
@@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() {
           return grad1.second->data<void>() < grad2.second->data<void>();
         });
 
+    size_t size_of_dtype = framework::SizeOfType(dtype);
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data<void>();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = len * framework::SizeOfType(dtype);
+      auto offset = Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data<void>();
@@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
     const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
     proto::VarType::Type *dtype, int64_t *numel) const {
   *numel = 0;
+  size_t size_of_dtype = 0;
   for (size_t i = 0; i < grad_tensor.size(); ++i) {
-    // Get element number
-    int64_t len = grad_tensor.at(i).second->numel();
-    PADDLE_ENFORCE_GT(len, 0);
-    *numel += len;
-
     // Get dtype
     auto ele_type = grad_tensor.at(i).second->type();
     if (i == 0) {
       *dtype = ele_type;
+      size_of_dtype = framework::SizeOfType(ele_type);
     }
     PADDLE_ENFORCE_EQ(ele_type, *dtype);
+
+    // Get element number
+    int64_t len = grad_tensor.at(i).second->numel();
+    PADDLE_ENFORCE_GT(len, 0);
+    //    Alignment(len)
+    *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 884089df38..611693fc7c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -20,7 +20,6 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -34,6 +33,10 @@ namespace framework {
 class Scope;
 namespace details {
 
+constexpr char kLossVarName[] = "loss_var_name";
+constexpr char kStrategy[] = "strategy";
+constexpr char kNRanks[] = "nranks";
+
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override;
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index ab5e099023..6e6ef074db 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -20,7 +20,6 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/var_handle.h"
 
@@ -41,22 +40,25 @@ namespace details {
 // `std::vector<VarHandle*>` is the version of varaibles.
 typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
-const char kGraphVars[] = "vars";
-
-// aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<VarHandleBase *> GraphDepVars;
-const char kGraphDepVars[] = "dep_vars";
+constexpr char kGraphVars[] = "vars";
 
-constexpr char kNCCLCtxs[] = "nccl_ctxs";
-
-constexpr char kLossVarName[] = "loss_var_name";
 constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
-constexpr char kStrategy[] = "strategy";
-constexpr char kNRanks[] = "nranks";
+constexpr char kNCCLCtxs[] = "nccl_ctxs";
+
+// aux variables to represent dependency. Useful to resolve data hazard.
+typedef std::unordered_set<VarHandleBase *> GraphDepVars;
+constexpr char kGraphDepVars[] = "dep_vars";
 
 typedef std::unordered_set<std::string> FusedVars;
 constexpr char kFusedVars[] = "fused_vars";
+constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
+
+typedef std::string FusedOptType;
+constexpr char kFusedOptType[] = "fused_opt_type";
+
+typedef std::string FusedGrads;
+constexpr char kFusedGrads[] = "fused_gradients";
 
 typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
 constexpr char kParamsAndGrads[] = "params_grads";
@@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupGradsAndParams;
 constexpr char kGroupGradsAndParams[] = "group_grads_params";
 
-constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index ef096c2b81..ea7f8c496a 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
-Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
   check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0,
                     "The start row index must be greater than 0.");
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index a3c1063ce9..0fa76f943e 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -133,7 +133,7 @@ class Tensor {
    * @param[in] end_idx     The index of the end row(exclusive) to slice.
    *                        The index number begins from 0.
    */
-  Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
 
   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc
index df0e9911cf..d4bdecff62 100644
--- a/paddle/fluid/operators/alloc_continuous_space_op.cc
+++ b/paddle/fluid/operators/alloc_continuous_space_op.cc
@@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
     // Get numel and dtype
     size_t numel = 0;
     auto dtype = kDefaultDtype;
-    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype);
+    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
+                       context.GetPlace());
 
     // Alloc the continuous space
     auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
@@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
 
     // Init the continuous space
     auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
-    int64_t offset = 0;
+    size_t offset = 0;
+    size_t size_of_dtype = framework::SizeOfType(dtype);
     if (context.Attr<bool>("copy_data")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
-        int64_t len = out_tensors[i]->numel();
-        auto sub_tensor = fused_tensor->Slice(offset, offset + len);
-        offset += len;
-        framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
+        size_t len = static_cast<size_t>(in_tensors[i]->numel());
+        auto sub_tensor = fused_tensor->Slice(
+            static_cast<int64_t>(offset), static_cast<int64_t>(offset + len));
+        framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);
+
+        offset +=
+            Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
       }
     } else if (context.Attr<bool>("set_constant")) {
       math::SetConstant<DeviceContext, T> set_constant;
@@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
     // Make the outputs point to the continuous space.
     offset = 0;
     for (size_t i = 0; i < out_tensors.size(); ++i) {
-      int64_t len = out_tensors[i]->numel();
+      size_t len = static_cast<size_t>(out_tensors[i]->numel());
       auto dim = out_tensors[i]->dims();
       out_tensors[i]
-          ->ShareDataWith(fused_tensor->Slice(offset, offset + len))
+          ->ShareDataWith(fused_tensor->Slice(
+              static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
+      len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
       offset += len;
       VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i]
                << ") ,dim:(" << dim << ")"
@@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
     }
   }
 
+ private:
+  // Note(zcd): Addresses should be aligned, otherwise, the results may have
+  // diff.
+  size_t Alignment(size_t size, const platform::Place &place) const {
+    // Allow to allocate the minimum chunk size is 4 KB.
+    size_t alignment = 1 << 12;
+    if (platform::is_gpu_place(place)) {
+      // Allow to allocate the minimum chunk size is 256 B.
+      alignment = 1 << 8;
+    }
+    size_t remaining = size % alignment;
+    return remaining == 0 ? size : size + (alignment - remaining);
+  }
+
   void GetMemSizeAndDtype(
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
-      framework::proto::VarType::Type *dtype) const {
+      framework::proto::VarType::Type *dtype,
+      const platform::Place &place) const {
     PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
     *numel = 0;
+    size_t size_of_dtype = 0;
     for (size_t i = 0; i < var_names.size(); ++i) {
       PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
                      var_names[i]);
@@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
                           var_names[i], kDefaultDtype);
         *dtype = p_dtype;
+        size_of_dtype = framework::SizeOfType(p_dtype);
       }
       PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
 
@@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_GT(size, 0);
       VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:("
                << lod_tensors[i]->dims() << ")";
-      *numel += size;
+      *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
+                size_of_dtype;
     }
   }
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f62531c7bb..fa978f1c99 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1282,6 +1282,15 @@ All parameter, weight, gradient are variables in Paddle.
                       it will save GPU memory and may make the execution faster.
                       This options is only available in GPU devices.
                       Default False)DOC")
+      .def_property("fuse_all_optimizer_ops",
+                    [](const BuildStrategy &self) {
+                      return self.fuse_all_optimizer_ops_;
+                    },
+                    [](BuildStrategy &self, bool b) {
+                      PADDLE_ENFORCE(!self.IsFinalized(),
+                                     "BuildStrategy is finlaized.");
+                      self.fuse_all_optimizer_ops_ = b;
+                    })
       .def_property(
           "sync_batch_norm",
           [](const BuildStrategy &self) { return self.sync_batch_norm_; },
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 61fd9af127..18ed02a722 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_ir_memory_optimize=True,
                                   enable_inplace=True,
                                   fuse_elewise_add_act_ops=False,
+                                  fuse_all_optimizer_ops=False,
                                   fuse_all_reduce_ops=False,
                                   fuse_relu_depthwise_conv=False,
                                   optimizer=fluid.optimizer.Adam,
@@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
         build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
         build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
+        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
         build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
         # python memory optimization is conflict with inplace pass.
         # Use ir graph memory optimization after inplace pass is the correct way.
diff --git a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
index 9d5fe114ba..29eb0166b7 100644
--- a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
+++ b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
@@ -16,8 +16,10 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-
 from op_test import OpTest
+from paddle.fluid import core
+
+alignment = 256
 
 
 class TestAllocContinuousSpace(OpTest):
@@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest):
         self.constant = attrs["constant"]
         self.set_constant = attrs["set_constant"]
         self.Inputs = self.init_input()
-        self.FusedOutput = self.init_output(self.Inputs, self.set_constant,
-                                            self.constant)
+        self.Outputs, self.FusedOutput = self.init_output(
+            self.Inputs, self.set_constant, self.constant)
         self.inputs = {'Input': self.Inputs}
         self.attrs = attrs
-        self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput}
+        self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
 
     def init_dtype(self):
         self.dtype = np.float32
@@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest):
         return {"copy_data": True, "set_constant": False, "constant": 0.0}
 
     def init_output(self, input_list, set_constant, constant):
-        inputs = [input[1].flatten() for input in input_list]
-        output = np.concatenate(inputs)
+        inputs = []
+        outputs = input_list
+
+        for input in input_list:
+            length = len(input[1].flatten())
+            aligned_len = (length + alignment) / alignment * alignment
+            out = np.zeros(int(aligned_len))
+            out[0:length] = input[1].flatten()
+            inputs.append(out)
+
+        alloc_continuous_space_var = np.concatenate([input for input in inputs])
         if set_constant:
-            output = np.ones((len(output))) * constant
-        return output
+            alloc_continuous_space_var = np.ones(
+                (len(alloc_continuous_space_var))) * constant
+            outputs = [(out[0],
+                        np.ones(out[1].shape).astype(self.dtype) * constant)
+                       for out in outputs]
+        return outputs, alloc_continuous_space_var
 
     def test_check_output(self):
-        self.check_output()
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(
+                place=core.CUDAPlace(0),
+                no_check_set=["FusedOutput"],
+                atol=1e-5)
 
 
 class TestAllocContinuousSpace2(TestAllocContinuousSpace):
@@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace):
         return {"copy_data": False, "set_constant": True, "constant": 0.5}
 
     def test_check_output(self):
-        self.check_output(no_check_set=["Output"])
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(
+                place=core.CUDAPlace(0),
+                no_check_set=["FusedOutput"],
+                atol=1e-5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
new file mode 100644
index 0000000000..93e67deaf3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+
+def simple_fc_net(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(2):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestFuseAdamOps(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _init_data(self, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare_fused_optimizer_ops(self,
+                                     model,
+                                     use_cuda,
+                                     random_data=True,
+                                     optimizer=fluid.optimizer.Adam):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._init_data(random_data)
+        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_all_optimizer_ops=False,
+            memory_opt=False,  # avoid the gradient's name changed in Python side.
+            optimizer=optimizer)
+        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_all_optimizer_ops=True,
+            memory_opt=False,  # avoid the gradient's name changed in Python side.
+            optimizer=optimizer)
+
+        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(simple_fc_net, True)
+        self._compare_fused_optimizer_ops(simple_fc_net, False)
+
+    def test_batchnorm_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
+        # self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
+
+
+class TestFuseSGDOps(TestFuseAdamOps):
+    def sgd_optimizer(self, learning_rate=1e-4):
+        return fluid.optimizer.SGD(learning_rate=learning_rate)
+
+    def test_simple_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(
+            simple_fc_net, True, optimizer=self.sgd_optimizer)
+        self._compare_fused_optimizer_ops(
+            simple_fc_net, False, optimizer=self.sgd_optimizer)
+
+    def test_batchnorm_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(
+            fc_with_batchnorm, True, optimizer=self.sgd_optimizer)
+        self._compare_fused_optimizer_ops(
+            fc_with_batchnorm, False, optimizer=self.sgd_optimizer)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index ba63213a41..6671a2def3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
             param_attr=fluid.ParamAttr(
                 name=embedding_name, trainable=False)) for x in word_input
     ]
+    # TODO(zcd): if the parameter is not trainable, the
+    #  parameter's gradient should not generated.
+    for emb_layer in emb_layers:
+        emb_layer.stop_gradient = True
+
     emb_layers.append(predicate_embedding)
     emb_layers.append(mark_embedding)
 
@@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase):
         os.environ['CPU_NUM'] = str(4)
         main = fluid.Program()
         startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            word = fluid.layers.data(
-                name='word_data', shape=[1], dtype='int64', lod_level=1)
-            predicate = fluid.layers.data(
-                name='verb_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n2 = fluid.layers.data(
-                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n1 = fluid.layers.data(
-                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_0 = fluid.layers.data(
-                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p1 = fluid.layers.data(
-                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p2 = fluid.layers.data(
-                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-            mark = fluid.layers.data(
-                name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-            feature_out = db_lstm(**locals())
-            target = fluid.layers.data(
-                name='target', shape=[1], dtype='int64', lod_level=1)
-            crf_cost = fluid.layers.linear_chain_crf(
-                input=feature_out,
-                label=target,
-                param_attr=fluid.ParamAttr(
-                    name='crfw', learning_rate=1e-1))
-            avg_cost = fluid.layers.mean(crf_cost)
-
-            sgd_optimizer = fluid.optimizer.SGD(
-                learning_rate=fluid.layers.exponential_decay(
-                    learning_rate=0.01,
-                    decay_steps=100000,
-                    decay_rate=0.5,
-                    staircase=True))
-            sgd_optimizer.minimize(avg_cost)
-
-            train_data = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.conll05.test(), buf_size=8192),
-                batch_size=16)
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-
-            train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=avg_cost.name, build_strategy=build_strategy)
-
-            feeder = fluid.DataFeeder(
-                feed_list=[
-                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                    mark, target
-                ],
-                place=fluid.CPUPlace())
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(main, startup):
+                word = fluid.layers.data(
+                    name='word_data', shape=[1], dtype='int64', lod_level=1)
+                predicate = fluid.layers.data(
+                    name='verb_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_n2 = fluid.layers.data(
+                    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_n1 = fluid.layers.data(
+                    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_0 = fluid.layers.data(
+                    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_p1 = fluid.layers.data(
+                    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_p2 = fluid.layers.data(
+                    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+                mark = fluid.layers.data(
+                    name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+                feature_out = db_lstm(**locals())
+                target = fluid.layers.data(
+                    name='target', shape=[1], dtype='int64', lod_level=1)
+                crf_cost = fluid.layers.linear_chain_crf(
+                    input=feature_out,
+                    label=target,
+                    param_attr=fluid.ParamAttr(
+                        name='crfw', learning_rate=1e-1))
+                avg_cost = fluid.layers.mean(crf_cost)
+
+                sgd_optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.layers.exponential_decay(
+                        learning_rate=0.01,
+                        decay_steps=100000,
+                        decay_rate=0.5,
+                        staircase=True))
+                sgd_optimizer.minimize(avg_cost)
+
+                train_data = paddle.batch(
+                    paddle.reader.shuffle(
+                        paddle.dataset.conll05.test(), buf_size=8192),
+                    batch_size=16)
+
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                exe.run(startup)
+
+                train_cp = compiler.CompiledProgram(main).with_data_parallel(
+                    loss_name=avg_cost.name, build_strategy=build_strategy)
+
+                feeder = fluid.DataFeeder(
+                    feed_list=[
+                        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                        mark, target
+                    ],
+                    place=fluid.CPUPlace())
 
             data = train_data()
             for i in range(10):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index 17f8f5a0b4..d0eca7d6df 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -41,14 +41,15 @@ class TestBase(unittest.TestCase):
                     fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
                 exe.run(startup_prog)
 
-        for _ in six.moves.xrange(iter):
-            exe_strategy = fluid.ExecutionStrategy()
-            exe_strategy._dry_run = True
-            exe_strategy.use_experimental_executor = use_experimental_executor
-            train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
-                loss_name=loss.name, exec_strategy=exe_strategy)
-            for _ in six.moves.xrange(iter_per_pe):
-                exe.run(train_cp)
+                exe_strategy = fluid.ExecutionStrategy()
+                exe_strategy._dry_run = True
+                exe_strategy.use_experimental_executor = use_experimental_executor
+                train_cp = compiler.CompiledProgram(
+                    main_prog).with_data_parallel(
+                        loss_name=loss.name, exec_strategy=exe_strategy)
+                for _ in six.moves.xrange(iter):
+                    for _ in six.moves.xrange(iter_per_pe):
+                        exe.run(train_cp)
 
 
 class TestMNISTDryRun(TestBase):