From 88881955e729596bf916bc8382df8fd8b5bc8e0a Mon Sep 17 00:00:00 2001
From: "liuwei(DLTP)" <liuwei12@baidu.com>
Date: Mon, 14 Jan 2019 10:24:18 +0800
Subject: [PATCH 01/78] fix github issue 15267 test=develop

---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a4787e769f..99e1c2adfd 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8480,7 +8480,7 @@ def shape(input):
 
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('input'))
+        dtype='int32')
     helper.append_op(
         type='shape', inputs={'Input': input}, outputs={'Out': out})
 

From b758fa50b2155121f94b043967eb36ebb0c87cf6 Mon Sep 17 00:00:00 2001
From: hjchen2 <chenhoujiangcug@gmail.com>
Date: Mon, 14 Jan 2019 11:09:27 +0800
Subject: [PATCH 02/78] fix github issue 15267 test=develop

---
 python/paddle/fluid/layers/nn.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a4787e769f..56971cff43 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8479,8 +8479,7 @@ def shape(input):
     """
 
     helper = LayerHelper('shape', **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('input'))
+    out = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
         type='shape', inputs={'Input': input}, outputs={'Out': out})
 

From 413543eb8f9ff6939eee457974034afcb3e08718 Mon Sep 17 00:00:00 2001
From: Wei Liu <liuwei12@baidu.com>
Date: Fri, 18 Jan 2019 09:52:36 +0800
Subject: [PATCH 03/78] print peak memory usage

---
 paddle/fluid/memory/detail/system_allocator.cc | 5 +++++
 paddle/fluid/memory/detail/system_allocator.h  | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 3e8fb83e9d..14dcaf756f 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -117,6 +117,11 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
   if (result == cudaSuccess) {
     *index = 0;
     gpu_alloc_size_ += size;
+    if (gpu_alloc_size_ > s_memoryMap[gpu_id_]) {
+        s_memoryMap[gpu_id_] = gpu_alloc_size_;
+        VLOG(3) << "device: " << gpu_id_
+          << " maximum memory size : " <<(gpu_alloc_size_ >> 20) << " MiB";
+    }
     return p;
   } else {
     LOG(WARNING)
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index a0386a2dad..1ac1df6de7 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>  // for size_t
+#include <unordered_map>
 
 namespace paddle {
 namespace memory {
@@ -44,6 +45,8 @@ class CPUAllocator : public SystemAllocator {
 #ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
+  std::unordered_map<int, uint64_t> s_memoryMap;
+
   explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
 
   virtual void* Alloc(size_t* index, size_t size);

From dde19a0ff8d6f02b9c4e61cc2116025e80e5a6d8 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Thu, 24 Jan 2019 16:00:10 +0800
Subject: [PATCH 04/78] add quantization freeze pass.

---
 paddle/fluid/pybind/ir.cc                     |  11 ++
 python/CMakeLists.txt                         |   1 +
 .../slim/quantization/quantization_pass.py    | 187 +++++++++++++++++-
 .../fluid/contrib/slim/tests/CMakeLists.txt   |   6 +
 .../slim/{unitest => tests}/__init__.py       |   0
 .../{unitest => tests}/configs/config.yaml    |   2 +-
 .../{unitest => tests}/configs/pruners.yaml   |   0
 .../{unitest => tests}/configs/pruners_0.yaml |   0
 .../slim/{unitest => tests}/test_factory.py   |   2 +-
 .../fluid/contrib/slim/tests/test_graph.py    |  80 ++++++++
 .../test_quantization_pass.py                 | 120 +++++++++++
 python/paddle/fluid/framework.py              |  60 +++++-
 12 files changed, 450 insertions(+), 19 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/__init__.py (100%)
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/configs/config.yaml (88%)
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/configs/pruners.yaml (100%)
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/configs/pruners_0.yaml (100%)
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/test_factory.py (95%)
 create mode 100644 python/paddle/fluid/contrib/slim/tests/test_graph.py
 rename python/paddle/fluid/contrib/slim/{unitest => tests}/test_quantization_pass.py (57%)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 24059140ab..9994a231a1 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
@@ -27,6 +28,10 @@ namespace py = pybind11;
 using paddle::framework::ir::Graph;
 using paddle::framework::ir::Node;
 using paddle::framework::ir::GraphSafeRemoveNodes;
+using paddle::framework::ir::HasCircle;
+using paddle::framework::ir::GraphNum;
+using paddle::framework::ir::TopologySortOperations;
+using paddle::framework::ir::BuildOperationAdjList;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::VarDesc;
@@ -36,6 +41,12 @@ namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
   m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
+  m->def("has_circle", HasCircle);
+  m->def("graph_num", GraphNum);
+  m->def("topology_sort", TopologySortOperations,
+         return_value_policy::reference);
+  m->def("build_adjacency_list", BuildOperationAdjList,
+         return_value_policy::reference);
   py::class_<Graph, std::shared_ptr<Graph>>(
       *m, "Graph",
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 59e695e6fc..4cdf96efbd 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -64,6 +64,7 @@ if (WITH_TESTING)
   add_subdirectory(paddle/dataset/tests)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
+  add_subdirectory(paddle/fluid/contrib/slim/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 266a106bc5..ae915dadfb 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import collections
+import numpy as np
 from .... import core
 from ....framework import IrGraph
 from ....framework import Program
@@ -88,10 +89,6 @@ class QuantizationTransformPass(object):
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
-        self._fake_quant_op_types = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
-        ]
-        self._fake_dequant_op_types = ['fake_dequantize_max_abs']
         self._is_test = None
         self._global_step = None
 
@@ -102,17 +99,17 @@ class QuantizationTransformPass(object):
         self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
-        params = [p.name() for p in graph.all_parameters()]
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
 
         def _transform_forward(graph, op):
             for var_node in op.inputs:
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
                 else:
-                    quant_bits = self._weight_bits if var_node.name() in params \
+                    quant_bits = self._weight_bits if var_node.name() in persistable_vars \
                     else self._activation_bits
                     quant_type = self._weight_quantize_type if var_node.name() \
-                        in params else self._activation_quantize_type
+                        in persistable_vars else self._activation_quantize_type
                     quant_var_node, scale_var_node = self._insert_quant_op(
                         graph, var_node, quant_bits, quant_type)
                     dequant_var_node = self._insert_dequant_op(
@@ -316,3 +313,179 @@ class QuantizationTransformPass(object):
         Return the scale name of quantized variable for the input `var_name`.
         """
         return "%s.scale" % (var_name)
+
+
+class QuantizationFreezePass(object):
+    def __init__(self,
+                 scope,
+                 place,
+                 weight_bits=8,
+                 activation_bits=8,
+                 weight_quantize_type='abs_max'):
+        assert scope is not None, \
+            'The scope cannot be set None.'
+        assert place is not None, \
+            'The place cannot be set None.'
+        self._scope = scope
+        self._place = place
+        self._weight_bits = weight_bits
+        self._activation_bits = activation_bits
+        self._weight_quantize_type = weight_quantize_type
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
+        self._op_input_rename_map = collections.OrderedDict()
+        self._op_output_rename_map = collections.OrderedDict()
+        self._var_scale_map = collections.OrderedDict()
+
+    def apply(self, graph):
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        ops = graph.all_ops()
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._fake_quant_op_names:
+                input_arg_name = op_node.op().input('X')[0]
+                if input_arg_name in persistable_vars:
+                    if self._weight_quantize_type == 'abs_max':
+                        param = self._load_var(input_arg_name)
+                        scale_v = np.max(np.abs(param))
+                    else:
+                        scale_v = self._load_var(op_node.op().output('OutScale')
+                                                 [0])[0]
+                    self._var_scale_map[input_arg_name] = scale_v
+                else:
+                    scale_v = graph.var_node(op_node.op().output('OutScale')[0])
+                    self._var_scale_map[input_arg_name] = scale_v
+                if input_arg_name in persistable_vars:
+                    self._remove_fake_quant_and_dequant_op(graph, op_node)
+                    # quantize weight and restore
+                    param_v = self._load_var(input_arg_name)
+                    quantized_param_v = self._quant(param_v, scale_v,
+                                                    self.weight_bits)
+                    self._restore_var(input_arg_name, quantized_param_v)
+
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._fake_dequant_op_names:
+                self._remove_fake_quant_and_dequant_op(graph, op_node)
+
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._quantizable_ops:
+                self._insert_post_dequant_op(graph, op_node)
+
+        for op_node in ops:
+            # insert dequant_op after fc/conv, need to rename inputs of the followed ops
+            for var_node in op_node.inputs:
+                name = var_node.name()
+                if name in self._op_output_rename_map:
+                    old_in = graph.var_node(name)
+                    new_in = graph.var_node(self._op_output_rename_map[name])
+                    graph.update_input_link(old_in, new_in, op_node)
+
+        # remove the unused var node in the graph
+        self._remove_unused_var_nodes(graph)
+
+    def _remove_fake_quant_and_dequant_op(self, graph, op_node):
+        k = op_node.op().output('Out')[0]
+        v = op_node.op().input('X')[0]
+        if v not in self._op_input_rename_map:
+            self._op_input_rename_map[k] = v
+        else:
+            self._op_input_rename_map[k] = self._op_input_rename_map[v]
+        graph.save_remove_nodes(op_node)
+
+    def _insert_post_dequant_op(self, graph, op_node):
+        max_range = None
+        scale_var_node = None
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        for var_node in op_node.op().inputs:
+            name = var_node.name()
+            if name in self._op_input_rename_map:
+                old_in = graph.var_node(name)
+                new_in = graph.var_node(self._op_input_rename_map[name])
+                graph.update_input_link(old_in, new_in, op_node)
+            original_var_name = self._original_var_name(name)
+            if original_var_name in persistable_vars:
+                param_range = (1 << (self._weight_bits - 1)) - 1
+                act_range = (1 << (self._activation_bits - 1)) - 1
+                scale_v = self._var_scale_map[original_var_name]
+                assert self._is_float(
+                    scale_v), 'The scale of parameter %s is not a float.' % (
+                        original_var_name)
+                max_range = param_range * act_range / scale_v
+            else:
+                assert isinstance(scale_v, core.Node)
+                scale_var_node = self._var_scale_map[original_var_name]
+
+        if len(op_node.op().outputs) != 1:
+            raise ValueError("Only support one output, but op %s has"
+                             " more than one output." % (op_node.name()))
+
+        output_var_node = op_node.op().outputs[0]
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(output_var_node.name()),
+            var_type=output_var_node.var().type(),
+            shape=output_var_node.var().shape(),
+            var_dtype=output_var_node.var().dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_dequantize_max_abs',
+            attrs={'max_range': float(max_range)},
+            inputs={'X': output_var_node,
+                    'Scale': scale_var_node},
+            outputs={'Out': dequant_var_node})
+        graph.link_to(output_var_node, dequant_op_node)
+        graph.link_to(scale_var_node, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        self._op_output_rename_map[output_var_node.name(
+        )] = dequant_var_node.name()
+        return dequant_var_node
+
+    def _load_var(self, name):
+        return np.array(self._scope.find_var(name).get_tensor())
+
+    def _restore_var(self, name, arr):
+        t = self._scope.find_var(name).get_tensor()
+        t.set(arr, self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_ops()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_unused_vars = graph.all_vars() - all_used_vars
+        graph.safe_remove_nodes(all_unused_vars)
+
+    def _original_var_name(self, var_name):
+        """
+        Return the original variable name.
+        """
+        if var_name.endswith('.quantized.dequantized'):
+            return var_name[:-len('.quantized.dequantized')]
+        if var_name.endswith('.quantized'):
+            return var_name[:-len('.quantized')]
+        if var_name.endswith('.dequantized'):
+            return var_name[:-len('.dequantized')]
+        if var_name.endswith('.scale'):
+            return var_name[:-len('.scale')]
+        else:
+            return var_name
+
+    def _dequantized_var_name(self, var_name):
+        """
+        Return dequantized variable name for the input `var_name`.
+        """
+        return "%s.dequantized" % (var_name)
+
+    def _is_float(v):
+        return isinstance(v, float) or isinstance(v, np.float32) \
+            or isinstance(v, np.float64)
+
+    def _quant(x, scale, num_bits):
+        return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
new file mode 100644
index 0000000000..79bec8c4ad
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/contrib/slim/unitest/__init__.py b/python/paddle/fluid/contrib/slim/tests/__init__.py
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/__init__.py
rename to python/paddle/fluid/contrib/slim/tests/__init__.py
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
similarity index 88%
rename from python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/config.yaml
index db488b9633..d9b49029d3 100644
--- a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
@@ -1,5 +1,5 @@
 version: 1.0
-include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
+include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
 pruners:
     pruner_1:
         class: 'RatioPruner'
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
similarity index 95%
rename from python/paddle/fluid/contrib/slim/unitest/test_factory.py
rename to python/paddle/fluid/contrib/slim/tests/test_factory.py
index 07f28aac90..2fc72b6475 100644
--- a/python/paddle/fluid/contrib/slim/unitest/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
@@ -18,7 +18,7 @@ import unittest
 
 class TestFactory(unittest.TestCase):
     def test_parse(self):
-        factory = ConfigFactory('./unitest/configs/config.yaml')
+        factory = ConfigFactory('./configs/config.yaml')
 
         pruner = factory.instance('pruner_1')
         self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
new file mode 100644
index 0000000000..75e0c95b5c
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -0,0 +1,80 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+import unittest
+import paddle.fluid as fluid
+import six
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestGraph(unittest.TestCase):
+    def test_graph_functions(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = residual_block(2)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('conv2d') > -1:
+                marked_nodes.add(op)
+        graph.draw('.', 'residual', marked_nodes)
+        self.assertFalse(graph.has_circle())
+        self.assertEqual(graph.graph_num(), 1)
+        nodes = graph.topology_sort()
+        self.assertEqual(len(nodes), len(graph.all_ops()))
+        nodes_map = graph.build_adjacency_list()
+        self.assertEqual(len(nodes_map), len(graph.all_ops()))
+        nodes_num = len(graph.all_nodes())
+        graph.safe_remove_nodes(marked_nodes)
+        self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
similarity index 57%
rename from python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
rename to python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 1bd4b95d6b..9d933b21b7 100644
--- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -65,6 +65,28 @@ def residual_block(num):
     return loss
 
 
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
+
+
 class TestQuantizationTransformPass(unittest.TestCase):
     def setUp(self):
         self.quantizable_op_and_inputs = {
@@ -171,5 +193,103 @@ class TestQuantizationTransformPass(unittest.TestCase):
         self.residual_block_quant('range_abs_max')
 
 
+class TestQuantizeTranspiler(unittest.TestCase):
+    def freeze_graph(self, use_cuda, seed):
+        def build_program(main, startup, is_test):
+            main.random_seed = seed
+            startup.random_seed = seed
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    loss = conv_net(img, label)
+                    if not is_test:
+                        opt = fluid.optimizer.Adam(learning_rate=0.001)
+                        opt.minimize(loss)
+            return [img, label], loss
+
+        random.seed(0)
+        np.random.seed(0)
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        test_program = fluid.Program()
+        feeds, loss = build_program(main, startup, False)
+        build_program(test_program, startup, True)
+        test_program = test_program.clone(for_test=True)
+        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
+        test_graph = IrGraph(core.Graph(test_graph.desc), for_test=True)
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        transform_pass = QuantizationTransformPass(
+            scope=fluid.global_scope(), program_exe=exe)
+        iters = 5
+        batch_size = 8
+        class_num = 10
+        exe.run(startup)
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=500),
+            batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
+
+        with fluid.program_guard(main):
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(program=main,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+
+        with fluid.program_guard(test_program):
+            test_data = next(test_reader())
+            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+                                             test_program)
+            # Testing during training
+            test_loss1, w_quant = exe.run(program=test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[loss, w_var])
+
+            # Freeze program for inference, but the weight of fc/conv is still float type.
+            quant_transpiler.freeze_program(test_program, place)
+            test_loss2, = exe.run(program=test_program,
+                                  feed=feeder.feed(test_data),
+                                  fetch_list=[loss])
+            self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
+                                .get_tensor())
+            # fail: -432.0 != -433.0, this is due to the calculation precision
+            #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+
+            # Convert parameter to 8-bit.
+            quant_transpiler.convert_to_int8(test_program, place)
+            # Save the 8-bit parameter and model file.
+            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
+                                          [loss], exe, test_program)
+            # Test whether the 8-bit parameter and model file can be loaded successfully.
+            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
+                                                                 exe)
+            # Check the loaded 8-bit weight.
+            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
+                              .get_tensor())
+
+            self.assertEqual(w_8bit.dtype, np.int8)
+            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+
+    def not_test_freeze_program_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            with fluid.unique_name.guard():
+                self.freeze_program(True, seed=1)
+
+    def not_test_freeze_program_cpu(self):
+        with fluid.unique_name.guard():
+            self.freeze_program(False, seed=2)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index fc5e471ae3..83203b746c 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1533,20 +1533,47 @@ class IrGraph(object):
     def is_test(self):
         return self._for_test
 
-    def all_parameters(self):
-        param_nodes = set()
-        for node in self.graph.nodes():
-            if node.is_var() and node.var() is not None and node.var(
-            ).persistable():
-                param_nodes.add(node)
-        return param_nodes
+    def all_nodes(self):
+        return {node for node in self.graph.nodes()}
 
     def all_vars(self):
         return {node for node in self.graph.nodes() if node.is_var()}
 
+    def all_persistable_vars(self):
+        persistable_nodes = set()
+        for node in self.graph.nodes():
+            if node.is_var() and node.var() is not None and node.var(
+            ).persistable():
+                persistable_nodes.add(node)
+        return persistable_nodes
+
     def all_ops(self):
         return {node for node in self.graph.nodes() if node.is_op()}
 
+    def var_node(self, name):
+        """
+        Get a variable node by name from this graph.
+        Args:
+            name(str): the name of the variable node.
+        Raises:
+            ValueError: The If input's type is not str, or this graph
+            doesn't have a variable with the giving name.
+        Returns:
+            Node: the variable node with the giving name.
+        """
+        if not isinstance(name, six.string_types):
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
+        target_var_node = None
+        var_nodes = self.all_vars()
+        for var_node in var_nodes:
+            if var_node.name() == name:
+                target_var_node = var_node
+        if target_var_node is None:
+            raise ValueError("var_node %s not in this graph" % name)
+        return target_var_node
+
     def create_param_node(self, name, var_type, shape, var_dtype):
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
@@ -1586,8 +1613,9 @@ class IrGraph(object):
         return self.graph.create_op_node(op_desc)
 
     def update_input_link(self, old_input_node, new_input_node, op_node):
-        assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \
-            op_node in self.graph.nodes(), 'Th three arguments must be in the graph nodes.'
+        assert old_input_node in self.graph.nodes() and new_input_node in \
+        self.graph.nodes() and op_node in self.graph.nodes(), \
+        'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
         old_input_node.outputs_remove(op_node)
         op_node.inputs_remove(old_input_node)
         new_input_node.outputs_append(op_node)
@@ -1596,7 +1624,7 @@ class IrGraph(object):
 
     def link_to(self, node_in, node_out):
         assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
-            'Th two arguments must be in the graph nodes.'
+            'The two arguments(node_in&node_out) must be in the graph nodes.'
         node_in.outputs_append(node_out)
         node_out.inputs_append(node_in)
 
@@ -1605,6 +1633,18 @@ class IrGraph(object):
             remove_nodes = set(remove_nodes)
         core.graph_safe_remove_nodes(self.graph, remove_nodes)
 
+    def has_circle(self):
+        return core.has_circle(self.graph)
+
+    def graph_num(self):
+        return core.graph_num(self.graph)
+
+    def topology_sort(self):
+        return core.topology_sort(self.graph)
+
+    def build_adjacency_list(self):
+        return core.build_adjacency_list(self.graph)
+
     def draw(self, save_path, name, marked_nodes=None):
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'

From c8095eeb82fdd742d704cf4a650a6e21b01da874 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Sat, 26 Jan 2019 00:31:12 +0800
Subject: [PATCH 05/78] add freeze pass, and UT is passed.

---
 paddle/fluid/pybind/ir.cc                     |  41 ++---
 .../slim/quantization/quantization_pass.py    |  39 +++--
 .../slim/tests/test_quantization_pass.py      | 141 +++++++++++-------
 python/paddle/fluid/framework.py              |   6 +-
 4 files changed, 138 insertions(+), 89 deletions(-)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 9994a231a1..b7e7de4ee6 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/ir.h"
+#include <algorithm>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -119,42 +120,42 @@ void BindNode(py::module *m) {
       .def("is_op", &Node::IsOp)
       .def("is_var", &Node::IsVar)
       .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def("clear_inputs", [](Node &self) { self.inputs.clear(); })
       .def("inputs_remove",
            [](Node &self, int node_id) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.inputs.erase(it);
-               }
+             auto pos = std::find_if(
+                 self.inputs.begin(), self.inputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
              }
            })
       .def("inputs_remove",
            [](Node &self, Node &node) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.inputs.erase(it);
-               }
+             auto pos =
+                 std::find(self.inputs.begin(), self.inputs.end(), &node);
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
              }
            })
       .def("inputs_append",
            [](Node &self, Node &node) { self.inputs.push_back(&node); })
+      .def("clear_outputs", [](Node &self) { self.outputs.clear(); })
       .def("outputs_remove",
            [](Node &self, int node_id) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.outputs.erase(it);
-               }
+             auto pos = std::find_if(
+                 self.outputs.begin(), self.outputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
              }
            })
       .def("outputs_remove",
            [](Node &self, Node &node) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.outputs.erase(it);
-               }
+             auto pos =
+                 std::find(self.outputs.begin(), self.outputs.end(), &node);
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
              }
            })
       .def("outputs_append",
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ae915dadfb..ed965aaa0b 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -14,14 +14,14 @@
 
 import collections
 import numpy as np
+from ..... import compat as cpt
 from .... import core
 from ....framework import IrGraph
 from ....framework import Program
-from ....framework import Variable
 from ....initializer import Constant
 from .... import unique_name
 
-__all__ = ['QuantizationTransformPass']
+__all__ = ['QuantizationTransformPass', 'QuantizationFreezePass']
 
 
 class QuantizationTransformPass(object):
@@ -148,8 +148,13 @@ class QuantizationTransformPass(object):
             'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
             init_program = Program()
             for var_desc, initializer in self._need_initialized.iteritems():
-                var = Variable(init_program.global_block())
-                var._set_desc(var_desc)
+                var = init_program.global_block().create_var(
+                    name=var_desc.name(),
+                    shape=var_desc.shape(),
+                    dtype=var_desc.dtype(),
+                    type=var_desc.type(),
+                    lod_level=var_desc.lod_level(),
+                    persistable=var_desc.persistable())
                 initializer(var, init_program.global_block())
             self._program_exe.run(program=init_program, scope=self._scope)
 
@@ -158,7 +163,7 @@ class QuantizationTransformPass(object):
     def _create_global_step(self, graph):
         if self._weight_quantize_type == 'range_abs_max' or \
                 self._activation_quantize_type == 'range_abs_max':
-            counter_name = '@STEP_COUNTER@'
+            counter_name = cpt.to_text('@STEP_COUNTER@')
             for node in graph.all_vars():
                 if node.name() == counter_name:
                     self._global_step = node
@@ -363,14 +368,16 @@ class QuantizationFreezePass(object):
                     # quantize weight and restore
                     param_v = self._load_var(input_arg_name)
                     quantized_param_v = self._quant(param_v, scale_v,
-                                                    self.weight_bits)
+                                                    self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
 
+        ops = graph.all_ops()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._fake_dequant_op_names:
                 self._remove_fake_quant_and_dequant_op(graph, op_node)
 
+        ops = graph.all_ops()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
@@ -382,7 +389,7 @@ class QuantizationFreezePass(object):
                 name = var_node.name()
                 if name in self._op_output_rename_map:
                     old_in = graph.var_node(name)
-                    new_in = graph.var_node(self._op_output_rename_map[name])
+                    new_in = self._op_output_rename_map[name]
                     graph.update_input_link(old_in, new_in, op_node)
 
         # remove the unused var node in the graph
@@ -395,23 +402,24 @@ class QuantizationFreezePass(object):
             self._op_input_rename_map[k] = v
         else:
             self._op_input_rename_map[k] = self._op_input_rename_map[v]
-        graph.save_remove_nodes(op_node)
+        graph.safe_remove_nodes(op_node)
 
     def _insert_post_dequant_op(self, graph, op_node):
         max_range = None
         scale_var_node = None
         persistable_vars = [p.name() for p in graph.all_persistable_vars()]
-        for var_node in op_node.op().inputs:
+        for var_node in op_node.inputs:
             name = var_node.name()
             if name in self._op_input_rename_map:
                 old_in = graph.var_node(name)
                 new_in = graph.var_node(self._op_input_rename_map[name])
+                new_in.clear_outputs()
                 graph.update_input_link(old_in, new_in, op_node)
             original_var_name = self._original_var_name(name)
+            scale_v = self._var_scale_map[original_var_name]
             if original_var_name in persistable_vars:
                 param_range = (1 << (self._weight_bits - 1)) - 1
                 act_range = (1 << (self._activation_bits - 1)) - 1
-                scale_v = self._var_scale_map[original_var_name]
                 assert self._is_float(
                     scale_v), 'The scale of parameter %s is not a float.' % (
                         original_var_name)
@@ -420,11 +428,11 @@ class QuantizationFreezePass(object):
                 assert isinstance(scale_v, core.Node)
                 scale_var_node = self._var_scale_map[original_var_name]
 
-        if len(op_node.op().outputs) != 1:
+        if len(op_node.outputs) != 1:
             raise ValueError("Only support one output, but op %s has"
                              " more than one output." % (op_node.name()))
 
-        output_var_node = op_node.op().outputs[0]
+        output_var_node = op_node.outputs[0]
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.var().type(),
@@ -439,8 +447,7 @@ class QuantizationFreezePass(object):
         graph.link_to(output_var_node, dequant_op_node)
         graph.link_to(scale_var_node, dequant_op_node)
         graph.link_to(dequant_op_node, dequant_var_node)
-        self._op_output_rename_map[output_var_node.name(
-        )] = dequant_var_node.name()
+        self._op_output_rename_map[output_var_node.name()] = dequant_var_node
         return dequant_var_node
 
     def _load_var(self, name):
@@ -483,9 +490,9 @@ class QuantizationFreezePass(object):
         """
         return "%s.dequantized" % (var_name)
 
-    def _is_float(v):
+    def _is_float(self, v):
         return isinstance(v, float) or isinstance(v, np.float32) \
             or isinstance(v, np.float64)
 
-    def _quant(x, scale, num_bits):
+    def _quant(self, x, scale, num_bits):
         return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 9d933b21b7..bb8f51cc8c 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -17,9 +17,11 @@ import random
 import numpy as np
 import paddle.fluid as fluid
 import six
+import paddle
 from paddle.fluid.framework import Program
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
 from paddle.fluid import core
 
 
@@ -148,11 +150,11 @@ class TestQuantizationTransformPass(unittest.TestCase):
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
 
-    def test_linear_fc_quant_abs_max(self):
+    def no_test_linear_fc_quant_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
         self.linear_fc_quant('abs_max')
 
-    def test_linear_fc_quant_range_abs_max(self):
+    def no_test_linear_fc_quant_range_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_range_abs_max'
         self.linear_fc_quant('range_abs_max')
 
@@ -184,17 +186,17 @@ class TestQuantizationTransformPass(unittest.TestCase):
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
 
-    def test_residual_block_abs_max(self):
+    def no_test_residual_block_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
         self.residual_block_quant('abs_max')
 
-    def test_residual_block_range_abs_max(self):
+    def no_test_residual_block_range_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_range_abs_max'
         self.residual_block_quant('range_abs_max')
 
 
-class TestQuantizeTranspiler(unittest.TestCase):
-    def freeze_graph(self, use_cuda, seed):
+class TestQuantizationFreezePass(unittest.TestCase):
+    def freeze_graph(self, use_cuda, seed, quant_type):
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
@@ -220,16 +222,21 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
         main_graph = IrGraph(core.Graph(main.desc), for_test=False)
-        test_graph = IrGraph(core.Graph(test_graph.desc), for_test=True)
+        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup)
         transform_pass = QuantizationTransformPass(
-            scope=fluid.global_scope(), program_exe=exe)
+            scope=scope, program_exe=exe, activation_quantize_type=quant_type)
+        transform_pass.apply(main_graph)
+        transform_pass.apply(test_graph)
+
         iters = 5
         batch_size = 8
-        class_num = 10
-        exe.run(startup)
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
 
         train_reader = paddle.batch(
             paddle.reader.shuffle(
@@ -238,57 +245,87 @@ class TestQuantizeTranspiler(unittest.TestCase):
         test_reader = paddle.batch(
             paddle.dataset.mnist.test(), batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-
-        with fluid.program_guard(main):
+        with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(program=main,
+                loss_v = exe.run(program=main_graph.to_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
+                print('{}: {}'.format(dev_name, loss_v))
+
+        marked_nodes = set()
+        for op in main_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+
+        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        origin_marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                origin_marked_nodes.add(op)
+        test_graph.draw('.', 'test_origin' + dev_name + quant_type,
+                        origin_marked_nodes)
+        freeze_pass.apply(test_graph)
+        freeze_marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                freeze_marked_nodes.add(op)
+        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+                        freeze_marked_nodes)
+
+    # with fluid.program_guard(test_program):
+    #     test_data = next(test_reader())
+    #     w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+    #                                      test_program)
+    #     # Testing during training
+    #     test_loss1, w_quant = exe.run(program=test_program,
+    #                                   feed=feeder.feed(test_data),
+    #                                   fetch_list=[loss, w_var])
+
+    #     # Freeze program for inference, but the weight of fc/conv is still float type.
+    #     quant_transpiler.freeze_program(test_program, place)
+    #     test_loss2, = exe.run(program=test_program,
+    #                           feed=feeder.feed(test_data),
+    #                           fetch_list=[loss])
+    #     self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+    #     w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
+    #                         .get_tensor())
+    #     # fail: -432.0 != -433.0, this is due to the calculation precision
+    #     #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+
+    #     # Convert parameter to 8-bit.
+    #     quant_transpiler.convert_to_int8(test_program, place)
+    #     # Save the 8-bit parameter and model file.
+    #     fluid.io.save_inference_model('model_8bit', ['image', 'label'],
+    #                                   [loss], exe, test_program)
+    #     # Test whether the 8-bit parameter and model file can be loaded successfully.
+    #     [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
+    #                                                          exe)
+    #     # Check the loaded 8-bit weight.
+    #     w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
+    #                       .get_tensor())
+
+    #     self.assertEqual(w_8bit.dtype, np.int8)
+    #     self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+
+    def test_freeze_program_cuda_dynamic(self):
+        if fluid.core.is_compiled_with_cuda():
+            with fluid.unique_name.guard():
+                self.freeze_graph(True, seed=1, quant_type='abs_max')
+
+    def test_freeze_program_cpu_dynamic(self):
+        with fluid.unique_name.guard():
+            self.freeze_graph(False, seed=2, quant_type='abs_max')
 
-        with fluid.program_guard(test_program):
-            test_data = next(test_reader())
-            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
-                                             test_program)
-            # Testing during training
-            test_loss1, w_quant = exe.run(program=test_program,
-                                          feed=feeder.feed(test_data),
-                                          fetch_list=[loss, w_var])
-
-            # Freeze program for inference, but the weight of fc/conv is still float type.
-            quant_transpiler.freeze_program(test_program, place)
-            test_loss2, = exe.run(program=test_program,
-                                  feed=feeder.feed(test_data),
-                                  fetch_list=[loss])
-            self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
-                                .get_tensor())
-            # fail: -432.0 != -433.0, this is due to the calculation precision
-            #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-
-            # Convert parameter to 8-bit.
-            quant_transpiler.convert_to_int8(test_program, place)
-            # Save the 8-bit parameter and model file.
-            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
-                                          [loss], exe, test_program)
-            # Test whether the 8-bit parameter and model file can be loaded successfully.
-            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
-                                                                 exe)
-            # Check the loaded 8-bit weight.
-            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
-                              .get_tensor())
-
-            self.assertEqual(w_8bit.dtype, np.int8)
-            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-
-    def not_test_freeze_program_cuda(self):
+    def test_freeze_program_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
-                self.freeze_program(True, seed=1)
+                self.freeze_graph(True, seed=1, quant_type='range_abs_max')
 
-    def not_test_freeze_program_cpu(self):
+    def test_freeze_program_cpu_static(self):
         with fluid.unique_name.guard():
-            self.freeze_program(False, seed=2)
+            self.freeze_graph(False, seed=2, quant_type='range_abs_max')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 83203b746c..5f121c63f8 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import collections
 from collections import defaultdict
+from collections import Iterable
 import contextlib
 import os
 import re
@@ -1630,7 +1631,10 @@ class IrGraph(object):
 
     def safe_remove_nodes(self, remove_nodes):
         if not isinstance(remove_nodes, set):
-            remove_nodes = set(remove_nodes)
+            if isinstance(remove_nodes, Iterable):
+                remove_nodes = set(remove_nodes)
+            else:
+                remove_nodes = {remove_nodes}
         core.graph_safe_remove_nodes(self.graph, remove_nodes)
 
     def has_circle(self):

From c64f22048a829808b3bfda5d1922d6796aff7e37 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Sat, 26 Jan 2019 15:56:54 +0800
Subject: [PATCH 06/78] add convert_to_int8 pass and transform_for_mobile pass
 and their UTs.

---
 .../slim/quantization/quantization_pass.py    | 106 +++++++++++++-
 .../slim/tests/test_quantization_pass.py      | 135 +++++++++++-------
 .../contrib/tests/test_quantize_transpiler.py |  26 +++-
 3 files changed, 207 insertions(+), 60 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ed965aaa0b..1d0fa6b376 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -21,7 +21,10 @@ from ....framework import Program
 from ....initializer import Constant
 from .... import unique_name
 
-__all__ = ['QuantizationTransformPass', 'QuantizationFreezePass']
+__all__ = [
+    'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass',
+    'TransformForMobilePass'
+]
 
 
 class QuantizationTransformPass(object):
@@ -394,6 +397,7 @@ class QuantizationFreezePass(object):
 
         # remove the unused var node in the graph
         self._remove_unused_var_nodes(graph)
+        return graph
 
     def _remove_fake_quant_and_dequant_op(self, graph, op_node):
         k = op_node.op().output('Out')[0]
@@ -453,9 +457,9 @@ class QuantizationFreezePass(object):
     def _load_var(self, name):
         return np.array(self._scope.find_var(name).get_tensor())
 
-    def _restore_var(self, name, arr):
-        t = self._scope.find_var(name).get_tensor()
-        t.set(arr, self._place)
+    def _restore_var(self, name, array):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array, self._place)
 
     def _remove_unused_var_nodes(self, graph):
         all_used_vars = set()
@@ -496,3 +500,97 @@ class QuantizationFreezePass(object):
 
     def _quant(self, x, scale, num_bits):
         return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+
+
+class ConvertToInt8Pass(object):
+    def __init__(self, scope, place):
+        assert scope is not None, \
+            'The scope cannot be set None.'
+        assert place is not None, \
+            'The place cannot be set None.'
+        self._scope = scope
+        self._place = place
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+
+    def apply(self, graph):
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        ops = graph.all_ops()
+        input_map = {}
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._quantizable_ops:
+                for var_node in op_node.inputs:
+                    name = var_node.name()
+                    if name in persistable_vars:
+                        if name not in input_map:
+                            int8_var_node = self._convert_to_int8(graph,
+                                                                  var_node)
+                            input_map[name] = int8_var_node
+                        graph.update_input_link(var_node, input_map[name],
+                                                op_node)
+
+        # remove the unused var node in the graph
+        self._remove_unused_var_nodes(graph)
+        return graph
+
+    def _convert_to_int8(self, graph, var_node):
+        int8_var_node_name = var_node.name() + ".int8"
+        int8_var_node = graph.create_param_node(
+            name=cpt.to_text(int8_var_node_name),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=core.VarDesc.VarType.INT8)
+        array = self._load_var(var_node.name())
+        self._scope.var(int8_var_node_name)
+        self._store_var(int8_var_node_name, array, np.int8)
+        return int8_var_node
+
+    def _load_var(self, name):
+        return np.array(self._scope.find_var(name).get_tensor())
+
+    def _store_var(self, name, array, dtype):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array.astype(dtype), self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_ops()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_unused_vars = graph.all_vars() - all_used_vars
+        graph.safe_remove_nodes(all_unused_vars)
+
+
+class TransformForMobilePass(object):
+    def __init__(self):
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
+
+    def apply(self, graph):
+        ops = graph.all_ops()
+        for op_node in ops:
+            name = op_node.name()
+            if name in self._fake_quant_op_names:
+                op_node.op().set_type('quantize')
+                quant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, quant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(quant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+            if name in self._fake_dequant_op_names:
+                op_node.op().set_type('dequantize')
+                dequant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, dequant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(dequant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+
+        return graph
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index bb8f51cc8c..a8d7507246 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -18,10 +18,11 @@ import numpy as np
 import paddle.fluid as fluid
 import six
 import paddle
-from paddle.fluid.framework import Program
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
+from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
+from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
 from paddle.fluid import core
 
 
@@ -233,10 +234,22 @@ class TestQuantizationFreezePass(unittest.TestCase):
             scope=scope, program_exe=exe, activation_quantize_type=quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
+        marked_nodes = set()
+        for op in main_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
 
+        quantized_main_program = main_graph.to_program()
+        quantized_test_program = test_graph.to_program()
         iters = 5
         batch_size = 8
-        dev_name = '_gpu_' if use_cuda else '_cpu_'
 
         train_reader = paddle.batch(
             paddle.reader.shuffle(
@@ -248,66 +261,86 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(program=main_graph.to_program(),
+                loss_v = exe.run(program=quantized_main_program,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
-                print('{}: {}'.format(dev_name, loss_v))
+                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
+        test_data = next(test_reader())
+        with fluid.program_guard(quantized_test_program):
+            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+                                             quantized_test_program)
+        # Testing
+        with fluid.scope_guard(scope):
+            test_loss1, w_quant = exe.run(program=quantized_test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[loss, w_var])
+
+        # Freeze graph for inference, but the weight of fc/conv is still float type.
+        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass.apply(test_graph)
         marked_nodes = set()
-        for op in main_graph.all_ops():
+        for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
-        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+                        marked_nodes)
 
-        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
-        origin_marked_nodes = set()
+        server_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            test_loss2, = exe.run(program=server_program,
+                                  feed=feeder.feed(test_data),
+                                  fetch_list=[loss])
+        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+        print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
+        print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
+        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
+        # Maybe failed, this is due to the calculation precision
+        self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
+        print('{}: {}'.format('w_quant' + dev_name + quant_type,
+                              np.sum(w_quant)))
+
+        # Convert parameter to 8-bit.
+        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
+        convert_int8_pass.apply(test_graph)
+        marked_nodes = set()
         for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
-                origin_marked_nodes.add(op)
-        test_graph.draw('.', 'test_origin' + dev_name + quant_type,
-                        origin_marked_nodes)
-        freeze_pass.apply(test_graph)
-        freeze_marked_nodes = set()
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
+        server_program_int8 = test_graph.to_program()
+        # Save the 8-bit parameter and model file.
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          server_program_int8)
+            # Test whether the 8-bit parameter and model file can be loaded successfully.
+            [infer, feed, fetch] = fluid.io.load_inference_model(
+                'server_int8' + dev_name + quant_type, exe)
+        # Check the loaded 8-bit weight.
+        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
+        self.assertEqual(w_8bit.dtype, np.int8)
+        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+        print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
+
+        mobile_pass = TransformForMobilePass()
+        mobile_pass.apply(test_graph)
+        marked_nodes = set()
         for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
-                freeze_marked_nodes.add(op)
-        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
-                        freeze_marked_nodes)
-
-    # with fluid.program_guard(test_program):
-    #     test_data = next(test_reader())
-    #     w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
-    #                                      test_program)
-    #     # Testing during training
-    #     test_loss1, w_quant = exe.run(program=test_program,
-    #                                   feed=feeder.feed(test_data),
-    #                                   fetch_list=[loss, w_var])
-
-    #     # Freeze program for inference, but the weight of fc/conv is still float type.
-    #     quant_transpiler.freeze_program(test_program, place)
-    #     test_loss2, = exe.run(program=test_program,
-    #                           feed=feeder.feed(test_data),
-    #                           fetch_list=[loss])
-    #     self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-    #     w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
-    #                         .get_tensor())
-    #     # fail: -432.0 != -433.0, this is due to the calculation precision
-    #     #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-
-    #     # Convert parameter to 8-bit.
-    #     quant_transpiler.convert_to_int8(test_program, place)
-    #     # Save the 8-bit parameter and model file.
-    #     fluid.io.save_inference_model('model_8bit', ['image', 'label'],
-    #                                   [loss], exe, test_program)
-    #     # Test whether the 8-bit parameter and model file can be loaded successfully.
-    #     [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
-    #                                                          exe)
-    #     # Check the loaded 8-bit weight.
-    #     w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
-    #                       .get_tensor())
-
-    #     self.assertEqual(w_8bit.dtype, np.int8)
-    #     self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
+                        marked_nodes)
+
+        mobile_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          mobile_program)
 
     def test_freeze_program_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 86fa84ad4b..ade2a388f2 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
 
-        quant_transpiler = QuantizeTranspiler()
-        quant_transpiler.training_transpile(main)
-        quant_transpiler.training_transpile(test_program)
+        quant_type = 'abs_max'
+        quant_transpiler = QuantizeTranspiler(
+            activation_quantize_type=quant_type)
+        quant_transpiler.training_transpile(main, startup)
+        quant_transpiler.training_transpile(test_program, startup)
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -223,12 +225,14 @@ class TestQuantizeTranspiler(unittest.TestCase):
             paddle.dataset.mnist.test(), batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
         with fluid.program_guard(main):
             for _ in range(iters):
                 data = next(train_reader())
                 loss_v = exe.run(program=main,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
+                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         with fluid.program_guard(test_program):
             test_data = next(test_reader())
@@ -245,11 +249,19 @@ class TestQuantizeTranspiler(unittest.TestCase):
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
             self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+            print('{}: {}'.format('test_loss1' + dev_name + quant_type,
+                                  test_loss1))
+            print('{}: {}'.format('test_loss2' + dev_name + quant_type,
+                                  test_loss2))
             w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                 .get_tensor())
             # fail: -432.0 != -433.0, this is due to the calculation precision
             #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
 
+            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                                  np.sum(w_freeze)))
+            print('{}: {}'.format('w_quant' + dev_name + quant_type,
+                                  np.sum(w_quant)))
             # Convert parameter to 8-bit.
             quant_transpiler.convert_to_int8(test_program, place)
             # Save the 8-bit parameter and model file.
@@ -264,13 +276,17 @@ class TestQuantizeTranspiler(unittest.TestCase):
 
             self.assertEqual(w_8bit.dtype, np.int8)
             self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+            print('{}: {}'.format('w_8bit' + dev_name + quant_type,
+                                  np.sum(w_8bit)))
+            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                                  np.sum(w_freeze)))
 
-    def not_test_freeze_program_cuda(self):
+    def test_freeze_program_cuda(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_program(True, seed=1)
 
-    def not_test_freeze_program_cpu(self):
+    def test_freeze_program_cpu(self):
         with fluid.unique_name.guard():
             self.freeze_program(False, seed=2)
 

From c67b29c178f46db9d37234993729f29e216824bf Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Sat, 26 Jan 2019 19:46:02 +0800
Subject: [PATCH 07/78] fix some bugs of graph.to_program and get_pass.

---
 paddle/fluid/pybind/ir.cc                             |  6 ------
 paddle/fluid/pybind/pybind.cc                         | 11 ++++-------
 .../contrib/slim/tests/test_quantization_pass.py      |  4 ++--
 .../fluid/contrib/tests/test_quantize_transpiler.py   |  2 +-
 python/paddle/fluid/framework.py                      |  4 ++--
 5 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index b7e7de4ee6..1cd1be8e8d 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -58,7 +58,6 @@ void BindGraph(py::module *m) {
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
-      .def("get_program", &Graph::Get<ProgramDesc>)
       .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
@@ -75,11 +74,6 @@ void BindGraph(py::module *m) {
            [](Graph &self, const std::string &attr_name, double attr) {
              return self.Set(attr_name, new double(attr));
            })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
-           })
       .def("set",
            [](Graph &self, const std::string &attr_name,
               const std::unordered_set<const Node *> &attr) {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c470483756..e63a3b6871 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -788,8 +788,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-  m.def("get_pass", [](const py::bytes &binary_str) {
-    std::string pass_type(binary_str);
+  m.def("get_pass", [](const std::string &pass_type) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
     return std::shared_ptr<framework::ir::Pass>(std::move(pass));
   });
@@ -797,10 +796,9 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
       .def("has", &ir::Pass::Has)
-      .def("set",
-           [](ir::Pass &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
+      .def("set_not_owned",
+           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
+             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
            })
       .def(
           "set",
@@ -809,7 +807,6 @@ All parameter, weight, gradient are variables in Paddle.
           })
       .def("set", [](ir::Pass &self, const std::string &name,
                      int val) { self.Set<const int>(name, new int(val)); })
-      .def("get_program", &ir::Pass::Get<ProgramDesc>)
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
         std::unique_ptr<ir::Graph> origin_graph(graph.get());
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index a8d7507246..845db3ebb8 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -248,8 +248,8 @@ class TestQuantizationFreezePass(unittest.TestCase):
 
         quantized_main_program = main_graph.to_program()
         quantized_test_program = test_graph.to_program()
-        iters = 5
-        batch_size = 8
+        iters = 10
+        batch_size = 128
 
         train_reader = paddle.batch(
             paddle.reader.shuffle(
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index ade2a388f2..8d2bd79e04 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -204,7 +204,7 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
 
-        quant_type = 'abs_max'
+        quant_type = 'range_abs_max'
         quant_transpiler = QuantizeTranspiler(
             activation_quantize_type=quant_type)
         quant_transpiler.training_transpile(main, startup)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 5f121c63f8..1b4b7f18e2 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1683,9 +1683,9 @@ class IrGraph(object):
 
     def to_program(self):
         convert_pass = core.get_pass('graph_to_program_pass')
-        convert_pass.set('program', Program().desc)
+        desc = core.ProgramDesc()
+        convert_pass.set_not_owned('program', desc)
         convert_pass.apply(self.graph)
-        desc = convert_pass.get_program('program')
         program = Program._construct_from_desc(desc)
         return program
 

From 0db41a9c444db2cef56a32ff608d7a57aaa5fb0c Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Mon, 28 Jan 2019 19:26:02 +0800
Subject: [PATCH 08/78] add op_role attr when creating op node.

---
 .../slim/quantization/quantization_pass.py    | 25 +++++++++++++++----
 .../slim/tests/test_quantization_pass.py      | 13 +++++++---
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 1d0fa6b376..8567b2f396 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -180,9 +180,14 @@ class QuantizationTransformPass(object):
                     Constant(value=0, force_cpu=True)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
+                # The attribute of `op_role` is needed by ParallelExecutor.
                 increment_op = graph.create_op_node(
                     op_type='increment',
-                    attrs={'step': 1.0},
+                    attrs={
+                        'step': 1.0,
+                        'op_role':
+                        core.op_proto_and_checker_maker.OpRole.Forward
+                    },
                     inputs={'X': global_step_in},
                     outputs={'Out': global_step_out})
                 graph.link_to(global_step_in, increment_op)
@@ -217,7 +222,10 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.var().dtype())
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
-            attrs={'bit_length': quant_bits},
+            attrs={
+                'bit_length': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
             inputs={'X': var_node},
             outputs={'Out': quant_var_node,
                      'OutScale': scale_var_node})
@@ -262,7 +270,8 @@ class QuantizationTransformPass(object):
         attrs = {
             'window_size': self._window_size,
             'bit_length': quant_bits,
-            'is_test': self._is_test
+            'is_test': self._is_test,
+            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
         }
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_range_abs_max',
@@ -295,7 +304,10 @@ class QuantizationTransformPass(object):
         max_range = (1 << (quant_bits - 1)) - 1
         dequant_op_node = graph.create_op_node(
             op_type='fake_dequantize_max_abs',
-            attrs={'max_range': float(max_range)},
+            attrs={
+                'max_range': float(max_range),
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
             inputs={'X': var_node,
                     'Scale': scale_var_node},
             outputs={'Out': dequant_var_node})
@@ -444,7 +456,10 @@ class QuantizationFreezePass(object):
             var_dtype=output_var_node.var().dtype())
         dequant_op_node = graph.create_op_node(
             op_type='fake_dequantize_max_abs',
-            attrs={'max_range': float(max_range)},
+            attrs={
+                'max_range': float(max_range),
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
             inputs={'X': output_var_node,
                     'Scale': scale_var_node},
             outputs={'Out': dequant_var_node})
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 845db3ebb8..cdd5b68803 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -251,6 +251,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
         iters = 10
         batch_size = 128
 
+        train_exe = fluid.ParallelExecutor(
+            main_program=quantized_main_program,
+            use_cuda=bool(use_cuda),
+            loss_name=loss.name,
+            scope=scope)
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=500),
@@ -261,9 +266,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(program=quantized_main_program,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss])
+                #loss_v = exe.run(program=quantized_main_program,
+                #                 feed=feeder.feed(data),
+                #                 fetch_list=[loss])
+                loss_v = train_exe.run(feed=feeder.feed(data),
+                                       fetch_list=[loss.name])
                 print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         test_data = next(test_reader())

From a7efab7ec103c97fb86b2f8aace12bc185b6a21a Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Wed, 30 Jan 2019 23:30:19 +0800
Subject: [PATCH 09/78] add comments for public API. test=develop

---
 .../slim/quantization/quantization_pass.py    |  66 +++++++
 .../slim/tests/test_quantization_pass.py      |  26 +--
 python/paddle/fluid/framework.py              | 173 +++++++++++++++++-
 3 files changed, 242 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 8567b2f396..216c3601fe 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -39,7 +39,13 @@ class QuantizationTransformPass(object):
         """
         Convert and rewrite the IrGraph according to weight and
         activation quantization type.
+
         Args:
+            scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
+            type, this pass will create some new parameters. The scope is used to
+            initialize these new parameters.
+            program_exe(fluid.Executor): program_exe is used to initialize new
+            parameters described above.
             weight_bits (int): quantization bit number for weights,
                 the bias is not quantized.
             activation_bits (int): quantization bit number for activation.
@@ -53,6 +59,7 @@ class QuantizationTransformPass(object):
                 support 'abs_max'. The 'range_abs_max' usually is not used for
                 weight, since weights are fixed once the model is well trained.
             window_size (int): the window size for 'range_abs_max' quantization.
+
         Examples:
         .. code-block:: python
             # The original graph will be rewrite.
@@ -96,6 +103,14 @@ class QuantizationTransformPass(object):
         self._global_step = None
 
     def apply(self, graph):
+        """
+        Quantize the graph for training process. According to weight and
+        activation quantization type, the graph will be added some fake
+        quantize operators and fake dequantize operators.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
         self._need_initialized.clear()
@@ -336,6 +351,23 @@ class QuantizationTransformPass(object):
 
 
 class QuantizationFreezePass(object):
+    """
+    The freeze pass is used to adjust the quantize operator order, for example:
+        1) `activation -> quant -> dequant -> conv2d` will be freezed into
+        `activation -> quant -> conv2d -> dequant`
+        2) `weight -> quant -> dequant -> conv2d` will be freezed into `weight -> conv2d`,
+        and weight will be sacled offline.
+
+    Args:
+        scope(fluid.Scope): scope is used to get the weight tensor values.
+        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
+        weight_bits (int): quantization bit number for weights.
+        activation_bits (int): quantization bit number for activation.
+        weight_quantize_type (str): quantization type for weights, support 'abs_max'.
+        The 'range_abs_max' usually is not used for weight, since weights are fixed once the
+        model is well trained.
+    """
+
     def __init__(self,
                  scope,
                  place,
@@ -361,6 +393,12 @@ class QuantizationFreezePass(object):
         self._var_scale_map = collections.OrderedDict()
 
     def apply(self, graph):
+        """
+        Adjust quantize/dequantize operators order for the inference process.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
         persistable_vars = [p.name() for p in graph.all_persistable_vars()]
         ops = graph.all_ops()
         for op_node in ops:
@@ -518,6 +556,15 @@ class QuantizationFreezePass(object):
 
 
 class ConvertToInt8Pass(object):
+    """
+    Convert the weights into int8_t type.
+
+    Args:
+        scope(fluid.Scope): scope is used to get the weight tensor values.
+        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the
+        8bits weight tensors.
+    """
+
     def __init__(self, scope, place):
         assert scope is not None, \
             'The scope cannot be set None.'
@@ -528,6 +575,13 @@ class ConvertToInt8Pass(object):
         self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
 
     def apply(self, graph):
+        """
+        Convert weights' tpye of the graph. After that, the data type of the
+        graph weigths is int8_t.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
         persistable_vars = [p.name() for p in graph.all_persistable_vars()]
         ops = graph.all_ops()
         input_map = {}
@@ -581,6 +635,10 @@ class ConvertToInt8Pass(object):
 
 
 class TransformForMobilePass(object):
+    """
+    This pass is used to convert the freezed graph for paddle-mobile execution.
+    """
+
     def __init__(self):
         self._fake_quant_op_names = [
             'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
@@ -588,6 +646,14 @@ class TransformForMobilePass(object):
         self._fake_dequant_op_names = ['fake_dequantize_max_abs']
 
     def apply(self, graph):
+        """
+        Because paddle-mobile use `quantize` an `dequantize` as the names of
+        quantize operator and dequantize operator, the `apply` function just
+        realize this logic.
+
+        Args:
+            graph(IrGraph): the graph will be transformed.
+        """
         ops = graph.all_ops()
         for op_node in ops:
             name = op_node.name()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index cdd5b68803..d988edf135 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -248,8 +248,8 @@ class TestQuantizationFreezePass(unittest.TestCase):
 
         quantized_main_program = main_graph.to_program()
         quantized_test_program = test_graph.to_program()
-        iters = 10
-        batch_size = 128
+        iters = 5
+        batch_size = 16
 
         train_exe = fluid.ParallelExecutor(
             main_program=quantized_main_program,
@@ -271,7 +271,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
                 #                 fetch_list=[loss])
                 loss_v = train_exe.run(feed=feeder.feed(data),
                                        fetch_list=[loss.name])
-                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
+                #print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         test_data = next(test_reader())
         with fluid.program_guard(quantized_test_program):
@@ -299,15 +299,15 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
         self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-        print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
-        print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
+        #print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
+        #print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
         w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
         # Maybe failed, this is due to the calculation precision
-        self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                              np.sum(w_freeze)))
-        print('{}: {}'.format('w_quant' + dev_name + quant_type,
-                              np.sum(w_quant)))
+        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+        #                      np.sum(w_freeze)))
+        #print('{}: {}'.format('w_quant' + dev_name + quant_type,
+        #                      np.sum(w_quant)))
 
         # Convert parameter to 8-bit.
         convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
@@ -330,9 +330,9 @@ class TestQuantizationFreezePass(unittest.TestCase):
         w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
         self.assertEqual(w_8bit.dtype, np.int8)
         self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-        print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
-        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                              np.sum(w_freeze)))
+        #print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
+        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+        #                      np.sum(w_freeze)))
 
         mobile_pass = TransformForMobilePass()
         mobile_pass.apply(test_graph)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 1b4b7f18e2..1a0a69b5c4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1516,12 +1516,16 @@ class Block(object):
 
 class IrGraph(object):
     """
-    IrGraph uses core.Graph as the delegation to accomplish the manipulation.
+    Python IrGraph. Beneath it is a core.Graph, which is used for
+    create a c++ Ir Pass Graph. An IrGraph is just a graph view of
+    a Program. In an IrGraph, both Variables and Operators are graph
+    nodes.
     """
 
     def __init__(self, graph, for_test=False):
         """
-        Construct the IrGraph using core.Graph.
+        Construct an IrGraph using core.Graph.
+
         Args:
             graph(core.Graph): C++ Graph.
             for_test(bool): True for the test graph and false for the train graph.
@@ -1532,15 +1536,27 @@ class IrGraph(object):
         self._for_test = for_test
 
     def is_test(self):
+        """
+        If the graph is used for testing, the function returns true. Otherwise, returns false.
+        """
         return self._for_test
 
     def all_nodes(self):
+        """
+        Return all nodes included in the graph as a set.
+        """
         return {node for node in self.graph.nodes()}
 
     def all_vars(self):
+        """
+        Return all variable nodes included in the graph as a set.
+        """
         return {node for node in self.graph.nodes() if node.is_var()}
 
     def all_persistable_vars(self):
+        """
+        Return all persistable variable nodes included in the graph as a set.
+        """
         persistable_nodes = set()
         for node in self.graph.nodes():
             if node.is_var() and node.var() is not None and node.var(
@@ -1549,18 +1565,24 @@ class IrGraph(object):
         return persistable_nodes
 
     def all_ops(self):
+        """
+        Return all operator nodes included in the graph as a set.
+        """
         return {node for node in self.graph.nodes() if node.is_op()}
 
     def var_node(self, name):
         """
-        Get a variable node by name from this graph.
+        Get a variable node by name from the graph.
+
         Args:
             name(str): the name of the variable node.
+
         Raises:
             ValueError: The If input's type is not str, or this graph
             doesn't have a variable with the giving name.
+
         Returns:
-            Node: the variable node with the giving name.
+            core.Node: the variable node with the giving name.
         """
         if not isinstance(name, six.string_types):
             raise TypeError(
@@ -1576,6 +1598,19 @@ class IrGraph(object):
         return target_var_node
 
     def create_param_node(self, name, var_type, shape, var_dtype):
+        """
+        Create a persistable variable node in the graph. In IrGraph,
+        it can not distinguish between persistable variables and parameters.
+
+        Args:
+            name(str): the name of the persistable variable node.
+            vart_type(core.VarDesc.VarType): the type of the persistable variable node.
+            shape(list): the shape of the persistable variable node.
+            var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
+
+        Returns:
+            core.Node: the created persistable variable node.
+        """
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
@@ -1584,6 +1619,20 @@ class IrGraph(object):
         return self.graph.create_var_node(var_desc)
 
     def create_var_node(self, name, var_type, shape, var_dtype):
+        """
+        Create a variable node in the graph. The created variable node is
+        not persistable.
+
+        Args:
+            name(str): the name of the variable node.
+            vart_type(core.VarDesc.VarType): the type of the variable node.
+            shape(list): the shape of the variable node.
+            var_dtype(core.VarDesc.VarType): the data type of the variable node.
+
+        Returns:
+            core.Node: the created variable node.
+        """
+
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
@@ -1591,9 +1640,31 @@ class IrGraph(object):
         return self.graph.create_var_node(var_desc)
 
     def create_var_node_from_desc(self, var_desc):
+        """
+        Create a variable node by using an existing VarDesc in the graph.
+        Depend on the giving VarDesc, the created variable node may be persistable.
+
+        Args:
+            var_desc(core.VarDesc): the giving variable description.
+
+        Returns:
+            core.Node: the created variable node.
+        """
         return self.graph.create_var_node(var_desc)
 
     def create_op_node(self, op_type, attrs, inputs, outputs):
+        """
+        Create a operator node in the graph.
+
+        Args:
+            op_type(str): the type of the operator node.
+            attrs(dict): the attributes of the operator node.
+            inputs(dict): the inputs of the operator node.
+            outputs(dict): the outpus of the operator node.
+
+        Returns:
+            core.Node: the created operator node.
+        """
         op_desc = core.OpDesc()
         op_desc.set_type(op_type)
         for attr, value in attrs.iteritems():
@@ -1611,9 +1682,26 @@ class IrGraph(object):
         return self.graph.create_op_node(op_desc)
 
     def create_op_node_from_desc(self, op_desc):
+        """
+        Create a operator node by using an existing OpDesc in the graph.
+
+        Args:
+            op_desc(core.VarDesc): the giving operator description.
+
+        Returns:
+            core.Node: the created operator node.
+        """
         return self.graph.create_op_node(op_desc)
 
     def update_input_link(self, old_input_node, new_input_node, op_node):
+        """
+        Update the input's link of a operator node.
+
+        Args:
+            old_input_node(core.Node): the old input node of the giving op_node.
+            new_input_node(core.Node): the new input node of the giving op_node.
+            op_node(core.Node): the operator node that is needed to update input's link.
+        """
         assert old_input_node in self.graph.nodes() and new_input_node in \
         self.graph.nodes() and op_node in self.graph.nodes(), \
         'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
@@ -1624,12 +1712,26 @@ class IrGraph(object):
         op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
 
     def link_to(self, node_in, node_out):
+        """
+        Connect two nodes.
+
+        Args:
+            node_in(core.Node): the input node.
+            node_out(core.Node): the output node.
+        """
         assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
             'The two arguments(node_in&node_out) must be in the graph nodes.'
         node_in.outputs_append(node_out)
         node_out.inputs_append(node_in)
 
     def safe_remove_nodes(self, remove_nodes):
+        """
+        Remove nodes safely since links connected to these removed nodes are
+        also removed.
+
+        Args:
+            remove_nodes(set): the nodes prepared to be removed.
+        """
         if not isinstance(remove_nodes, set):
             if isinstance(remove_nodes, Iterable):
                 remove_nodes = set(remove_nodes)
@@ -1638,18 +1740,57 @@ class IrGraph(object):
         core.graph_safe_remove_nodes(self.graph, remove_nodes)
 
     def has_circle(self):
+        """
+        Check if the graph has a circle.
+
+        Returns:
+            bool: True if the graph has a circle else False.
+        """
         return core.has_circle(self.graph)
 
     def graph_num(self):
+        """
+        Count the number of unconnected graphs in this graph.
+
+        Returns:
+            int: the number of unconnected graphs.
+        """
         return core.graph_num(self.graph)
 
     def topology_sort(self):
+        """
+        Perform the topology sort operation on the graph.
+
+        Notes: the `graph` cannot contain a circle.
+
+        Returns:
+            set(core.Node): nodes in topology order.
+        """
         return core.topology_sort(self.graph)
 
     def build_adjacency_list(self):
+        """
+        Build an adjacency list of operations for the `graph`.
+
+        Returns:
+            dict{core.Node: set(core.Node)}: the adjacency list.
+        """
         return core.build_adjacency_list(self.graph)
 
-    def draw(self, save_path, name, marked_nodes=None):
+    def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
+        """
+        Draw the graph. If `dot` command is installed, the drawn graph
+        will be saved as pdf file type, otherwise dot file type is used.
+
+        Args:
+            save_path(str): the save path of drawn graph.
+            name(str): the name of drawn graph.
+            marked_nodes(set(core.Node)): nodes that are needed to be marked.
+            Default value is None.
+            remove_ctr_var(bool): If it is set True, all control variable nodes
+            in the graph will be removed. Default value is True.
+        """
+
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
             exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
@@ -1659,15 +1800,17 @@ class IrGraph(object):
                 print('The {} is saved as the dot filetype.'.format(
                     dot_file_path))
 
-        remove_ctr_vars = set()
+        if remove_ctr_var:
+            remove_ctr_vars = set()
+            for node in self.graph.nodes():
+                if node.is_ctrl_var():
+                    remove_ctr_vars.add(node)
+            self.safe_remove_nodes(remove_ctr_vars)
         ops_num = 0
         for node in self.graph.nodes():
-            if node.is_ctrl_var():
-                remove_ctr_vars.add(node)
-            elif node.is_op():
+            if node.is_op():
                 ops_num += 1
         print('Total ops num = {}.'.format(ops_num))
-        self.safe_remove_nodes(remove_ctr_vars)
         if marked_nodes is not None:
             if not isinstance(marked_nodes, set):
                 marked_nodes = set(marked_nodes)
@@ -1682,6 +1825,16 @@ class IrGraph(object):
         _convert_to_pdf(viz_dot_path)
 
     def to_program(self):
+        """
+        Convert the graph into a Program.
+
+        Notes: When the graph includes backward operator nodes, the
+        conversion process may be failed. Usually, this function is
+        only used to convert a test graph.
+
+        Returns:
+            Program: a program converted from the graph.
+        """
         convert_pass = core.get_pass('graph_to_program_pass')
         desc = core.ProgramDesc()
         convert_pass.set_not_owned('program', desc)

From 28dfad5e27c01311d7fe49d20a97dd6ebc2d3187 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Thu, 31 Jan 2019 13:31:10 +0800
Subject: [PATCH 10/78] fix some bugs about python3. test=develop

---
 .../slim/quantization/quantization_pass.py    |  3 +-
 .../slim/tests/test_quantization_pass.py      | 38 +++++++++----------
 .../contrib/tests/test_quantize_transpiler.py | 20 ++--------
 python/paddle/fluid/framework.py              |  6 +--
 4 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 216c3601fe..18b58e6f38 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -14,6 +14,7 @@
 
 import collections
 import numpy as np
+import six
 from ..... import compat as cpt
 from .... import core
 from ....framework import IrGraph
@@ -165,7 +166,7 @@ class QuantizationTransformPass(object):
             assert self._program_exe is not None, \
             'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
             init_program = Program()
-            for var_desc, initializer in self._need_initialized.iteritems():
+            for var_desc, initializer in six.iteritems(self._need_initialized):
                 var = init_program.global_block().create_var(
                     name=var_desc.name(),
                     shape=var_desc.shape(),
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index d988edf135..2f291132f3 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -151,11 +151,11 @@ class TestQuantizationTransformPass(unittest.TestCase):
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
 
-    def no_test_linear_fc_quant_abs_max(self):
+    def test_linear_fc_quant_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
         self.linear_fc_quant('abs_max')
 
-    def no_test_linear_fc_quant_range_abs_max(self):
+    def test_linear_fc_quant_range_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_range_abs_max'
         self.linear_fc_quant('range_abs_max')
 
@@ -187,11 +187,11 @@ class TestQuantizationTransformPass(unittest.TestCase):
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
 
-    def no_test_residual_block_abs_max(self):
+    def test_residual_block_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
         self.residual_block_quant('abs_max')
 
-    def no_test_residual_block_range_abs_max(self):
+    def test_residual_block_range_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_range_abs_max'
         self.residual_block_quant('range_abs_max')
 
@@ -249,13 +249,13 @@ class TestQuantizationFreezePass(unittest.TestCase):
         quantized_main_program = main_graph.to_program()
         quantized_test_program = test_graph.to_program()
         iters = 5
-        batch_size = 16
+        batch_size = 8
 
-        train_exe = fluid.ParallelExecutor(
-            main_program=quantized_main_program,
-            use_cuda=bool(use_cuda),
-            loss_name=loss.name,
-            scope=scope)
+        #train_exe = fluid.ParallelExecutor(
+        #    main_program=quantized_main_program,
+        #    use_cuda=bool(use_cuda),
+        #    loss_name=loss.name,
+        #    scope=scope)
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=500),
@@ -266,11 +266,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                #loss_v = exe.run(program=quantized_main_program,
-                #                 feed=feeder.feed(data),
-                #                 fetch_list=[loss])
-                loss_v = train_exe.run(feed=feeder.feed(data),
-                                       fetch_list=[loss.name])
+                loss_v = exe.run(program=quantized_main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                #loss_v = train_exe.run(feed=feeder.feed(data),
+                #                       fetch_list=[loss.name])
                 #print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         test_data = next(test_reader())
@@ -349,21 +349,21 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                           ['image', 'label'], [loss], exe,
                                           mobile_program)
 
-    def test_freeze_program_cuda_dynamic(self):
+    def test_freeze_graph_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(True, seed=1, quant_type='abs_max')
 
-    def test_freeze_program_cpu_dynamic(self):
+    def test_freeze_graph_cpu_dynamic(self):
         with fluid.unique_name.guard():
             self.freeze_graph(False, seed=2, quant_type='abs_max')
 
-    def test_freeze_program_cuda_static(self):
+    def test_freeze_graph_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(True, seed=1, quant_type='range_abs_max')
 
-    def test_freeze_program_cpu_static(self):
+    def test_freeze_graph_cpu_static(self):
         with fluid.unique_name.guard():
             self.freeze_graph(False, seed=2, quant_type='range_abs_max')
 
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 8d2bd79e04..77fdf0087b 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -204,7 +204,7 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
 
-        quant_type = 'range_abs_max'
+        quant_type = 'range_abs_max'  # 'range_abs_max' or 'abs_max'
         quant_transpiler = QuantizeTranspiler(
             activation_quantize_type=quant_type)
         quant_transpiler.training_transpile(main, startup)
@@ -225,14 +225,12 @@ class TestQuantizeTranspiler(unittest.TestCase):
             paddle.dataset.mnist.test(), batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
-        dev_name = '_gpu_' if use_cuda else '_cpu_'
         with fluid.program_guard(main):
             for _ in range(iters):
                 data = next(train_reader())
                 loss_v = exe.run(program=main,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
-                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         with fluid.program_guard(test_program):
             test_data = next(test_reader())
@@ -249,19 +247,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
             self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-            print('{}: {}'.format('test_loss1' + dev_name + quant_type,
-                                  test_loss1))
-            print('{}: {}'.format('test_loss2' + dev_name + quant_type,
-                                  test_loss2))
             w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                 .get_tensor())
             # fail: -432.0 != -433.0, this is due to the calculation precision
             #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
 
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
-            print('{}: {}'.format('w_quant' + dev_name + quant_type,
-                                  np.sum(w_quant)))
             # Convert parameter to 8-bit.
             quant_transpiler.convert_to_int8(test_program, place)
             # Save the 8-bit parameter and model file.
@@ -276,17 +266,13 @@ class TestQuantizeTranspiler(unittest.TestCase):
 
             self.assertEqual(w_8bit.dtype, np.int8)
             self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-            print('{}: {}'.format('w_8bit' + dev_name + quant_type,
-                                  np.sum(w_8bit)))
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
 
-    def test_freeze_program_cuda(self):
+    def not_test_freeze_program_cuda(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_program(True, seed=1)
 
-    def test_freeze_program_cpu(self):
+    def not_test_freeze_program_cpu(self):
         with fluid.unique_name.guard():
             self.freeze_program(False, seed=2)
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 4ca2c544e4..dcb20704fe 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1681,14 +1681,14 @@ class IrGraph(object):
         """
         op_desc = core.OpDesc()
         op_desc.set_type(op_type)
-        for attr, value in attrs.iteritems():
+        for attr, value in six.iteritems(attrs):
             self._update_desc_attr(op_desc, attr, value)
-        for input_name, var_nodes in inputs.iteritems():
+        for input_name, var_nodes in six.iteritems(inputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
             op_desc.set_input(input_name,
                               [var_node.name() for var_node in var_nodes])
-        for output_name, var_nodes in outputs.iteritems():
+        for output_name, var_nodes in six.iteritems(outputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
             op_desc.set_output(output_name,

From 01d9bf9264e3b906244aea2a1055a65449ff21a8 Mon Sep 17 00:00:00 2001
From: Dang Qingqing <dangqingqing@baidu.com>
Date: Sun, 3 Feb 2019 14:10:56 +0800
Subject: [PATCH 11/78] Fix batch_norm API for data_layout.

test=develop
---
 python/paddle/fluid/layers/nn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0e4b5aadc0..46ce58fd2d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2930,6 +2930,7 @@ def batch_norm(input,
             "momentum": momentum,
             "epsilon": epsilon,
             "is_test": is_test,
+            "data_layout": data_layout,
             "use_mkldnn": False,
             "fuse_with_relu": fuse_with_relu,
             "use_global_stats": use_global_stats

From 4975a9050a93829b36c8bab64958d3c762628126 Mon Sep 17 00:00:00 2001
From: Gabor Buella <gabor.buella@intel.com>
Date: Tue, 5 Feb 2019 12:59:43 +0100
Subject: [PATCH 12/78] Tests - add some missing to_string calls

```
/home/tej/code/gbuella_paddle/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc:167:40: error: adding 'int' to a string does not append to the string [-Werror,-Wstring-plus-int]
    std::string prefix = "seqpool_op_" + i;
                         ~~~~~~~~~~~~~~^~~
/home/tej/code/gbuella_paddle/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc:167:40: note: use array indexing to silence this warning
    std::string prefix = "seqpool_op_" + i;
                                       ^
                         &             [  ]
1 error generated.
```

test=develop
---
 .../details/fused_broadcast_op_handle_test.cc | 31 ++++++++++---------
 .../ir/seqpool_concat_fuse_pass_tester.cc     |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index be0d941c4f..6d53dac5c0 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
            ->Var(details::kLocalExecScopeName)
            ->GetMutable<Scope*>() = &local_scope;
       for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
-        local_scope.Var("out_var" + j);
-        if (i == j) local_scope.Var("in_var" + j);
+        local_scope.Var("out_var" + std::to_string(j));
+        if (i == j) local_scope.Var("in_var" + std::to_string(j));
       }
       param_scopes_.emplace_back(&local_scope);
     }
@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
       // add input var handle
-      nodes_.emplace_back(
-          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
-      VarHandle* in_var_handle =
-          new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
-                        "in_var" + i, place_list_[input_scope_idxes[i]]);
+      nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
+                                                ir::Node::Type::kVariable));
+      VarHandle* in_var_handle = new VarHandle(
+          nodes_.back().get(), 1, input_scope_idxes[i],
+          "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
 
       // add output var handle
       for (size_t j = 0; j < place_list_.size(); ++j) {
-        nodes_.emplace_back(
-            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
-        VarHandle* out_var_handle = new VarHandle(
-            nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
+        nodes_.emplace_back(ir::CreateNodeForTest(
+            "out_node" + std::to_string(i), ir::Node::Type::kVariable));
+        VarHandle* out_var_handle =
+            new VarHandle(nodes_.back().get(), 2, j,
+                          "out_var" + std::to_string(i), place_list_[j]);
         vars_.emplace_back(out_var_handle);
         op_handle_->AddOutput(out_var_handle);
       }
@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     std::vector<std::vector<float>> send_vec;
     f::LoD lod{{0, 10, 20}};
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + i);
+      const std::string varname("in_var" + std::to_string(i));
       float val_scalar = static_cast<float>(i);
       send_vec.push_back(
           InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + i);
+      const std::string& varname("out_var" + std::to_string(i));
       for (size_t j = 0; j < place_list_.size(); ++j) {
         LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
       }
@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
                               2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
     int height = static_cast<int>(kDims[0] * 2);
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + i);
+      const std::string varname("in_var" + std::to_string(i));
       float val_scalar = static_cast<float>(i);
       send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
                                              rows, height, val_scalar));
@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + i);
+      const std::string& varname("out_var" + std::to_string(i));
       for (size_t j = 0; j < place_list_.size(); ++j) {
         SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
                           height);
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index 456a03192c..35d1d5129b 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
   };
   std::vector<std::string> concat_inputs;
   for (int i = 0; i < num_inputs_of_concat; ++i) {
-    std::string prefix = "seqpool_op_" + i;
+    std::string prefix = "seqpool_op_" + std::to_string(i);
     new_var(prefix + "in");
     new_var(prefix + "out");
     new_var(prefix + "out_unused");

From 94dd50c33fd19607a2fff798c44406b415576c38 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Wed, 6 Feb 2019 21:47:05 +0800
Subject: [PATCH 13/78] add details. test=develop

---
 .../operators/elementwise/elementwise_op.h      | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index d04bb8f338..5443132641 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -264,6 +264,20 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
   }
 };
 
+class ElementwiseGradOpInplace : public framework::InplaceInToOut {
+ public:
+  using framework::InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return std::unordered_map<std::string, std::string>{
+        {framework::GradVarName("Out"), framework::GradVarName("X")},
+    };
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -316,4 +330,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
                     op_type##GradMaker,                                \
                     ::paddle::operators::ElementwiseOpInplace);        \
   REGISTER_OPERATOR(op_type##_grad,                                    \
-                    ::paddle::operators::ElementwiseOpExplicitGrad)
+                    ::paddle::operators::ElementwiseOpExplicitGrad,    \
+                    ::paddle::operators::ElementwiseGradOpInplace)

From b99db0e2c212881ace3d94dc220424b7a0dde43e Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 7 Feb 2019 21:39:58 +0800
Subject: [PATCH 14/78] cpu reduce mode did not need to broadcast test=develop

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 75f922d2cc..30a3549ffe 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -925,9 +925,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (need_broadcast_var_ ||
-      (UseGPU() &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
+  if (UseGPU() && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {

From 76072261f8548618455db5156239802a359dbe4d Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 8 Feb 2019 08:33:23 +0800
Subject: [PATCH 15/78] fix compiler test=develop

---
 paddle/fluid/framework/details/build_strategy.cc |  3 +++
 python/paddle/fluid/compiler.py                  | 15 +++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 51ce973272..a81f284268 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -133,12 +133,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
+      VLOG(3) << "dist train mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        VLOG(3) << "allreduce mode";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        VLOG(3) << "reduce mode";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index ef02429428..1c194830e1 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -19,6 +19,7 @@ import sys
 from .. import compat as cpt
 
 from . import core
+from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -34,6 +35,15 @@ def _place_obj(place):
     return p
 
 
+def _is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else framework.default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
 class CompiledProgram(object):
     """
     Compiles a Program for execution.
@@ -110,6 +120,8 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        self._build_strategy.is_distribution = _is_pserver_mode(
+            self._program) or self._build_strategy.num_trainers > 1
         return self
 
     def with_inference_optimize(self, config):
@@ -185,8 +197,7 @@ class CompiledProgram(object):
             self._build_strategy.trainers_endpoints = trainers_endpoints
 
         self._persistable_vars = set([
-            cpt.to_text(v.name)
-            for v in [
+            cpt.to_text(v.name) for v in [
                 var for var in self._program.list_vars()
                 if var.persistable and var.type != core.VarDesc.VarType.RAW
             ]

From abf17226f87d88b63fb446125577ea88bcfe72ca Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 8 Feb 2019 10:02:01 +0800
Subject: [PATCH 16/78] fix code style test=develop

---
 python/paddle/fluid/compiler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 1c194830e1..f3935e22b4 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -197,7 +197,8 @@ class CompiledProgram(object):
             self._build_strategy.trainers_endpoints = trainers_endpoints
 
         self._persistable_vars = set([
-            cpt.to_text(v.name) for v in [
+            cpt.to_text(v.name)
+            for v in [
                 var for var in self._program.list_vars()
                 if var.persistable and var.type != core.VarDesc.VarType.RAW
             ]

From 104d3b4e680c020d60ffa1977e7df118371fd53f Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Sat, 9 Feb 2019 14:40:38 +0800
Subject: [PATCH 17/78] add details. test=develop

---
 paddle/fluid/operators/elementwise/elementwise_op.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 5443132641..91e4415265 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -272,9 +272,12 @@ class ElementwiseGradOpInplace : public framework::InplaceInToOut {
   std::unordered_map<std::string, std::string> Apply(
       const framework::OpDesc &op_desc,
       framework::BlockDesc *block) const override {
-    return std::unordered_map<std::string, std::string>{
-        {framework::GradVarName("Out"), framework::GradVarName("X")},
-    };
+    std::unordered_map<std::string, std::string> ret;
+    if (block->HasVar(framework::GradVarName("X")) &&
+        block->HasVar(framework::GradVarName("Out"))) {
+      ret[framework::GradVarName("Out")] = framework::GradVarName("X");
+    }
+    return ret;
   }
 };
 

From d85c2e4e5ccd878a7995f90ecc2a40092cf9390a Mon Sep 17 00:00:00 2001
From: Chunwei <yanchunwei@outlook.com>
Date: Mon, 11 Feb 2019 10:58:14 +0800
Subject: [PATCH 18/78] fix anakin compile dependency

test=develop
---
 paddle/fluid/inference/api/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index ad0af4005a..85755fc471 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_
 
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     # compile the libinference_anakin_api.a and anakin.so.
-    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy)
-    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context)
     function(anakin_target target_name)
       target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     endfunction()

From f85245b409fdb9675457a1f7bfef2db180d52628 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Mon, 11 Feb 2019 11:12:01 +0800
Subject: [PATCH 19/78] test=develop

---
 .../contrib/decoder/beam_search_decoder.py    |  6 ++--
 python/paddle/fluid/contrib/inferencer.py     |  4 +--
 python/paddle/fluid/contrib/trainer.py        |  4 +--
 python/paddle/fluid/executor.py               |  4 +--
 python/paddle/fluid/framework.py              | 14 ++++-----
 python/paddle/fluid/imperative/base.py        |  4 +--
 python/paddle/fluid/initializer.py            |  4 +--
 python/paddle/fluid/layers/control_flow.py    |  4 +--
 python/paddle/fluid/layers/io.py              |  4 +--
 python/paddle/fluid/optimizer.py              |  2 +-
 python/paddle/fluid/profiler.py               |  2 +-
 python/paddle/fluid/recordio_writer.py        |  4 +--
 python/paddle/fluid/unique_name.py            |  4 +--
 python/paddle/fluid/wrapped_decorator.py      | 30 +++++++++++++++++++
 14 files changed, 60 insertions(+), 30 deletions(-)
 create mode 100644 python/paddle/fluid/wrapped_decorator.py

diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
index f2b7ac8375..d0ca4fd485 100644
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -22,7 +22,7 @@ This API is still under active development and may change drastically.
 
 from __future__ import print_function
 
-import contextlib
+from ...wrapped_decorator import contextmanager
 import numpy as np
 import six
 
@@ -419,7 +419,7 @@ class TrainingDecoder(object):
         self._state_cell = state_cell
         self._state_cell._enter_decoder(self)
 
-    @contextlib.contextmanager
+    @contextmanager
     def block(self):
         """
         Define the behavior of the decoder for each RNN time step.
@@ -613,7 +613,7 @@ class BeamSearchDecoder(object):
         self._word_dim = word_dim
         self._input_var_dict = input_var_dict
 
-    @contextlib.contextmanager
+    @contextmanager
     def block(self):
         """
         Define the behavior of the decoder for each RNN time step.
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py
index b8d5f4ffea..41a0d55b57 100644
--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-import contextlib
+from ..wrapped_decorator import contextmanager
 
 from .. import core
 
@@ -105,7 +105,7 @@ class Inferencer(object):
 
         return results
 
-    @contextlib.contextmanager
+    @contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(main_program=self.inference_program):
             with executor.scope_guard(self.scope):
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
index 8569e486f9..798014cb1e 100644
--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-import contextlib
+from .wrapped_decorator import contextmanager
 import os
 import errno
 import shutil
@@ -453,7 +453,7 @@ class Trainer(object):
             io.save_inference_model(param_path, feeded_var_names, target_vars,
                                     exe)
 
-    @contextlib.contextmanager
+    @contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(
                 main_program=self.train_program,
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index d3ff14a179..6c49c56408 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import os
 import multiprocessing
 import numpy as np
-import contextlib
+from .wrapped_decorator import contextmanager
 import six
 from .framework import Program, default_main_program, Variable
 from . import core
@@ -49,7 +49,7 @@ def _switch_scope(scope):
     return ex
 
 
-@contextlib.contextmanager
+@contextmanager
 def scope_guard(scope):
     """
     Change the global/default scope instance by Python `with` statement. All
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index c0b0ad8a20..f94c8136ca 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import collections
 from collections import defaultdict
-import contextlib
+from .wrapped_decorator import contextmanager
 import os
 import re
 import traceback
@@ -111,7 +111,7 @@ class NameScope(object):
 _name_scope = NameScope()
 
 
-@contextlib.contextmanager
+@contextmanager
 def name_scope(prefix=None):
     """
     Generate hierarchical name prefix for the operators.
@@ -1775,7 +1775,7 @@ class Program(object):
     def set_op_role_var(self, var_name):
         self._op_role_var = [var_name]
 
-    @contextlib.contextmanager
+    @contextmanager
     def _optimized_guard(self, param_and_grads):
         """
         A with guard to set :code:`Optimization` :code:`OpRole` and
@@ -1805,7 +1805,7 @@ class Program(object):
         self._op_role_var = tmp_var
         self._current_role = tmp_role
 
-    @contextlib.contextmanager
+    @contextmanager
     def _lr_schedule_guard(self, is_with_opt=False):
         """
         A with guard to set :code:`LRSched` :code:`OpRole` and
@@ -2459,7 +2459,7 @@ def switch_startup_program(program):
     return prev_program
 
 
-@contextlib.contextmanager
+@contextmanager
 def program_guard(main_program, startup_program=None):
     """
     Change the global main program and startup program with `with` statement.
@@ -2524,7 +2524,7 @@ def _get_var(name, program=None):
     return program.global_block().var(name)
 
 
-@contextlib.contextmanager
+@contextmanager
 def _imperative_guard(tracer):
     global _imperative_tracer_
     tmp_trace = _imperative_tracer_
@@ -2535,7 +2535,7 @@ def _imperative_guard(tracer):
     _imperative_tracer_ = tmp_trace
 
 
-@contextlib.contextmanager
+@contextmanager
 def _imperative_place_guard(place):
     global _imperative_current_expected_place_
     tmp_place = _imperative_current_expected_place_
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index ff3984b11f..2f8b3534aa 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
+from ..wrapped_decorator import contextmanager
 import numpy as np
 
 from paddle.fluid import core
@@ -24,7 +24,7 @@ def enabled():
     return framework._in_imperative_mode()
 
 
-@contextlib.contextmanager
+@contextmanager
 def guard(place=None):
     train = framework.Program()
     startup = framework.Program()
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 5be21ff7f7..8f3f03cb1a 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 from . import framework
 import numpy as np
-import contextlib
+from .wrapped_decorator import contextmanager
 from .core import VarDesc
 from . import unique_name
 
@@ -49,7 +49,7 @@ def force_init_on_cpu():
     return _force_init_on_cpu_
 
 
-@contextlib.contextmanager
+@contextmanager
 def init_on_cpu():
     """
     Force the variable to be inited on CPU.
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index a7494aacea..1d639144e2 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-import contextlib
+from ..wrapped_decorator import contextmanager
 
 from .layer_function_generator import autodoc, templatedoc
 from .tensor import assign, fill_constant
@@ -1532,7 +1532,7 @@ class DynamicRNN(object):
             outputs={'Out': [x_reordered]})
         return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
 
-    @contextlib.contextmanager
+    @contextmanager
     def block(self):
         """
         The block for user to define operators in RNN. See the class docstring
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 1762bd3e34..58c892315f 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-import contextlib
+from ..wrapped_decorator import contextmanager
 import multiprocessing
 import os
 import six
@@ -1116,7 +1116,7 @@ class Preprocessor(object):
     def _is_completed(self):
         return self.sub_block and self.source_var_names and self.sink_var_names
 
-    @contextlib.contextmanager
+    @contextmanager
     def block(self):
         self.status = Preprocessor.IN_SUB_BLOCK
         self.sub_block = self.main_prog._create_block()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e0e781a322..e89103f18d 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 from collections import defaultdict
-from contextlib import contextmanager
+from .wrapped_decorator import contextmanager
 
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index e05885f5f5..08f5b38310 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 from . import core
-from contextlib import contextmanager
+from .wrapped_decorator import contextmanager
 import os
 import six
 
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 076a942cdd..5302dbb356 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -15,14 +15,14 @@
 from __future__ import print_function
 
 import os
-import contextlib
+from .wrapped_decorator import contextmanager
 from . import core
 __all__ = [
     'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
 ]
 
 
-@contextlib.contextmanager
+@contextmanager
 def create_recordio_writer(filename,
                            compressor=core.RecordIOWriter.Compressor.Snappy,
                            max_num_records=1000):
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index b9957a699e..e1ec726ec4 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import collections
-import contextlib
+from .wrapped_decorator import contextmanager
 import six
 import sys
 
@@ -68,7 +68,7 @@ def switch(new_generator=None):
     return old
 
 
-@contextlib.contextmanager
+@contextmanager
 def guard(new_generator=None):
     if isinstance(new_generator, six.string_types):
         new_generator = UniqueNameGenerator(new_generator)
diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py
new file mode 100644
index 0000000000..224afcca5a
--- /dev/null
+++ b/python/paddle/fluid/wrapped_decorator.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import decorator
+import contextlib
+
+__all__ = ['wrap_decorator', 'contextmanager']
+
+
+def wrap_decorator(decorator_func):
+    @decorator.decorator
+    def __impl__(func, *args, **kwargs):
+        wrapped_func = decorator_func(func)
+        return wrapped_func(*args, **kwargs)
+
+    return __impl__
+
+
+contextmanager = wrap_decorator(contextlib.contextmanager)

From 42f6d0f899caffe1b3aeebaf821c8ac062ddea3b Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Mon, 11 Feb 2019 04:52:02 +0000
Subject: [PATCH 20/78] modify API.spec test=develop

---
 paddle/fluid/API.spec                  | 24 ++++++++++++------------
 python/paddle/fluid/contrib/trainer.py |  2 +-
 python/requirements.txt                |  1 +
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index f50a38842a..df961be911 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None
 paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
@@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo
 paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
@@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes',
 paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
@@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo
 paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
 paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
@@ -346,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st
 paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
-paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
@@ -456,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', '
 paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
-paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
@@ -491,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'],
 paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
-paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
 paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
 paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
index 798014cb1e..f448c309b0 100644
--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from .wrapped_decorator import contextmanager
+from ..wrapped_decorator import contextmanager
 import os
 import errno
 import shutil
diff --git a/python/requirements.txt b/python/requirements.txt
index 03d5e33e88..5a70f1aa3f 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -11,3 +11,4 @@ graphviz
 six
 funcsigs
 pyyaml
+decorator

From 1905f1a108988e1d74d7d73a0e8b3d55a2c99af6 Mon Sep 17 00:00:00 2001
From: Dun Liang <randonlang@gmail.com>
Date: Mon, 11 Feb 2019 18:27:41 +0800
Subject: [PATCH 21/78] bug fix && test=develop

---
 paddle/fluid/framework/ir/graph.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b7f7c3d82e..feb3330176 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -142,7 +142,7 @@ class Graph {
     // TODO(panyx0718): control var name should be really unique.
     const std::string name = string::Sprintf(
         "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
-        node_set_.size());
+        num_node_created_);
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
     x->SetId(num_node_created_++);
     return x;

From f4a0e68481952219d4d5e18ac758247428a03cfa Mon Sep 17 00:00:00 2001
From: baojun <32073718+baojun-nervana@users.noreply.github.com>
Date: Mon, 11 Feb 2019 04:02:32 -0800
Subject: [PATCH 22/78] Fix ngraph compile WITH_DISTRIBUTE=ON (#15636)

* fix compile issue with_distribute test=develop

* simplified logic test=develop

* use ngraph dependency test=develop

* set cpu only test=develop

* update test and eliminate fp16 test test=develop
---
 paddle/fluid/framework/CMakeLists.txt         | 21 +++---
 .../fluid/operators/ngraph/ngraph_engine_op.h |  2 +-
 .../ngraph/test_accuracy_ngraph_op.py         | 31 +++++++--
 .../unittests/ngraph/test_conv2d_ngraph_op.py | 26 ++++++-
 .../ngraph/test_elementwise_add_ngraph_op.py  | 67 ++-----------------
 .../unittests/ngraph/test_mean_ngraph_op.py   |  8 +--
 .../unittests/ngraph/test_mul_ngraph_op.py    | 39 +++++++----
 .../unittests/ngraph/test_pool2d_ngraph_op.py | 26 ++++++-
 .../unittests/ngraph/test_scale_ngraph_op.py  | 18 +++--
 .../unittests/ngraph/test_top_k_ngraph_op.py  |  4 ++
 10 files changed, 133 insertions(+), 109 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 910318a49c..7ddf1ab44f 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
-if(WITH_DISTRIBUTE)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
-        lod_rank_table feed_fetch_method sendrecvop_rpc  ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
+if(WITH_NGRAPH)
+  set(NGRAPH_EXE_DEPS ngraph_engine)
+else()
+  set(NGRAPH_EXE_DEPS)
+endif()
 
-   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
+    lod_rank_table feed_fetch_method sendrecvop_rpc  ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if (WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
-  else ()
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
index d2974298b0..2f194a9b87 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
@@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, ctx.GetPlace());
+        framework::proto::VarType::FP32, platform::CPUPlace());
     return kt;
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
index 13a33e2047..84b9198dbf 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
@@ -16,14 +16,37 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
 
 
-class TestNGRAPHAccuracyOp(TestAccuracyOp):
+class TestNGRAPHAccuracyOp(OpTest):
     def setUp(self):
-        super(TestNGRAPHAccuracyOp, self).setUp()
+        self.op_type = "accuracy"
+        self.dtype = np.float32
+        self.init_dtype()
+        n = 128
+        infer = np.random.random((n, 1)).astype(self.dtype)
+        indices = np.random.randint(0, 2, (n, 1))
+        label = np.random.randint(0, 2, (n, 1))
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int64"),
+            'Total': np.array([n]).astype("int64")
+        }
+        self._cpu_only = True
+
+    def init_dtype(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
index e5424e8a6e..dbc8557b4e 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
@@ -15,35 +15,59 @@
 from __future__ import print_function
 
 import unittest
-from paddle.fluid.tests.unittests.test_conv2d_op import *
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
 
 
 class TestNGRAPH(TestConv2dOp):
+    def setUp(self):
+        super(TestNGRAPH, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPH, self).init_kernel_type()
 
 
 class TestNGRAPHWithPad(TestWithPad):
+    def setUp(self):
+        super(TestNGRAPHWithPad, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPHWithPad, self).init_kernel_type()
 
 
 class TestNGRAPHWithStride(TestWithStride):
+    def setUp(self):
+        super(TestNGRAPHWithStride, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPHWithStride, self).init_kernel_type()
 
 
 class TestNGRAPHWithGroup(TestWithGroup):
+    def setUp(self):
+        super(TestNGRAPHWithGroup, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPHWithGroup, self).init_kernel_type()
 
 
 class TestNGRAPHWith1x1(TestWith1x1):
+    def setUp(self):
+        super(TestNGRAPHWith1x1, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPHWith1x1, self).init_kernel_type()
 
 
 class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def setUp(self):
+        super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
+        self._cpu_only = True
+
     def init_kernel_type(self):
         super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
 
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
index 67722db89b..67f749bfee 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
@@ -14,73 +14,16 @@
 
 from __future__ import print_function
 import unittest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import *
+from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
 
 
 class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_scalar, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_scalar2, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_Vector, self).init_input_output()
-
-
-class TesNGRAPHtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
-    def init_input_output(self):
-        super(TesNGRAPHtElementwiseAddOp_broadcast_0, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_1, self).init_input_output()
+    def setUp(self):
+        super(TestNGRAPHElementwiseAddOp, self).setUp()
+        self._cpu_only = True
 
-
-class TestNGRAPHElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
     def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_2, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_3, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_4, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_rowwise_add_0(
-        TestElementwiseAddOp_rowwise_add_0):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_rowwise_add_0,
-              self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_rowwise_add_1(
-        TestElementwiseAddOp_rowwise_add_1):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_rowwise_add_1,
-              self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_channelwise_add(
-        TestElementwiseAddOp_channelwise_add):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_channelwise_add,
-              self).init_input_output()
+        super(TestNGRAPHElementwiseAddOp, self).init_input_output()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
index 5535427ea8..11881ac6e5 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
@@ -14,17 +14,13 @@
 from __future__ import print_function
 
 import unittest
-from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp
+from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
 
 
 class TestNGRAPHMeanOp(TestMeanOp):
     def setUp(self):
         super(TestNGRAPHMeanOp, self).setUp()
-
-
-class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
-    def setUp(self):
-        super(TestNGRAPHFP16MeanOp, self).setUp()
+        self._cpu_only = True
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
index 6aba62f7c0..a916c8d450 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
@@ -15,27 +15,38 @@
 from __future__ import print_function
 
 import unittest
-from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+
+
+class TestNGRAPHMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.dtype = np.float32
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((2, 4)).astype(self.dtype),
+            'Y': np.random.random((4, 4)).astype(self.dtype)
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+        self._cpu_only = True
 
-
-class TestNGRAPHMulOp(TestMulOp):
     def init_dtype_type(self):
         pass
 
+    def test_check_output(self):
+        self.check_output()
 
-class TestNGRAPHMulOp2(TestMulOp2):
-    def init_dtype_type(self):
-        pass
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
-    def init_dtype_type(self):
-        pass
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
index 95e592e8ec..96a2b72d8a 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
@@ -14,35 +14,59 @@
 
 from __future__ import print_function
 
-from paddle.fluid.tests.unittests.test_pool2d_op import *
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
 class TestNGRAPHPool2D_Op(TestPool2D_Op):
+    def setUp(self):
+        super(TestNGRAPHPool2D_Op, self).setUp()
+        self._cpu_only = True
+
     def init_test_case(self):
         super(TestNGRAPHPool2D_Op, self).init_test_case()
 
 
 class TestNGRAPHCase1(TestCase1):
+    def setUp(self):
+        super(TestNGRAPHCase1, self).setUp()
+        self._cpu_only = True
+
     def init_test_case(self):
         super(TestNGRAPHCase1, self).init_test_case()
 
 
 class TestNGRAPHCase2(TestCase2):
+    def setUp(self):
+        super(TestNGRAPHCase2, self).setUp()
+        self._cpu_only = True
+
     def init_test_case(self):
         super(TestNGRAPHCase2, self).init_test_case()
 
 
 class TestNGRAPHCase3(TestCase3):
+    def setUp(self):
+        super(TestNGRAPHCase3, self).setUp()
+        self._cpu_only = True
+
     def init_pool_type(self):
         super(TestNGRAPHCase3, self).init_pool_type()
 
 
 class TestNGRAPHCase4(TestCase4):
+    def setUp(self):
+        super(TestNGRAPHCase4, self).setUp()
+        self._cpu_only = True
+
     def init_pool_type(self):
         super(TestNGRAPHCase4, self).init_pool_type()
 
 
 class TestNGRAPHCase5(TestCase5):
+    def setUp(self):
+        super(TestNGRAPHCase5, self).setUp()
+        self._cpu_only = True
+
     def init_pool_type(self):
         super(TestNGRAPHCase5, self).init_pool_type()
 
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
index b42a1f73fa..4da5ca4583 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
@@ -13,25 +13,23 @@
 # limitations under the License.
 from __future__ import print_function
 import unittest
-from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows
+from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
 
 
 class TestNGRAPHScaleOp(TestScaleOp):
-    def init_dtype_type(self):
-        pass
+    def setUp(self):
+        super(TestNGRAPHScaleOp, self).setUp()
+        self._cpu_only = True
 
-
-class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
     def init_dtype_type(self):
         pass
 
 
-class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
-    def init_dtype_type(self):
-        pass
-
+class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
+    def setUp(self):
+        super(TestNGRAPHScaleOpSelectedRows, self).setUp()
+        self._cpu_only = True
 
-class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
     def init_dtype_type(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
index 3a0171087d..fa68df1adf 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
@@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d,
 class TestNGRAPHTopkOp(TestTopkOp):
     def setUp(self):
         super(TestNGRAPHTopkOp, self).setUp()
+        self._cpu_only = True
 
 
 class TestNGRAPHTopkOp2(TestTopkOp2):
     def setUp(self):
         super(TestNGRAPHTopkOp2, self).setUp()
+        self._cpu_only = True
 
 
 class TestNGRAPHTopkOp3(TestTopkOp3):
     def setUp(self):
         super(TestNGRAPHTopkOp3, self).setUp()
+        self._cpu_only = True
 
 
 class TestNGRAPHTopkOp4(TestTopkOp4):
     def setUp(self):
         super(TestNGRAPHTopkOp4, self).setUp()
+        self._cpu_only = True
 
 
 if __name__ == "__main__":

From 1198ccae6bd4548a749c712a2fe24cf5f2191e63 Mon Sep 17 00:00:00 2001
From: mozga-intel <mateusz.ozga@intel.com>
Date: Mon, 11 Feb 2019 14:29:35 +0100
Subject: [PATCH 23/78] Enable batch_norm operator for a ngraph engine
 test=develop

---
 .../fluid/operators/ngraph/ngraph_bridge.cc   |   2 +
 paddle/fluid/operators/ngraph/ngraph_ops.h    |   1 +
 .../operators/ngraph/ops/batch_norm_op.h      | 150 ++++++++++++++++++
 paddle/fluid/platform/ngraph_helper.h         |  20 +++
 .../ngraph/test_batch_norm_ngraph_op.py       |  37 +++++
 5 files changed, 210 insertions(+)
 create mode 100644 paddle/fluid/operators/ngraph/ops/batch_norm_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py

diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 38e65524e8..e8b92fc02a 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -34,6 +34,8 @@ std::map<std::string,
         {"accuracy", NG_OPS::BuildAccuracyNode},
         {"conv2d", NG_OPS::BuildConv2dNode},
         {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
+        {"batch_norm", NG_OPS::BuildBatchNormNode},
+        {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
         {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
         {"fill_constant", NG_OPS::BuildFillConstantNode},
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
index fb574f1bc1..438a9c1be9 100644
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #pragma once
 
 #include "ops/accuracy_op.h"
+#include "ops/batch_norm_op.h"
 #include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
new file mode 100644
index 0000000000..2cdd029976
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
@@ -0,0 +1,150 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
+#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildBatchNormNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto& data_layout = op_attrs.Get<std::string>("data_layout");
+
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
+  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+
+  const bool is_test = op_attrs.Get<bool>("is_test");
+  const float epsilon = op_attrs.Get<float>("epsilon");
+  const float momentum = op_attrs.Get<float>("momentum");
+
+  if (data_layout == "NHWC") {
+    x = paddle::platform::Nhwc2Nchw(x);
+  }
+
+  std::shared_ptr<ngraph::Node> mean_out, saved_mean, saved_variance,
+      variance_out, y;
+
+  if (!is_test) {
+    auto BN = std::make_shared<ngraph::op::BatchNormTraining>(epsilon, scale,
+                                                              bias, x);
+    y = std::make_shared<ngraph::op::GetOutputElement>(BN, 0);
+    saved_mean = std::make_shared<ngraph::op::GetOutputElement>(BN, 1);
+    saved_variance = std::make_shared<ngraph::op::GetOutputElement>(BN, 2);
+
+    mean_out = std::make_shared<ngraph::op::Add>(
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            momentum, mean),
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            1. - momentum, saved_mean));
+    variance_out = std::make_shared<ngraph::op::Add>(
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            momentum, variance),
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            1. - momentum, saved_variance));
+
+    if (data_layout == "NHWC") {
+      y = paddle::platform::Nchw2Nhwc(y);
+    }
+
+    paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "VarianceOut", variance_out,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  } else {
+    y = std::make_shared<ngraph::op::BatchNormInference>(epsilon, scale, bias,
+                                                         x, mean, variance);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  }
+}
+
+void BuildBatchNormGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto& data_layout = op_attrs.Get<std::string>("data_layout");
+
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+  auto saved_mean =
+      paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map);
+  auto saved_variance =
+      paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto x_shape = x->get_shape();
+  auto dy_shape = dy->get_shape();
+
+  PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4,
+                 "BN grap input size needs to be 2 or 4");
+  PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
+                    "BN grap input and delta size needs to be equal");
+
+  if (x_shape.size() == 2) {
+    x = std::make_shared<ngraph::op::Reshape>(
+        x, ngraph::AxisVector{0, 1},
+        ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1});
+    dy = std::make_shared<ngraph::op::Reshape>(
+        dy, ngraph::AxisVector{0, 1},
+        ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1});
+  }
+
+  if (data_layout == "NHWC") {
+    x = paddle::platform::Nhwc2Nchw(dy);
+    dy = paddle::platform::Nhwc2Nchw(dy);
+  }
+  const float epsilon = op_attrs.Get<float>("epsilon");
+
+  auto bn_bprop = std::make_shared<ngraph::op::BatchNormTrainingBackprop>(
+      epsilon, scale, bias, x, saved_mean, saved_variance, dy);
+
+  std::shared_ptr<ngraph::Node> dx =
+      std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 0);
+  auto dscale = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 1);
+  auto dbias = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 2);
+  paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map);
+  if (x_shape.size() == 2) {
+    paddle::platform::SetOutputNode(
+        op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map);
+  } else {
+    if (data_layout == "NHWC") {
+      dx = paddle::platform::Nchw2Nhwc(dx);
+    }
+    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
index b84315995a..5ee985ea71 100644
--- a/paddle/fluid/platform/ngraph_helper.h
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -23,6 +23,26 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+std::shared_ptr<ngraph::Node> Nhwc2Nchw(std::shared_ptr<ngraph::Node> in) {
+  auto in_shape = in->get_shape();
+  in_shape[0] = in->get_shape()[0];
+  in_shape[1] = in->get_shape()[3];
+  in_shape[2] = in->get_shape()[1];
+  in_shape[3] = in->get_shape()[2];
+  ngraph::AxisVector axis_vec = {0, 3, 1, 2};
+  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
+}
+
+std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
+  auto in_shape = in->get_shape();
+  in_shape[0] = in->get_shape()[0];
+  in_shape[1] = in->get_shape()[2];
+  in_shape[2] = in->get_shape()[3];
+  in_shape[3] = in->get_shape()[1];
+  ngraph::AxisVector axis_vec = {0, 2, 3, 1};
+  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
+}
+
 ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
   auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
                             std::multiplies<size_t>());
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
new file mode 100644
index 0000000000..511173af5e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
+
+
+class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining):
+    def init_kernel_type(self):
+        super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type()
+
+
+class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference):
+    def init_kernel_type(self):
+        super(TestNGRAPHBatchNormOpInference, self).init_kernel_type()
+
+
+class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference):
+    def init_kernel_type(self):
+        super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type()
+
+
+if __name__ == '__main__':
+    unittest.main()

From 04e9776aefca6132d28965b6692471e15891e657 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Mon, 11 Feb 2019 22:11:48 +0800
Subject: [PATCH 24/78] add details. test=develop

---
 cmake/flags.cmake                             |   3 +-
 paddle/fluid/framework/details/CMakeLists.txt |   8 +-
 .../fluid/framework/details/build_strategy.cc |   2 -
 .../fluid/framework/details/build_strategy.h  |   3 -
 .../framework/details/inplace_op_pass.cc      |  13 +-
 .../fluid/framework/details/inplace_op_pass.h |  15 +-
 .../details/memory_early_delete_pass.cc       | 117 -----
 .../details/memory_early_delete_pass.h        |  32 --
 .../details/memory_optimize_helper.cc         | 336 ++++++++++++--
 .../details/memory_optimize_helper.h          | 119 +++--
 .../details/memory_optimize_helper_test.cc    | 417 +++++++++++++++++-
 .../framework/details/memory_optimize_pass.cc | 297 +------------
 .../framework/details/memory_optimize_pass.h  |  50 +--
 .../details/memory_optimize_pass_test.cc      | 417 ------------------
 .../details/sequential_execution_pass.cc      |   1 +
 .../details/sequential_execution_pass.h       |   2 -
 paddle/fluid/framework/inplace_op_inference.h |   2 +-
 paddle/fluid/framework/parallel_executor.cc   |  11 +-
 paddle/fluid/framework/scope.cc               |   6 +-
 .../memory/allocation/legacy_allocator.cc     |   5 +-
 paddle/fluid/platform/place.cc                |   6 +
 paddle/fluid/pybind/pybind.cc                 |   4 -
 python/paddle/fluid/parallel_executor.py      |   3 +-
 23 files changed, 842 insertions(+), 1027 deletions(-)
 delete mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.cc
 delete mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.h
 delete mode 100644 paddle/fluid/framework/details/memory_optimize_pass_test.cc

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 81e7868a6a..5895657ece 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -21,12 +21,13 @@ function(CheckCompilerCXX11Flag)
             if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
                 message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
             endif()
-        endif()   
+        endif()
     endif()
 endfunction()
 
 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 6621a59d37..e88084424b 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
-cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
-cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
-
+cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 51ce973272..f8030c53f7 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
           new std::vector<OpDesc *>(main_program.Block(0).AllOps());
       graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
                                               all_op_descs);  // take ownership
-      graph->Set<GraphNodePool>(kGraphNodePool,
-                                new GraphNodePool);  // take ownership
 
       pass->Erase(kAllOpDescs);
       pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index e3e06a5614..e62e3edcef 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -77,9 +77,6 @@ struct BuildStrategy {
   bool fuse_relu_depthwise_conv_{false};
 
   bool memory_optimize_{false};
-
-  bool memory_early_delete_{false};
-
   // TODO(dzhwinter):
   // make enable_inplace, memory_optimize_
   // memory_early_delete_ true by default
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 78c5d5b50e..b0c5968499 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
   }
 }
 
-const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
-                                                   const std::string& cache_var,
-                                                   const size_t& idx,
-                                                   ir::Graph* graph) const {
+const NodeSwapQueue InplacePass::TryInplaceModifyVar(
+    const std::string& var, const std::string& cache_var, const size_t& idx,
+    ir::Graph* graph) const {
   PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
                  var_nodes_[var].at(0)->Var() != nullptr);
   std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
   var_desc->SetName(cache_var);
 
-  SSANodePair swap_nodes;
+  NodeSwapQueue swap_nodes;
 
   for (size_t i = idx; i < view_.AllOps().size(); ++i) {
     auto* op = view_.AllOps()[i];
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
   return swap_nodes;
 }
 
-void InplacePass::CommitModify(const SSANodePair& swap_nodes,
+void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
                                ir::Graph* graph) const {
   for (auto& pair : swap_nodes) {
     auto *node = pair.first, *cache_node = pair.second;
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
   }
 }
 
-void InplacePass::WithdrawModify(const SSANodePair& nodes,
+void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
                                  ir::Graph* graph) const {
   for (auto& pair : nodes) {
     auto *node = pair.first, *cache_node = pair.second;
diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h
index 1abcf1f279..7be7f31185 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.h
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -56,7 +56,8 @@ class GraphView {
   std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
 };
 
-typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
+// swap pairs in sequence
+typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
 class InplacePass : public ir::Pass {
  public:
   InplacePass();
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
   void InitSSAGraphNodes() const;
 
  private:
-  const SSANodePair TryInplaceModifyVar(const std::string& var,
-                                        const std::string& cache_var,
-                                        const size_t& idx,
-                                        ir::Graph* graph) const;
+  const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
+                                          const std::string& cache_var,
+                                          const size_t& idx,
+                                          ir::Graph* graph) const;
 
-  void CommitModify(const SSANodePair&, ir::Graph* graph) const;
+  void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
 
-  void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
+  void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
 
   void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
                          const size_t& idx) const;
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc
deleted file mode 100644
index 69f8f70548..0000000000
--- a/paddle/fluid/framework/details/memory_early_delete_pass.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
-#include <queue>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/memory_optimize_helper.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
-  std::queue<VarHandleBase*> queue;
-  queue.push(var_in);
-  do {
-    auto* var = queue.front();
-    queue.pop();
-    for (auto* op : var->PendingOps()) {
-      auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
-      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
-        return compute_op;
-      }
-      for (auto* out_var : op->Outputs()) {
-        queue.push(out_var);
-      }
-    }
-  } while (!queue.empty());
-  return nullptr;
-}
-
-std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
-  auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
-
-  std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
-  unlived_vars.reserve(graph_pool.size());
-  for (auto& pair : graph_pool) {
-    unlived_vars.insert(std::make_pair(pair.first, pair.second));
-  }
-
-  auto compare_and_insert_early_delete_op = [&](
-      OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
-    if (unlived_vars.empty()) return;
-    // unlived vars can be deleted after the last used op has finished.
-    auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
-    const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
-    for (auto& var : vars) {
-      auto* var_handle = dynamic_cast<VarHandle*>(var);
-      auto var_name = var->Node()->Name();
-      auto& var_place = var_handle->place();
-      if (unlived_vars.count(var_name) == 0) continue;
-      if (!unlived_vars[var_name].empty()) {
-        if (compute_op != nullptr &&
-            unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
-          unlived_vars[var_name].erase(compute_op->Node()->Op());
-        }
-        continue;
-      }
-
-      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
-          var_handle->Node()->IsCtrlVar())
-        continue;
-
-      // shameless copyed from reference count pass.
-      if (compute_op == nullptr) {
-        // use next computation op scope
-        compute_op = FindNextComputationOpHandle(var_handle);
-      }
-      auto* early_delete_node =
-          graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
-      GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
-      auto* early_delete_handle = new EarlyDeleteOpHandle(
-          early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
-      if (compute_op->Outputs().empty()) {
-        auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-        compute_op->AddOutput(dep_var);
-        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-      }
-      early_delete_handle->AddInput(compute_op->Outputs().front());
-      VLOG(5) << "Add early delete op " << var_name << " to Operator"
-              << compute_op->Name();
-    }
-  };
-
-  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-  for (auto& op : all_ops) {
-    compare_and_insert_early_delete_op(op, op->Inputs());
-    compare_and_insert_early_delete_op(op, op->Outputs());
-  }
-  return graph;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(memory_early_delete_pass,
-              paddle::framework::details::MemoryEarlyDeletePass)
-    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
-    .RequireGraphAttr(paddle::framework::details::kGarbageCollector);
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h
deleted file mode 100644
index 8215aa1b2b..0000000000
--- a/paddle/fluid/framework/details/memory_early_delete_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/details/early_delete_op_handle.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class MemoryEarlyDeletePass : public ir::Pass {
- protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(
-      std::unique_ptr<ir::Graph> graph) const override;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index b56ef021ef..6345ba3359 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,17 +13,108 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <deque>
 #include <functional>
 #include <iostream>
 #include <numeric>
 #include <sstream>
 #include <string>
+#include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
+using paddle::framework::VarDesc;
 
-size_t NodeSizeInBytes(const VarDesc& node) {
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+
+  return ret;
+}
+
+size_t NodeSize(const VarDesc& node) {
   auto shape = node.GetShape();
   int size =
       std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
@@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
   return type_size * std::abs(size);
 }
 
-size_t NodeSizeInBytes(ir::Node* n) {
+size_t NodeSize(ir::Node* n) {
   auto* desc = FindVarDescInBlock(n);
-  return NodeSizeInBytes(*desc);
+  return NodeSize(*desc);
 }
 
 std::string DebugStringImpl(VarDesc* var) {
@@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) {
 std::string DebugString(ir::Node* var) {
   return DebugStringImpl(FindVarDescInBlock(var));
 }
-// return DebugString(var->Var()); }
 
 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
@@ -80,18 +170,17 @@ struct NodeComparator {
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
         (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
-      return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+      return NodeSize(lhs) <= NodeSize(rhs);
     } else {
       return false;
     }
   }
 };
 
-void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
+void OrderedSet::Insert(ir::Node* var) {
   PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
-  PADDLE_ENFORCE(op->IsOp());
   if (mark_table_.count(var->Name()) != 0) {
-    mark_table_[var->Name()]->second.insert(op);
+    mark_table_[var->Name()]->emplace_back(var);
     return;
   }
 
@@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);
 
-  NodeComparator compare_node;
+  NodeComparator functor;
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
-    auto* cache_desc = FindVarDescInBlock(it->first);
+    auto& prev = it->front();
+    auto* cache_desc = FindVarDescInBlock(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
-      if (compare_node(it->first, var)) {
+      if (functor(prev, var)) {
         ++it;
       } else {
         break;
@@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
     }
   }
 
-  it =
-      nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  it = nodes_.insert(it, {var});
   mark_table_[var->Name()] = it;
 }
 
-int OrderedNodeList::GetIndex(ir::Node* var) {
+int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
   return std::distance(nodes_.begin(), mark_table_[var->Name()]);
 }
 
-ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
+ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   ir::Node* found_node = nullptr;
-  NodeComparator compare_node;
+  NodeComparator functor;
 
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    if (compare_node(var, it->first)) {
-      found_node = it->first;
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
       break;
     }
   }
   return found_node;
 }
 
-void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
+bool OrderedSet::Has(ir::Node* var) const {
+  if (mark_table_.count(var->Name())) {
+    auto& node_in_samename = mark_table_.at(var->Name());
+    auto iter =
+        std::find_if(node_in_samename->begin(), node_in_samename->end(),
+                     [&](ir::Node* n) { return n->Name() == var->Name(); });
+    return iter != node_in_samename->end();
+  }
+  return false;
+}
 
-void OrderedNodeList::Erase(const std::string& var) {
-  PADDLE_ENFORCE(mark_table_.count(var));
-  nodes_.erase(mark_table_[var]);
-  mark_table_.erase(var);
+void OrderedSet::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(mark_table_.count(var->Name()));
+  nodes_.erase(mark_table_[var->Name()]);
+  mark_table_.erase(var->Name());
 }
 
-std::string OrderedNodeList::ToString() const {
+std::string OrderedSet::ToString() const {
   std::stringstream ss;
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    ss << DebugString(it->first) << " ";
+    for (auto& node : *it) {
+      ss << DebugString(node) << " ";
+    }
   }
   return ss.str();
 }
 
 bool NodeCanReused(ir::Node* node) {
+  // valid the node is a var node
   if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
-  // auto* desc = node->Var();
-  bool flag = NodeCanReused(*node->Var());
+
+  bool flag = true;
+  // op output force generated in cpu, can not be reused.
   for (auto* op : node->inputs) {
     if (op->Op()->HasAttr("force_cpu")) {
-      // op output force generated in cpu, can not be reused.
       flag &= framework::AttrReader(op->Op()->GetAttrMap())
                   .Get<bool>("force_cpu") == 0;
     }
   }
+  // var desc validation.
+  flag &= NodeCanReused(*node->Var());
   return flag;
 }
 
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
-  if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
-      node.GetShape().empty()) {
+  if (!(type == proto::VarType::LOD_TENSOR ||
+        type == proto::VarType::SELECTED_ROWS ||
+        type == proto::VarType::LOD_TENSOR_ARRAY)) {
+    return false;
+  }
+  if (node.Persistable() || node.GetShape().empty()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) {
   return false;
 }
 
+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    try {
+      auto& next_op = ops_.at(i + 1);
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    } catch (...) {
+      // do nothing
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update graph from begin idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+
+ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
+                                          ir::Node* op) const {
+  // in ssa-graph, different version nodes have same name,
+  // this function get the latest version var before target op
+  // It may return nullptr, such as data node.
+  ir::Node* found_node = nullptr;
+  for (auto* node : ops_) {
+    if (node == op) break;
+    for (auto& output : node->outputs) {
+      if (output->Name() == name) {
+        found_node = output;
+      }
+    }
+  }
+  return found_node;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 064183d61e..0bfaf827fe 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -17,6 +17,8 @@
 #include <iostream>
 #include <iterator>
 #include <list>
+#include <map>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -27,41 +29,41 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kFetchedVars[] = "fetched_vars";
-constexpr char kGraphNodePool[] = "graph_node_pool";
+constexpr char kAllOpDescs[] = "all_op_descs";
 
-// NOTE(dzh): Variable and the operators use the var.
-// for early delete pass.
-// Because analysis var pass build base on ir::Node, which maybe released
-// or modified between passes, so we use OpDesc* to mark ops.
-using GraphNodePool = std::vector<
-    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
 
-// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
-// in fluid, -1 means the batch_size is determined in runtime.
-// the node batch_size equal -1 always ranking in the front than the node not.
+// NOTE(dzh): A ordered set for node reuse in memory optimize.
+// the orderedset sort node in ascend order(by node bytes size).
+// in fluid, -1 means the batch_size, which is determined in runtime.
+// So the reuse happens between nodes who's batch_size both are -1
+// simultaneously or not.
+//
+// sort rule:
+// rule 0 : smaller node ranking in front.
+// rule 1 : batch_size equal -1 ranking in the front than the node not.
+//
 // For example,
 // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
-// O(1) insert, delete
-class OrderedNodeList {
- public:
-  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
-  using Iter = typename std::list<NodePair>::iterator;
-  using ConstIter = typename std::list<NodePair>::const_iterator;
 
-  void Insert(ir::Node* var, ir::Node* op);
+class OrderedSet {
+ public:
+  // nodes with same name exists in pool.
+  using NodeVector = std::vector<ir::Node*>;
+  using Iter = typename std::list<NodeVector>::iterator;
+  using ConstIter = typename std::list<NodeVector>::const_iterator;
 
+  void Insert(ir::Node* var);
   void Erase(ir::Node* var);
-
-  void Erase(const std::string& var);
-
-  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
-
-  bool Has(const std::string& var) { return mark_table_.count(var); }
-
-  ir::Node* NodeMatch(ir::Node* var) const;
+  bool Has(ir::Node* var) const;
+  void Clear() {
+    mark_table_.clear();
+    nodes_.clear();
+  }
+  // find the bestfit shape node block with var.
+  ir::Node* FindBestFitNode(ir::Node* var) const;
   // map store non-const iterator, can not promise const
-  int GetIndex(ir::Node* var);
+  int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string
   std::string ToString() const;
 
@@ -69,18 +71,54 @@ class OrderedNodeList {
   Iter end() { return nodes_.end(); }
   ConstIter begin() const { return nodes_.begin(); }
   ConstIter end() const { return nodes_.end(); }
-  size_t size() const { return nodes_.size(); }
 
-  void Clear() {
-    mark_table_.clear();
-    nodes_.clear();
-  }
+  size_t size() const { return nodes_.size(); }
 
  private:
   // for searching.
   std::unordered_map<std::string, Iter> mark_table_;
-  // node swap pairs. var -> ops dep var
-  std::list<NodePair> nodes_;
+  // node pool
+  std::list<NodeVector> nodes_;
+};
+
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // IR Graph
+  explicit ControlFlowGraph(const ir::Graph& graph);
+
+  void LiveVariableAnalysis();
+
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+
+  // for ssa-graph nodes
+  ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
+
+ private:
+  void BuildCFGGraph();
+  void ConnectNodes();
+
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // successors ops use the output variables.
+  NodeListMap successors_;
+  // predecessors ops generated input variables.
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
 };
 
 // valid a tensor can be reuse or not
@@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node);
 bool OpHasSubBlock(OpDesc* desc);
 
 // node memory size in bytes
-size_t NodeSizeInBytes(ir::Node* n);
+size_t NodeSize(ir::Node* n);
 
 // node memory size in bytes
-size_t NodeSizeInBytes(const VarDesc&);
+size_t NodeSize(const VarDesc&);
 
 std::string DebugString(ir::Node* var);
 
+// NOTE(dzhwinter)
+// after node reuse, the replaced node shape is
+// different with its VarDesc. So need to find the
+// correct VarDesc in Block.
 VarDesc* FindVarDescInBlock(ir::Node* n);
 
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
 template <typename Container, typename Callback>
 class FilterVariableImpl {
  public:
diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
index f2b9baf14a..5c13dda9e5 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include <algorithm>
 #include <iostream>
+#include <iterator>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -22,13 +23,19 @@
 #include <vector>
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/graph_test_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-TEST(OrderedNodeList, Normal) {
-  OrderedNodeList pool;
+TEST(OrderedSet, Normal) {
+  OrderedSet pool;
   std::vector<std::unique_ptr<ir::Node>> nodes;
 
   // clang-format off
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
     nodes.emplace_back(std::move(node));
   }
 
+  // Insert
   for (auto& node : nodes) {
-    pool.Insert(node.get(), op.get());
+    pool.Insert(node.get());
+  }
+
+  // Has/size
+  ASSERT_EQ(pool.size(), shapes.size());
+  for (auto& node : nodes) {
+    ASSERT_TRUE(pool.Has(node.get()));
   }
 
   // assert its order and interface.
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
   std::cout << pool.ToString() << std::endl;
 
   ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
-  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+  ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
 
   {
     auto v1 = block_desc->Var("11");
     v1->SetShape({-1, 256, 56, 56});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
+    auto* cache = pool.FindBestFitNode(node1.get());
     ASSERT_EQ(cache, nullptr);
   }
   {
@@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) {
     v2->SetShape({-1, 2, 5});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2);  // match 6:[-1,2,5]
   }
   {
     auto v3 = block_desc->Var("13");
     v3->SetShape({2, 5});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 5);  // match  4:[5,2]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match  4:[5,2]
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
+                  paddle::framework::AssignOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a customed classical dependency graph, left row is the instruction
+  number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analysis these variable's liveness range
+ */
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"b"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"c"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("sum");
+    op->SetInput("X", {"b", "c"});
+    op->SetOutput("Out", {"d"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"d"});
+    op->SetOutput("Out", {"e"});
+  }
+  return prog;
+}
+
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
+
+  // test sum op
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
+  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
+}
+
+// 1. normal test
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto nodes = SortOpLikeDescOrder(graph);
+  auto op_descs = prog.Block(0).AllOps();
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 2. remove some op_desc
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 3. add some op_desc
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // cached desc different with real one
+  // mimic the intermidiete pass modify the programdesc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto op_descs = prog.Block(0).AllOps();
+
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  ir::Node* node = graph.CreateOpNode(op);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  node->outputs.emplace_back(d1);
+  node->inputs.emplace_back(b);
+  node->inputs.emplace_back(c);
+  d1->inputs.emplace_back(node);
+  b->outputs.emplace_back(node);
+  c->outputs.emplace_back(node);
+  op_descs.insert(op_descs.begin() + 4, op);
+
+  auto nodes = SortOpLikeDescOrder(graph);
+
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 4. add and delete some op_desc
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 5. add and replace some op_desc inplace.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+
+  op_descs.emplace_back(op);
+
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
+    graph.RemoveNode(found_node);
+  }
+  op_descs.erase(op_descs.begin() + 3);
+
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
   }
 }
 
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 85de14a60a..41e4a834df 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -43,11 +43,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
@@ -77,7 +72,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
           skip_set_.count(var->Name()))
         continue;
-      ir::Node* cache = pool_.NodeMatch(var);
+      ir::Node* cache = pool_.FindBestFitNode(var);
 
       if (var->Name() == FLAGS_memory_optimize_debug) {
         VLOG(3) << "start match var " << DebugString(var) << " of op "
@@ -95,11 +90,12 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
                 << "replace it again. Skip this candidate.";
         continue;
 
-        int node_idx_in_pool = pool_.GetIndex(cache);
+        int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
         VLOG(3) << string::Sprintf(
             "!!! %s,  %s => %s, cache idx %d, pool size %d",
             std::to_string(reuse_id++), DebugString(var), DebugString(cache),
             node_idx_in_pool, static_cast<int>(pool_.size()));
+
         // update CFG Graph on the fly.
         // reused var maybe re-fill into the pool
         cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
@@ -112,6 +108,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
 
         pool_.Erase(cache);
       }
+
       // fill the pool
       std::unordered_set<std::string> unlived_vars;
       for (auto var : cfg_->LiveIn(op)) {
@@ -120,36 +117,15 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
         }
       }
       for (auto var : unlived_vars) {
-        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+        ir::Node* var_node = cfg_->GetNodeByName(var, op);
         if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
-          pool_.Insert(var_node, op);
+          pool_.Insert(var_node);
         }
       }
     }
   }
   graph->ResolveHazard(var_nodes_);
 
-  // For early delete pass. use GraphNodePool load the unlived vars.
-  // 1. find all deps op for each unlived var in memory pool.
-  for (auto& op : graph->Nodes()) {
-    for (auto& var : op->inputs) {
-      if (pool_.Has(var)) {
-        pool_.Insert(var, op);
-      }
-    }
-  }
-  // 2. convert ir node based memory pool to graph node
-  // because Node* maybe released bettwen passes.
-  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
-  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
-    std::unordered_set<OpDesc*> descs;
-    for (auto& op : it->second) {
-      PADDLE_ENFORCE(op->IsOp());
-      descs.insert(op->Op());
-    }
-    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
-  }
-
   return graph;
 }
 
@@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
     PADDLE_ENFORCE(sub_op != nullptr);
     for (auto* var : sub_op->outputs) {
       if (NodeCanReused(var)) {
-        ir::Node* cache = pool_.NodeMatch(var);
+        ir::Node* cache = pool_.FindBestFitNode(var);
         if (cache != nullptr) {
           if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
             continue;
           }
-          int node_idx_in_pool = pool_.GetIndex(cache);
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
           VLOG(3) << string::Sprintf(
               "!!! %s,  %s => %s, cache idx %d, pool size %d",
               std::to_string(sub_reuse_id++), DebugString(var),
@@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
   var_nodes_.at(var).clear();
 }
 
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
-  // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
-
-  // 2. topology sort order
-  auto nodes = graph.Nodes();
-  std::deque<ir::Node*> ops;
-  FilterVariables(nodes, [&](ir::Node* op) {
-    if (op->IsOp() && op->Op() != nullptr) {
-      ops.emplace_back(op);
-    }
-  });
-  std::unordered_map<ir::Node*, size_t> op_deps;
-  std::list<ir::Node*> ready_ops;
-  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
-
-  for (auto* op : ops) {
-    std::unordered_set<ir::Node*> preceding_op;
-    for (auto* in : op->inputs) {
-      if (in->inputs.empty()) continue;
-      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
-      preceding_op.emplace(in->inputs[0]);
-      pending_ops[in->inputs[0]].emplace(op);
-    }
-    op_deps[op] = preceding_op.size();
-    if (preceding_op.empty()) {
-      ready_ops.emplace_back(op);
-    }
-  }
-
-  // 3. generated op list based desc order and the topology order
-  std::vector<ir::Node*> ret;
-  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
-
-  auto update_by_found_node = [&](ir::Node* found_node) {
-    for (auto* pending_op : pending_ops[found_node]) {
-      if (--op_deps[pending_op] == 0) {
-        ready_ops.emplace_back(pending_op);
-      }
-    }
-    ready_ops.remove(found_node);
-    ret.emplace_back(found_node);
-  };
-
-  while (!ready_ops.empty()) {
-    bool all_of_ready_op_unmatched = true;
-    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
-      auto op_desc = *it;
-      ir::Node* found_node = nullptr;
-      for (auto* op : ready_ops) {
-        if (IsSameDesc(op->Op(), op_desc)) {
-          found_node = op;
-          break;
-        }
-      }
-
-      // 3.1 op desc deleted by other pass
-      if (found_node == nullptr) {
-        ++it;
-        continue;
-      } else {
-        all_of_ready_op_unmatched = false;
-        it = op_descs_list.erase(it);
-      }
-      update_by_found_node(found_node);
-    }
-
-    // 3.2 op descs are added by other pass
-    // preceding op non empty means some new op descs are
-    // created, but not contained in return node list.
-    // these new op desc may depend on each other.
-    std::list<ir::Node*> prev_ready_ops(ready_ops);
-    if (all_of_ready_op_unmatched) {
-      for (auto op : prev_ready_ops) {
-        update_by_found_node(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE(std::all_of(
-      op_deps.begin(), op_deps.end(),
-      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
-
-  return ret;
-}
-
-ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
-  ops_ = SortOpLikeDescOrder(graph);
-  ConnectNodes();
-}
-
-void ControlFlowGraph::BuildCFGGraph() {
-  // FIXME(dzh): same effect with ConnectNodes, but use the control
-  // link to build dependency graph, it goes wrong in transformer.
-  for (ir::Node* op : ops_) {
-    for (auto& input_var : op->inputs) {
-      if (!input_var->inputs.empty()) {
-        PADDLE_ENFORCE(
-            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
-            "Preceding Op Node of Var Node must be unique");
-        auto* pred_op = input_var->inputs[0];
-        if (pred_op->Op() != nullptr) {
-          predecessors_[op].insert(pred_op);
-          successors_[pred_op].insert(op);
-        }
-      }
-      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
-        uses_[op].insert(input_var->Name());
-      }
-    }
-    for (auto& output_var : op->outputs) {
-      // output var may be used by many op
-      for (auto* succ_op : output_var->outputs) {
-        if (succ_op->Op() != nullptr) {
-          successors_[op].insert(succ_op);
-          predecessors_[succ_op].insert(op);
-        }
-      }
-      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
-        defs_[op].insert(output_var->Name());
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::ConnectNodes() {
-  for (size_t i = 0; i < ops_.size(); ++i) {
-    auto& op = ops_[i];
-    try {
-      auto& next_op = ops_.at(i + 1);
-      successors_[op].insert(next_op);
-      predecessors_[next_op].insert(op);
-    } catch (...) {
-      // do nothing
-    }
-
-    FilterVariables(op->inputs,
-                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
-
-    FilterVariables(op->outputs,
-                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
-  }
-}
-
-void ControlFlowGraph::LiveVariableAnalysis() {
-  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
-  // compute the liveness of for each variable though reversed_ops algorithm.
-  // It iterates the operators from end to begin, compute the live in/live out
-  // variable set for each op, then the diff between in/out will be used for
-  // the variable reuse. For detail refer to
-  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
-  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
-  while (!work_list.empty()) {
-    ir::Node* op = work_list.front();
-    work_list.pop_front();
-    // get the live_in calculated before. Empty if first.
-    auto prev_live_in = std::move(live_in_[op]);
-    for (auto& s : successors_[op]) {
-      for (auto& var : live_in_[s]) {
-        live_out_[op].insert(var);
-      }
-    }
-    for (auto& var : uses_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : live_out_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : defs_[op]) {
-      live_in_[op].erase(var);
-    }
-
-    // If the live_in is not changed, then the liveness analysis of
-    // predecessors is completed.
-    //
-    // Otherwise, recalculate the predecessors liveness
-    if (live_in_[op] != prev_live_in) {
-      for (auto& pre : predecessors_[op]) {
-        work_list.push_back(pre);
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
-                                           const std::string& new_node,
-                                           int begin_idx) {
-  // update graph from begin idx to the end
-  for (size_t i = begin_idx; i != ops_.size(); ++i) {
-    auto* op = ops_[i];
-    if (uses_[op].find(old_node) != uses_[op].end()) {
-      uses_[op].erase(old_node);
-      uses_[op].insert(new_node);
-    }
-    if (defs_[op].find(old_node) != defs_[op].end()) {
-      defs_[op].erase(old_node);
-      defs_[op].insert(new_node);
-    }
-    if (live_in_[op].find(old_node) != live_in_[op].end()) {
-      live_in_[op].erase(old_node);
-      live_in_[op].insert(new_node);
-    }
-    if (live_out_[op].find(old_node) != live_out_[op].end()) {
-      live_out_[op].erase(old_node);
-      live_out_[op].insert(new_node);
-    }
-  }
-}
-
-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
-  auto it = live_in_.find(op);
-  PADDLE_ENFORCE(
-      it != live_in_.end(),
-      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
-  auto it = live_out_.find(op);
-  PADDLE_ENFORCE(
-      it != live_out_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
-  auto it = uses_.find(op);
-  PADDLE_ENFORCE(
-      it != uses_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
-
-std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
-
-ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
-                                               ir::Node* op) const {
-  // in ssa-graph, different version nodes have same name,
-  // this function get the latest version var before target op
-  // It may return nullptr, such as data node.
-  ir::Node* found_node = nullptr;
-  for (auto* node : ops_) {
-    if (node == op) break;
-    for (auto& output : node->outputs) {
-      if (output->Name() == name) {
-        found_node = output;
-      }
-    }
-  }
-  return found_node;
-}
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
 
 REGISTER_PASS(memory_optimize_pass,
               paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
     .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index 3d6b1897f3..593ffc10fc 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -32,20 +32,15 @@
 namespace paddle {
 namespace framework {
 namespace details {
-constexpr char kAllOpDescs[] = "all_op_descs";
-
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
-
-class ControlFlowGraph;
 
 class MemoryOptimizePass : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override;
-
- private:
   // fill the variable map(var_nodes) by version.
   void InitSSAGraphNodes() const;
+
+ private:
   // update program descs
   void RenameVarInGraphDesc(const std::string& var,
                             const std::string& cache_var, size_t idx) const;
@@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass {
 
  private:
   // Reuse Node Pool, Owned.
-  mutable OrderedNodeList pool_;
+  mutable OrderedSet pool_;
   // controlflow Graph
   mutable std::unique_ptr<ControlFlowGraph> cfg_;
   // skip set
@@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
   mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
 };
 
-class ControlFlowGraph {
- public:
-  ControlFlowGraph() = default;
-  // For IR Graph in parallelexecutor
-  explicit ControlFlowGraph(const ir::Graph& graph);
-
-  void LiveVariableAnalysis();
-
-  void RenameVarInCFGGraph(const std::string& old_node,
-                           const std::string& new_node, int begin_idx);
-
-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
-  std::vector<ir::Node*>& Ops();
-
-  // for ssa-graph nodes
-  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
-
- private:
-  void BuildCFGGraph();
-  void ConnectNodes();
-  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
-  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
-  // successors ops use the output variables.
-  NodeListMap successors_;
-  // predecessors ops generated input variables.
-  NodeListMap predecessors_;
-  // variables lived before run current op.
-  VarSetMap live_in_;
-  // variables lived after run current op.
-  VarSetMap live_out_;
-  VarSetMap uses_;  // op inputs
-  VarSetMap defs_;  // op outputs
-
-  std::vector<ir::Node*> ops_;  // op sequence by topology sort
-};
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_optimize_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc
deleted file mode 100644
index 3d3dfa9359..0000000000
--- a/paddle/fluid/framework/details/memory_optimize_pass_test.cc
+++ /dev/null
@@ -1,417 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/memory_optimize_pass.h"
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/details/graph_test_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
-                  paddle::framework::SumOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
-                  paddle::framework::AssignOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
-                  paddle::framework::SumOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-/*
-  https://en.wikipedia.org/wiki/Live_variable_analysis
-  Create a customed classical dependency graph, left row is the instruction
-  number.
-  1. a = 1
-  2. b = a
-  3. c = a
-  4. d = b + c
-  5. e = d
-
-  a--------+
-  |        |
-  b        c
-  |        |
-  d--------+
-  |
-  e
-  Then analysis these variable's liveness range
- */
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
-inline static ProgramDesc FillProgramDesc() {
-  ProgramDesc prog;
-  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"a"});
-    op->SetOutput("Out", {"b"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"a"});
-    op->SetOutput("Out", {"c"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("sum");
-    op->SetInput("X", {"b", "c"});
-    op->SetOutput("Out", {"d"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"d"});
-    op->SetOutput("Out", {"e"});
-  }
-  return prog;
-}
-
-TEST(CFGGraph, IRGraph) {
-  // prepare ir graph
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  ControlFlowGraph cfg(graph);
-  cfg.LiveVariableAnalysis();
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
-  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
-  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
-
-  // test sum op
-  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
-  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
-  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
-}
-
-// 1. normal test
-TEST(SortOpLikeDescOrder, NormalTest) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto nodes = SortOpLikeDescOrder(graph);
-  auto op_descs = prog.Block(0).AllOps();
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    auto node = nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 2. remove some op_desc
-TEST(SortOpLikeDescOrder, RemoveOpDesc) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-  auto nodes = graph.Nodes();
-  auto op_descs = prog.Block(0).AllOps();
-  ir::Node* found_node = nullptr;
-  for (auto node : nodes) {
-    if (node->IsOp() && node->outputs.back()->Name() == "e") {
-      found_node = node;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found_node != nullptr);
-  for (auto it = op_descs.begin(); it != op_descs.end();) {
-    if (IsSameDesc(*it, found_node->Op())) {
-      it = op_descs.erase(it);
-    } else {
-      ++it;
-    }
-  }
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  ir::Node* e = find_node_in_graph("e");
-  ir::Node* d = find_node_in_graph("d");
-  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-  graph.RemoveNode(found_node);
-  graph.RemoveNode(e);
-
-  // other node keeps the same order
-  auto remain_nodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < remain_nodes.size(); ++i) {
-    auto node = remain_nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 3. add some op_desc
-TEST(SortOpLikeDescOrder, AddOpDesc) {
-  auto prog = FillProgramDesc();
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  ir::Graph graph(prog);
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  // cached desc different with real one
-  // mimic the intermidiete pass modify the programdesc.
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto op_descs = prog.Block(0).AllOps();
-
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  ir::Node* node = graph.CreateOpNode(op);
-  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-  ir::Node* b = find_node_in_graph("b");
-  ir::Node* c = find_node_in_graph("c");
-  node->outputs.emplace_back(d1);
-  node->inputs.emplace_back(b);
-  node->inputs.emplace_back(c);
-  d1->inputs.emplace_back(node);
-  b->outputs.emplace_back(node);
-  c->outputs.emplace_back(node);
-  op_descs.insert(op_descs.begin() + 4, op);
-
-  auto nodes = SortOpLikeDescOrder(graph);
-
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    auto node = nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 4. add and delete some op_desc
-TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  // remove sum node
-  auto op_descs = prog.Block(0).AllOps();
-  ir::Node* found_node = nullptr;
-  auto nodes = graph.Nodes();
-  for (auto node : nodes) {
-    if (node->Name() == "sum") {
-      found_node = node;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found_node != nullptr);
-  for (auto it = op_descs.begin(); it != op_descs.end();) {
-    if (IsSameDesc(*it, found_node->Op())) {
-      it = op_descs.erase(it);
-    } else {
-      ++it;
-    }
-  }
-  {
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* c = find_node_in_graph("c");
-    ir::Node* e = find_node_in_graph("e");
-    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
-    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
-    graph.RemoveNode(e);
-    graph.RemoveNode(pending_op);
-    graph.RemoveNode(found_node);
-  }
-
-  // add node
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  {
-    ir::Node* node = graph.CreateOpNode(op);
-    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-    ir::Node* b = find_node_in_graph("b");
-    ir::Node* c = find_node_in_graph("c");
-    node->outputs.emplace_back(d1);
-    node->inputs.emplace_back(b);
-    node->inputs.emplace_back(c);
-    b->outputs.emplace_back(node);
-    c->outputs.emplace_back(node);
-  }
-  op_descs.insert(op_descs.begin() + 2, op);
-
-  // check the order
-  auto mynodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < mynodes.size(); ++i) {
-    auto node = mynodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 5. add and replace some op_desc inplace.
-TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  auto op_descs = prog.Block(0).AllOps();
-  // add node
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  {
-    ir::Node* node = graph.CreateOpNode(op);
-    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-    ir::Node* b = find_node_in_graph("b");
-    ir::Node* c = find_node_in_graph("c");
-    node->outputs.emplace_back(d1);
-    node->inputs.emplace_back(b);
-    node->inputs.emplace_back(c);
-    d1->inputs.emplace_back(node);
-    b->outputs.emplace_back(node);
-    c->outputs.emplace_back(node);
-  }
-
-  op_descs.emplace_back(op);
-
-  // replace op_desc inplace
-  auto nodes = graph.Nodes();
-  ir::Node* found_node = nullptr;
-  for (auto node : nodes) {
-    if (node->IsOp() && node->Op() && node->Name() == "assign") {
-      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
-        found_node = node;
-        break;
-      }
-    }
-  }
-  {
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* e = find_node_in_graph("e");
-    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
-    graph.RemoveNode(found_node);
-  }
-  op_descs.erase(op_descs.begin() + 3);
-
-  auto replace_op = prog.MutableBlock(0)->AppendOp();
-  replace_op->SetType("sum");
-  replace_op->SetInput("X", {"d", "d1"});
-  replace_op->SetOutput("Out", {"e"});
-  {
-    ir::Node* sum2 = graph.CreateOpNode(replace_op);
-    ir::Node* e = find_node_in_graph("e");
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* d1 = find_node_in_graph("d1");
-    sum2->inputs.emplace_back(d);
-    sum2->inputs.emplace_back(d1);
-    sum2->outputs.emplace_back(e);
-    e->inputs.emplace_back(sum2);
-    d->outputs.emplace_back(sum2);
-    d1->outputs.emplace_back(sum2);
-  }
-
-  op_descs.emplace_back(replace_op);
-  // compare op order
-  auto graph_nodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < graph_nodes.size(); ++i) {
-    auto node = graph_nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
index cc2c8bfef9..879fb29d59 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h
index a04c08bc2e..ea3034877f 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.h
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@@ -21,8 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 class SequentialExecutionPass : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
index 03ab2a2b6c..a3ccf677c9 100644
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
   bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
     return in.Name() != out.Name() && details::NodeCanReused(in) &&
            details::NodeCanReused(out) &&
-           details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
+           details::NodeSize(out) <= details::NodeSize(in);
   }
 };
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f61c9e3a91..ff7ef0cce2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
-
-    if (build_strategy_.memory_early_delete_) {
-      auto early_delete_pass =
-          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
-      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = early_delete_pass->Apply(std::move(graph));
-    }
-    VLOG(10) << "MemoryEarlyDeletePass Applied.";
   }
 
   return graph;
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
   graphs.push_back(std::move(graph));
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
+  VLOG(10) << "Eager Deletion Threshold "
+           << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
     for (size_t i = 0; i < graphs.size(); ++i) {
       graphs[i] = member_->PrepareGCAndRefCnts(
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
 
-USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 9536185609..87f0f307d3 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -22,11 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
 
 DEFINE_bool(
     eager_delete_scope, true,
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 327adcc4aa..3495795563 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_bool(benchmark);
 
 namespace paddle {
 namespace memory {
@@ -198,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
                << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (VLOG_IS_ON(3)) {
+    if (FLAGS_benchmark) {
       allocation::GPUMemMonitor.Add(place.device, size);
     }
     if (FLAGS_init_allocated_mem) {
@@ -216,7 +217,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (VLOG_IS_ON(3)) {
+  if (FLAGS_benchmark) {
     allocation::GPUMemMonitor.Minus(place.device, size);
   }
 #else
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 655ce8485d..60b2d83f15 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 6549229e05..4ac7b7c259 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1092,10 +1092,6 @@ All parameter, weight, gradient are variables in Paddle.
           "is_distribution",
           [](const BuildStrategy &self) { return self.is_distribution_; },
           [](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
-      .def_property(
-          "memory_early_delete",
-          [](const BuildStrategy &self) { return self.memory_early_delete_; },
-          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
       .def_property(
           "enable_inplace",
           [](const BuildStrategy &self) { return self.enable_inplace_; },
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 52b260efd1..22212ae9a2 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -148,7 +148,8 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
-        build_strategy.enable_inplace = False if main._is_mem_optimized else True
+        if build_strategy.enable_inplace is None:
+            build_strategy.enable_inplace = False if main._is_mem_optimized else True
         scope = scope if scope is not None else executor.global_scope()
 
         if share_vars_from and not isinstance(share_vars_from,

From 7e399b062848547bc7e57f2e3997cdd531f74725 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Tue, 12 Feb 2019 09:48:39 +0800
Subject: [PATCH 25/78] rename test=develop

---
 .../fluid/contrib/decoder/beam_search_decoder.py   |  6 +++---
 python/paddle/fluid/contrib/inferencer.py          |  4 ++--
 python/paddle/fluid/contrib/trainer.py             |  4 ++--
 python/paddle/fluid/executor.py                    |  4 ++--
 python/paddle/fluid/framework.py                   | 14 +++++++-------
 python/paddle/fluid/imperative/base.py             |  4 ++--
 python/paddle/fluid/initializer.py                 |  4 ++--
 python/paddle/fluid/layers/control_flow.py         |  4 ++--
 python/paddle/fluid/layers/io.py                   |  4 ++--
 python/paddle/fluid/optimizer.py                   |  4 ++--
 python/paddle/fluid/profiler.py                    |  6 +++---
 python/paddle/fluid/recordio_writer.py             |  4 ++--
 python/paddle/fluid/unique_name.py                 |  4 ++--
 python/paddle/fluid/wrapped_decorator.py           |  4 ++--
 14 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
index d0ca4fd485..5854cadb58 100644
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -22,7 +22,7 @@ This API is still under active development and may change drastically.
 
 from __future__ import print_function
 
-from ...wrapped_decorator import contextmanager
+from ...wrapped_decorator import signature_safe_contextmanager
 import numpy as np
 import six
 
@@ -419,7 +419,7 @@ class TrainingDecoder(object):
         self._state_cell = state_cell
         self._state_cell._enter_decoder(self)
 
-    @contextmanager
+    @signature_safe_contextmanager
     def block(self):
         """
         Define the behavior of the decoder for each RNN time step.
@@ -613,7 +613,7 @@ class BeamSearchDecoder(object):
         self._word_dim = word_dim
         self._input_var_dict = input_var_dict
 
-    @contextmanager
+    @signature_safe_contextmanager
     def block(self):
         """
         Define the behavior of the decoder for each RNN time step.
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py
index 41a0d55b57..4f37129234 100644
--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from ..wrapped_decorator import contextmanager
+from ..wrapped_decorator import signature_safe_contextmanager
 
 from .. import core
 
@@ -105,7 +105,7 @@ class Inferencer(object):
 
         return results
 
-    @contextmanager
+    @signature_safe_contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(main_program=self.inference_program):
             with executor.scope_guard(self.scope):
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
index f448c309b0..d27b808438 100644
--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from ..wrapped_decorator import contextmanager
+from ..wrapped_decorator import signature_safe_contextmanager
 import os
 import errno
 import shutil
@@ -453,7 +453,7 @@ class Trainer(object):
             io.save_inference_model(param_path, feeded_var_names, target_vars,
                                     exe)
 
-    @contextmanager
+    @signature_safe_contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(
                 main_program=self.train_program,
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 6c49c56408..8815911eae 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import os
 import multiprocessing
 import numpy as np
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .framework import Program, default_main_program, Variable
 from . import core
@@ -49,7 +49,7 @@ def _switch_scope(scope):
     return ex
 
 
-@contextmanager
+@signature_safe_contextmanager
 def scope_guard(scope):
     """
     Change the global/default scope instance by Python `with` statement. All
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index f94c8136ca..832c97c7de 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import collections
 from collections import defaultdict
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 import os
 import re
 import traceback
@@ -111,7 +111,7 @@ class NameScope(object):
 _name_scope = NameScope()
 
 
-@contextmanager
+@signature_safe_contextmanager
 def name_scope(prefix=None):
     """
     Generate hierarchical name prefix for the operators.
@@ -1775,7 +1775,7 @@ class Program(object):
     def set_op_role_var(self, var_name):
         self._op_role_var = [var_name]
 
-    @contextmanager
+    @signature_safe_contextmanager
     def _optimized_guard(self, param_and_grads):
         """
         A with guard to set :code:`Optimization` :code:`OpRole` and
@@ -1805,7 +1805,7 @@ class Program(object):
         self._op_role_var = tmp_var
         self._current_role = tmp_role
 
-    @contextmanager
+    @signature_safe_contextmanager
     def _lr_schedule_guard(self, is_with_opt=False):
         """
         A with guard to set :code:`LRSched` :code:`OpRole` and
@@ -2459,7 +2459,7 @@ def switch_startup_program(program):
     return prev_program
 
 
-@contextmanager
+@signature_safe_contextmanager
 def program_guard(main_program, startup_program=None):
     """
     Change the global main program and startup program with `with` statement.
@@ -2524,7 +2524,7 @@ def _get_var(name, program=None):
     return program.global_block().var(name)
 
 
-@contextmanager
+@signature_safe_contextmanager
 def _imperative_guard(tracer):
     global _imperative_tracer_
     tmp_trace = _imperative_tracer_
@@ -2535,7 +2535,7 @@ def _imperative_guard(tracer):
     _imperative_tracer_ = tmp_trace
 
 
-@contextmanager
+@signature_safe_contextmanager
 def _imperative_place_guard(place):
     global _imperative_current_expected_place_
     tmp_place = _imperative_current_expected_place_
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 2f8b3534aa..d4525233cc 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from ..wrapped_decorator import contextmanager
+from ..wrapped_decorator import signature_safe_contextmanager
 import numpy as np
 
 from paddle.fluid import core
@@ -24,7 +24,7 @@ def enabled():
     return framework._in_imperative_mode()
 
 
-@contextmanager
+@signature_safe_contextmanager
 def guard(place=None):
     train = framework.Program()
     startup = framework.Program()
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 8f3f03cb1a..e8341be286 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 from . import framework
 import numpy as np
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 from .core import VarDesc
 from . import unique_name
 
@@ -49,7 +49,7 @@ def force_init_on_cpu():
     return _force_init_on_cpu_
 
 
-@contextmanager
+@signature_safe_contextmanager
 def init_on_cpu():
     """
     Force the variable to be inited on CPU.
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 1d639144e2..3a6753b01f 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-from ..wrapped_decorator import contextmanager
+from ..wrapped_decorator import signature_safe_contextmanager
 
 from .layer_function_generator import autodoc, templatedoc
 from .tensor import assign, fill_constant
@@ -1532,7 +1532,7 @@ class DynamicRNN(object):
             outputs={'Out': [x_reordered]})
         return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
 
-    @contextmanager
+    @signature_safe_contextmanager
     def block(self):
         """
         The block for user to define operators in RNN. See the class docstring
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 58c892315f..b88be66906 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-from ..wrapped_decorator import contextmanager
+from ..wrapped_decorator import signature_safe_contextmanager
 import multiprocessing
 import os
 import six
@@ -1116,7 +1116,7 @@ class Preprocessor(object):
     def _is_completed(self):
         return self.sub_block and self.source_var_names and self.sink_var_names
 
-    @contextmanager
+    @signature_safe_contextmanager
     def block(self):
         self.status = Preprocessor.IN_SUB_BLOCK
         self.sub_block = self.main_prog._create_block()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e89103f18d..fbd04f1eb4 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 from collections import defaultdict
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
@@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer):
             },
             stop_gradient=True)
 
-    @contextmanager
+    @signature_safe_contextmanager
     def apply(self, executor, need_restore=True):
         """Apply average values to parameters of current model.
         """
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 08f5b38310..d5670dbc82 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 from . import core
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 import os
 import six
 
@@ -35,7 +35,7 @@ NVPROF_CONFIG = [
 ]
 
 
-@contextmanager
+@signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
     """The CUDA profiler.
     This fuctions is used to profile CUDA program by CUDA runtime application
@@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     core.disable_profiler(key_map[sorted_key], profile_path)
 
 
-@contextmanager
+@signature_safe_contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     """The profiler interface.
     Different from cuda_profiler, this profiler can be used to profile both CPU
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 5302dbb356..aa581f23a1 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -15,14 +15,14 @@
 from __future__ import print_function
 
 import os
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 from . import core
 __all__ = [
     'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
 ]
 
 
-@contextmanager
+@signature_safe_contextmanager
 def create_recordio_writer(filename,
                            compressor=core.RecordIOWriter.Compressor.Snappy,
                            max_num_records=1000):
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index e1ec726ec4..324257c13f 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import collections
-from .wrapped_decorator import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 import six
 import sys
 
@@ -68,7 +68,7 @@ def switch(new_generator=None):
     return old
 
 
-@contextmanager
+@signature_safe_contextmanager
 def guard(new_generator=None):
     if isinstance(new_generator, six.string_types):
         new_generator = UniqueNameGenerator(new_generator)
diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py
index 224afcca5a..7e7dbff656 100644
--- a/python/paddle/fluid/wrapped_decorator.py
+++ b/python/paddle/fluid/wrapped_decorator.py
@@ -15,7 +15,7 @@
 import decorator
 import contextlib
 
-__all__ = ['wrap_decorator', 'contextmanager']
+__all__ = ['wrap_decorator', 'signature_safe_contextmanager']
 
 
 def wrap_decorator(decorator_func):
@@ -27,4 +27,4 @@ def wrap_decorator(decorator_func):
     return __impl__
 
 
-contextmanager = wrap_decorator(contextlib.contextmanager)
+signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)

From fc198b1fea40edd19e47b7a34e8708288a5793c2 Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Tue, 12 Feb 2019 10:39:39 +0800
Subject: [PATCH 26/78] fix fp16 initializer dtype check test=develop (#15669)

---
 python/paddle/fluid/layer_helper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index a172141b3a..7d1636774c 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -302,7 +302,8 @@ class LayerHelper(object):
         if default_initializer is None and attr.initializer is None:
             if isinstance(dtype, core.VarDesc.VarType):
                 if dtype != core.VarDesc.VarType.FP32 and \
-                    dtype != core.VarDesc.VarType.FP64:
+                    dtype != core.VarDesc.VarType.FP64 and \
+                    dtype != core.VarDesc.VarType.FP16:
                     raise TypeError(
                         "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                     )

From eeaa2066e5066baf2d57b3003ced8cb440db0212 Mon Sep 17 00:00:00 2001
From: xuezhong <xuezhongq@gmail.com>
Date: Tue, 12 Feb 2019 04:32:31 +0000
Subject: [PATCH 27/78] add device info to tensor test=develop

---
 paddle/fluid/pybind/pybind.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 6549229e05..0493f60860 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -295,6 +295,7 @@ PYBIND11_MODULE(core, m) {
       .def("_get_float_element", TensorGetElement<float>)
       .def("_set_double_element", TensorSetElement<double>)
       .def("_get_double_element", TensorGetElement<double>)
+      .def("_place", [](Tensor &self) { return self.place(); })
       .def("_dtype", [](Tensor &self) { return self.type(); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
@@ -673,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("is_gpu_place",
+           [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("gpu_device_id",
+           [](platform::Place &self) {
+             return boost::get<platform::CUDAPlace>(self).device;
+           })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;

From fbadd4b60cea88dd1efba1d90b570130cd2f4d1c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 12:57:18 +0800
Subject: [PATCH 28/78] follow comment test=develop

---
 paddle/fluid/framework/details/build_strategy.cc           | 6 +++---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index a81f284268..3f6e00248a 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -133,15 +133,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
-      VLOG(3) << "dist train mode";
+      VLOG(3) << "multi device dist train mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-        VLOG(3) << "allreduce mode";
+        VLOG(3) << "multi device allreduce mode";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-        VLOG(3) << "reduce mode";
+        VLOG(3) << "multi device reduce mode";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 30a3549ffe..24977aabda 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -731,7 +731,6 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
       }
     }
     insert_op = true;
-    need_broadcast_var_ = true;
   } else if (OpHaveRole(*node, OpRole::kDist)) {
     int op_dev_id = CreateDistTrainOp(result, node);
     if (node->Op()->Type() == "concat") {
@@ -925,6 +924,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
+  // only GPU reduce mode need to broadcast parameters to each device.
   if (UseGPU() && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);

From 16ec4b8c8bd5c95c38c39d2a2528027b4a1930b6 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 13:37:25 +0800
Subject: [PATCH 29/78] clean code test=develop

---
 python/paddle/fluid/compiler.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index f3935e22b4..2b69fd89a2 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -120,8 +120,7 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = _is_pserver_mode(
-            self._program) or self._build_strategy.num_trainers > 1
+        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
     def with_inference_optimize(self, config):

From ffd0d1d216edf9c402daf79fb3e0febf48eb2f7c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 13:58:59 +0800
Subject: [PATCH 30/78] clean need_broadcast_var_ test=develop

---
 paddle/fluid/framework/details/multi_devices_graph_pass.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 6d4386538e..21f85dc828 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -174,7 +174,6 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
   int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
-  mutable bool need_broadcast_var_{false};
 };
 
 std::unordered_set<std::string> &MultiDevSSAGraphBuilder();

From d424e5b4c9edf6fdbb2200f04967e2c3bde9f011 Mon Sep 17 00:00:00 2001
From: Yan Xu <yanxu05@baidu.com>
Date: Tue, 12 Feb 2019 15:48:06 +0800
Subject: [PATCH 31/78] add launch mp distributed job py module test=develop
 (#15620)

* add launch mp distributed mode module test=develop

* delete unused file test=develop

* refine usage test=develop

* refine usage test=develop

* move distributed package test=develop

* add to whl package test=develop
---
 python/paddle/__init__.py                     |  1 +
 python/paddle/distributed/__init__.py         | 13 +++++++
 .../paddle/distributed/launch.py              | 38 +++++++++++--------
 python/paddle/fluid/__init__.py               |  1 -
 python/setup.py.in                            |  1 +
 5 files changed, 37 insertions(+), 17 deletions(-)
 create mode 100644 python/paddle/distributed/__init__.py
 rename tools/run_mp.py => python/paddle/distributed/launch.py (83%)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 53746afdb2..fe2ae67ec6 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -25,4 +25,5 @@ import paddle.reader
 import paddle.dataset
 import paddle.batch
 import paddle.compat
+import paddle.distributed
 batch = batch.batch
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
new file mode 100644
index 0000000000..d0c32e2609
--- /dev/null
+++ b/python/paddle/distributed/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/run_mp.py b/python/paddle/distributed/launch.py
similarity index 83%
rename from tools/run_mp.py
rename to python/paddle/distributed/launch.py
index 2485400ab8..03c4078775 100644
--- a/tools/run_mp.py
+++ b/python/paddle/distributed/launch.py
@@ -37,7 +37,7 @@ default_envs = {
 GPUS = 8
 
 
-def start_procs(gpus, cmd, log_dir):
+def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
     procs = []
     log_fns = []
     os.system("mkdir -p %s" % log_dir)
@@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir):
             "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
         })
 
-        print("starting process ", i, cmd, curr_env)
+        print("starting process ", i, entrypoint, entrypoint_args, curr_env)
         fn = open("%s/workerlog.%d" % (log_dir, i), "w")
         log_fns.append(fn)
-        procs.append(
-            subprocess.Popen(
-                cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
+        cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
+        procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))
 
     for i in range(gpus):
         try:
@@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir):
             pass
 
 
-def main():
+def parse_args():
+
     parser = argparse.ArgumentParser(
         description='''start paddle training using multi-process mode.
 NOTE: your train program ***must*** run as distributed nccl2 mode,
@@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training)
         type=int,
         default=8,
         help='start number of processes for every gpu')
-    parser.add_argument(
-        '--cmd',
-        type=str,
-        default="",
-        help='command to run for each process, e.g. python train.py --lr 0.1')
     parser.add_argument(
         '--log_dir',
         type=str,
         default="mylog",
         help='directory to put logs per process.')
-    args = parser.parse_args()
-    if args.cmd == "":
-        parser.print_help()
-        exit(0)
-    start_procs(args.gpus, args.cmd, args.log_dir)
+    parser.add_argument(
+        'entrypoint_script',
+        type=str,
+        help="The entrypoint script to be launched in parallel,"
+        "followed by all the arguments for each process,"
+        "e.g. train.py --lr 0.1")
+    parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    # launch multiple training process
+    start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args,
+                args.log_dir)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 396f36e188..aa1f85734d 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -161,7 +161,6 @@ def __bootstrap__():
             'times_excess_than_required_tmp_allocation',
             'enable_inplace_whitelist'
         ]
-
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/setup.py.in b/python/setup.py.in
index f93f0cd130..a7c1e91f9c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -100,6 +100,7 @@ packages=['paddle',
           'paddle.utils',
           'paddle.dataset',
           'paddle.reader',
+          'paddle.distributed',
           'paddle.fluid',
           'paddle.fluid.imperative',
           'paddle.fluid.proto',

From 7b673bce6ad2b0f01bfef12c12a0510a297d686c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 15:52:02 +0800
Subject: [PATCH 32/78] lookup_table_grad kernel should consider padding_idx
 test=develop

---
 paddle/fluid/operators/lookup_table_op.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index a7d0fd4856..e20f417d76 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
           "must be either LoDTensor or SelectedRows");
     }
 
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
@@ -187,10 +188,12 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       memset(d_table_data, 0, d_table->numel() * sizeof(T));
 
       for (int64_t i = 0; i < ids->numel(); ++i) {
-        PADDLE_ENFORCE_LT(ids_data[i], N);
-        PADDLE_ENFORCE_GE(ids_data[i], 0);
-        for (int j = 0; j < D; ++j) {
-          d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+        if (ids_data[i != padding_idx) {
+          PADDLE_ENFORCE_LT(ids_data[i], N);
+          PADDLE_ENFORCE_GE(ids_data[i], 0);
+          for (int j = 0; j < D; ++j) {
+            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+          }
         }
       }
     }

From 02a585b5c7b7667d20fcae9a8842b8be6ca9e6a3 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Tue, 12 Feb 2019 15:57:16 +0800
Subject: [PATCH 33/78] add details. test=develop

---
 cmake/flags.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 5895657ece..36b533aa4f 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -27,7 +27,6 @@ endfunction()
 
 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -148,6 +147,7 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds # Warnings in Eigen::array
 )
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif(NOT WIN32)
 
 if (APPLE)

From 29a4b21bc8d49067e0e4ce470aedb74b29050b37 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 16:18:06 +0800
Subject: [PATCH 34/78] fix problem test=develop

---
 paddle/fluid/operators/lookup_table_op.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index e20f417d76..56c6e37ae3 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -188,7 +188,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       memset(d_table_data, 0, d_table->numel() * sizeof(T));
 
       for (int64_t i = 0; i < ids->numel(); ++i) {
-        if (ids_data[i != padding_idx) {
+        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
+          // the gradient of padding_idx should be 0, already done by memset, so
+          // do nothing.
+        } else {
           PADDLE_ENFORCE_LT(ids_data[i], N);
           PADDLE_ENFORCE_GE(ids_data[i], 0);
           for (int j = 0; j < D; ++j) {

From 03f091a9d3c0614561e85ed7b686fb3e0a0253e6 Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Tue, 12 Feb 2019 17:32:06 +0800
Subject: [PATCH 35/78] fix api doc test=develop

---
 python/paddle/fluid/layers/nn.py     | 49 ++++++++++++++++++++++++----
 python/paddle/fluid/layers/tensor.py |  6 +++-
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0e4b5aadc0..ea043b0eba 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5935,13 +5935,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
                                 than :attr:`shape`.
         act (str): The non-linear activation to be applied to the reshaped tensor
                    variable.
-        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
-                       operators. If this flag is set :attr:`True`, reuse input
-                       :attr:`x` to reshape, which will change the shape of
-                       tensor variable :attr:`x` and might cause errors when
-                       :attr:`x` is used in multiple operators. If :attr:`False`,
-                       preserve the shape :attr:`x` and create a new output tensor
-                       variable whose data is copied from input x but reshaped.
+        inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
+                       are the same variable, otherwise, the input and output of
+                       ``layers.reshape`` are different variables. Note that if :attr:`x`
+                       is more than one layers' input, ``inplace`` must be :attr:`False`.
         name (str): The name of this layer. It is optional.
 
     Returns:
@@ -8334,6 +8331,44 @@ def stack(x, axis=0):
     If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
     If :code:`axis` is None, it would be replaced with 0.
 
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x[0].data = [ [1.0 , 2.0 ] ]
+            x[0].dims = [1, 2]
+            x[1].data = [ [3.0 , 4.0 ] ]
+            x[1].dims = [1, 2]
+            x[2].data = [ [5.0 , 6.0 ] ]
+            x[2].dims = [1, 2]
+
+          Attrs:
+            axis = 0
+
+          Output:
+            Out.data =[ [ [1.0, 2.0] ],
+                        [ [3.0, 4.0] ],
+                        [ [5.0, 6.0] ] ]
+            Out.dims = [3, 1, 2]
+
+        Case 2:
+          Given
+            x[0].data = [ [1.0 , 2.0 ] ]
+            x[0].dims = [1, 2]
+            x[1].data = [ [3.0 , 4.0 ] ]
+            x[1].dims = [1, 2]
+            x[2].data = [ [5.0 , 6.0 ] ]
+            x[2].dims = [1, 2]
+
+          Attrs:
+            axis = 1 or axis = -2
+
+          Output:
+            Out.data =[ [ [1.0, 2.0]
+                          [3.0, 4.0]
+                          [5.0, 6.0] ] ]
+            Out.dims = [1, 3, 2]
+
     Args:
         x (Variable|list(Variable)|tuple(Variable)): Input variables.
         axis (int|None): The axis along which all inputs are stacked.
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 2153ca254f..af747c3cec 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -567,7 +567,7 @@ def ones(shape, dtype, force_cpu=False):
     It also sets *stop_gradient* to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
+        shape(tuple|list): Shape of output tensor
         dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
 
     Returns:
@@ -578,6 +578,10 @@ def ones(shape, dtype, force_cpu=False):
 
           data = fluid.layers.ones(shape=[1], dtype='int64')
     """
+    assert isinstance(shape, list) or isinstance(
+        shape, tuple), "The shape's type should be list or tuple."
+    assert reduce(lambda x, y: x * y,
+                  shape) > 0, "The shape is invalid: %s." % (str(shape))
     return fill_constant(value=1.0, **locals())
 
 

From 6e0e70619817e6eabcc5320e2acae6cd6e3fe9d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B9=94=E9=BE=99=E9=A3=9E=20Qiao=20Longfei?=
 <qiaolongfei@baidu.com>
Date: Tue, 12 Feb 2019 21:02:30 +0800
Subject: [PATCH 36/78] Revert "cpu reduce mode did not need to broadcast
 params test=develop"

---
 paddle/fluid/framework/details/build_strategy.cc      |  3 ---
 .../framework/details/multi_devices_graph_pass.cc     |  6 ++++--
 .../framework/details/multi_devices_graph_pass.h      |  1 +
 python/paddle/fluid/compiler.py                       | 11 -----------
 4 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 010c8dee6c..f8030c53f7 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -133,15 +133,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
-      VLOG(3) << "multi device dist train mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-        VLOG(3) << "multi device allreduce mode";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-        VLOG(3) << "multi device reduce mode";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 24977aabda..75f922d2cc 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -731,6 +731,7 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
       }
     }
     insert_op = true;
+    need_broadcast_var_ = true;
   } else if (OpHaveRole(*node, OpRole::kDist)) {
     int op_dev_id = CreateDistTrainOp(result, node);
     if (node->Op()->Type() == "concat") {
@@ -924,8 +925,9 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  // only GPU reduce mode need to broadcast parameters to each device.
-  if (UseGPU() && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+  if (need_broadcast_var_ ||
+      (UseGPU() &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 21f85dc828..6d4386538e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -174,6 +174,7 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
   int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
+  mutable bool need_broadcast_var_{false};
 };
 
 std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 2b69fd89a2..ef02429428 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -19,7 +19,6 @@ import sys
 from .. import compat as cpt
 
 from . import core
-from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -35,15 +34,6 @@ def _place_obj(place):
     return p
 
 
-def _is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else framework.default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class CompiledProgram(object):
     """
     Compiles a Program for execution.
@@ -120,7 +110,6 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
     def with_inference_optimize(self, config):

From da9c94da333cf3190c6f9f647139cc567a723f81 Mon Sep 17 00:00:00 2001
From: Gabor Buella <gabor.buella@intel.com>
Date: Wed, 13 Feb 2019 02:41:42 +0100
Subject: [PATCH 37/78] Clang build fixes (#15628)

* Remove some superfluous std::move calls

The std:move triggered a build error (with -Werror):
```
[  9%] Building CXX object paddle/fluid/memory/allocation/CMakeFiles/allocator_facade.dir/allocator_facade.cc.o
/home/tej/code/gbuella_paddle/paddle/fluid/memory/allocation/allocator_facade.cc:86:29: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move]
            [this] { return std::move(CreateAllocatorWithChunk()); }, capacity);
                            ^
/home/tej/code/gbuella_paddle/paddle/fluid/memory/allocation/allocator_facade.cc:86:29: note: remove std::move call here
            [this] { return std::move(CreateAllocatorWithChunk()); }, capacity);
                            ^~~~~~~~~~                          ~
1 error generated.
```

See: https://reviews.llvm.org/D7633

* Remove a superfluous lambda capture from framework/operator.h

```
[ 10%] Building CXX object paddle/fluid/platform/CMakeFiles/device_context.dir/init.cc.o
In file included from /home/tej/code/gbuella_paddle/paddle/fluid/platform/init.cc:19:
/home/tej/code/gbuella_paddle/paddle/fluid/framework/operator.h:229:21: error: lambda capture 'this' is not used [-Werror,-Wunused-lambda-capture]
                   [this](Variable* var) { return var; });
                    ^~~~
1 error generated.
```

Changing it to `return it->second;`, as is in the function below.

* Rethrow an exception (instead of copying it)

```
[ 11%] Building CXX object paddle/fluid/framework/CMakeFiles/operator.dir/operator.cc.o
/home/tej/code/gbuella_paddle/paddle/fluid/framework/operator.cc:191:13: error: local variable 'exception' will be copied despite being thrown by name [-Werror,-Wreturn-std-move]
      throw exception;
            ^~~~~~~~~
/home/tej/code/gbuella_paddle/paddle/fluid/framework/operator.cc:191:13: note: call 'std::move' explicitly to avoid copying
      throw exception;
            ^~~~~~~~~
            std::move(exception)

```

See https://reviews.llvm.org/D43322 for an explanation of this diagnostic message.

* Remove an unused variable

```
/home/tej/code/gbuella_paddle/paddle/fluid/framework/operator.cc:884:16: error: private field 'scope_' is not used [-Werror,-Wunused-private-field]
  const Scope& scope_;
               ^
```

* struct ComputationOpHandle -> class ComputationOpHandle

```
[ 13%] Building CXX object paddle/fluid/framework/details/CMakeFiles/memory_early_delete_pass.dir/memory_early_delete_pass.cc.o
In file included from /home/tej/code/gbuella_paddle/paddle/fluid/framework/details/memory_early_delete_pass.cc:21:
/home/tej/code/gbuella_paddle/paddle/fluid/framework/details/reference_count_pass_helper.h:30:1: error: class 'ComputationOpHandle' was previously declared as a struct; this is valid, but may result in linker errors under the Microsoft C++ ABI [-Werror,-Wmismatched-tags]
class ComputationOpHandle;
^
/home/tej/code/gbuella_paddle/paddle/fluid/framework/details/computation_op_handle.h:29:8: note: previous use is here
struct ComputationOpHandle : public OpHandleBase {
       ^
/home/tej/code/gbuella_paddle/paddle/fluid/framework/details/reference_count_pass_helper.h:30:1: note: did you mean struct here?
class ComputationOpHandle;
^~~~~
struct
1 error generated.
```

* Fix name() methods under fluid/operators

```
In file included from /home/tej/code/gbuella_paddle/paddle/fluid/operators/jit/gen/act.cc:15:
In file included from /home/tej/code/gbuella_paddle/paddle/fluid/operators/jit/gen/act.h:19:
/home/tej/code/gbuella_paddle/paddle/fluid/operators/jit/gen/jitcode.h:71:23: error: 'name' overrides a member function but is not marked 'override' [-Werror,-Winconsistent-missing-override]
  virtual const char* name() const = 0;
                      ^
/home/tej/code/gbuella_paddle/paddle/fluid/operators/jit/gen_base.h:31:23: note: overridden virtual function is here
  virtual const char* name() const = 0;
                      ^
```

test=develop
---
 paddle/fluid/framework/details/computation_op_handle.h   | 2 +-
 .../framework/details/parallel_ssa_graph_executor.cc     | 4 ++--
 paddle/fluid/framework/ir/graph.cc                       | 2 +-
 paddle/fluid/framework/operator.cc                       | 9 ++++-----
 paddle/fluid/framework/operator.h                        | 7 +------
 paddle/fluid/inference/analysis/ir_pass_manager.cc       | 2 +-
 paddle/fluid/inference/api/analysis_predictor.cc         | 2 +-
 paddle/fluid/memory/allocation/allocator_facade.cc       | 2 +-
 paddle/fluid/operators/jit/gen/act.h                     | 1 -
 paddle/fluid/operators/jit/gen/blas.h                    | 2 +-
 paddle/fluid/operators/jit/gen/hopv.h                    | 2 +-
 paddle/fluid/operators/jit/gen/jitcode.h                 | 1 -
 paddle/fluid/operators/jit/gen/matmul.h                  | 2 +-
 paddle/fluid/operators/jit/gen/seqpool.h                 | 2 +-
 paddle/fluid/pybind/inference_api.cc                     | 4 ++--
 15 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index 601ae4f8c6..1e3dbb1e44 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -26,7 +26,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct ComputationOpHandle : public OpHandleBase {
+class ComputationOpHandle : public OpHandleBase {
  public:
   ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
                       size_t scope_idx);
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 128aaa33a2..e8deb5bfc6 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
     if (pool_) {
       run_futures.emplace_back(pool_->enqueue(std::move(call)));
     } else {
-      fetch_data.emplace_back(std::move(call()));
+      fetch_data.emplace_back(call());
     }
   }
 
@@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
       if (exception_holder_.IsCaught()) {
         f.wait();
       } else {
-        fetch_data.emplace_back(std::move(f.get()));
+        fetch_data.emplace_back(f.get());
       }
     }
   }
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 3eb5bdba3b..4b5c846f32 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -76,7 +76,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
       var->inputs.push_back(node);
     }
   }
-  return std::move(var_nodes);
+  return var_nodes;
 }
 
 void Graph::ResolveHazard(
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9d6c10ab9e..b22523e0f4 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
     if (Attrs().count("sub_block") != 0) {
-      throw exception;
+      throw;
     }
 
     auto& callstack = Attr<std::vector<std::string>>(
         OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
 
     if (callstack.empty()) {
-      throw exception;
+      throw;
     }
     std::ostringstream sout;
     sout << "Invoke operator " << Type() << " error.\n";
@@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     sout << "C++ Callstacks: \n";
     sout << exception.err_str_;
     exception.err_str_ = sout.str();
-    throw exception;
+    throw;
   } catch (...) {
     std::rethrow_exception(std::current_exception());
   }
@@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  public:
   RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
                            const RuntimeContext& ctx)
-      : op_(op), scope_(scope), ctx_(ctx) {}
+      : op_(op), ctx_(ctx) {}
 
   bool HasInput(const std::string& name) const override {
     // has only one input
@@ -881,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   const OperatorBase& op_;
-  const Scope& scope_;
   const RuntimeContext& ctx_;
 };
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 40d935a5ff..e33214b44b 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -222,12 +222,7 @@ class ExecutionContext {
     if (it == ctx_.inputs.end()) {
       return {};
     }
-    std::vector<const Variable*> res;
-    res.reserve(it->second.size());
-    std::transform(it->second.begin(), it->second.end(),
-                   std::back_inserter(res),
-                   [this](Variable* var) { return var; });
-    return res;
+    return {it->second.begin(), it->second.end()};
   }
 
   std::vector<Variable*> MultiOutputVar(const std::string& name) const {
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 7476c199cf..8d5ee36ae6 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -101,7 +101,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
     }
     graph = pass->Apply(std::move(graph));
   }
-  return std::move(graph);
+  return graph;
 }
 
 framework::proto::ProgramDesc IRPassManager::AcquireProgram(
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index da2e9803f0..712e010db4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -421,7 +421,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
-  return std::move(predictor);
+  return predictor;
 }
 
 void AnalysisPredictor::PrepareFeedFetch() {
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 794d729bdc..ea0b729dc6 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -83,7 +83,7 @@ class ChunkedAllocator : public Allocator {
         VLOG(1) << "Create AutoIncrementAllocator with chunk_size "
                 << max_chunk_size_ << " and capacity " << capacity;
         default_allocator_ = std::make_shared<AutoIncrementAllocator>(
-            [this] { return std::move(CreateAllocatorWithChunk()); }, capacity);
+            [this] { return CreateAllocatorWithChunk(); }, capacity);
       }
     }
 
diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
index 68e66f9298..1664dfa906 100644
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -63,7 +63,6 @@ class VActFunc : public JitCode {
  public:
   explicit VActFunc(size_t code_size, void* code_ptr)
       : JitCode(code_size, code_ptr) {}
-  virtual const char* name() const = 0;
   virtual void genCode() = 0;
 
  protected:
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
index 66a97c1be5..e991139266 100644
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const {
+  virtual const char* name() const override {
     std::string base = "VXXJitCode";
     if (scalar_index_ == 1) {
       base += "_Scalar";
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
index d3bc94b63d..c336fe73fe 100644
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -35,7 +35,7 @@ class HOPVJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const {
+  virtual const char* name() const override {
     std::string base = "VXXJitCode";
     if (type_ == operand_type::MAX) {
       base += "_MAX";
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index c388109604..91058f6cf6 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -68,7 +68,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
             (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
             code_ptr) {}
 
-  virtual const char* name() const = 0;
   virtual void genCode() = 0;
 
   size_t getSize() const override { return CodeGenerator::getSize(); }
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index 626baa8f73..7976e3112d 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -36,7 +36,7 @@ class MatMulJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const {
+  virtual const char* name() const override {
     std::string base = "MatMulJitCode";
     base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
            std::to_string(k_);
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index fcbbb3c84c..c464c2eac8 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const {
+  virtual const char* name() const override {
     std::string base = "SeqPoolJitCode";
     if (type_ == SeqPoolType::kSum) {
       base += "_Sum";
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 39e47be606..7db2bb451b 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -74,12 +74,12 @@ void BindPaddleBuf(py::module *m) {
       .def(py::init([](std::vector<float> &data) {
         auto buf = PaddleBuf(data.size() * sizeof(float));
         std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
-        return std::move(buf);
+        return buf;
       }))
       .def(py::init([](std::vector<int64_t> &data) {
         auto buf = PaddleBuf(data.size() * sizeof(int64_t));
         std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
-        return std::move(buf);
+        return buf;
       }))
       .def("resize", &PaddleBuf::Resize)
       .def("reset",

From 882e7ec48012d3250572d7f016ec3c1ef5d89433 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Wed, 13 Feb 2019 11:34:41 +0800
Subject: [PATCH 38/78] fix generate doc error in activation ops

test=develop
---
 paddle/fluid/operators/activation_op.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 189db2317d..65efe2966c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -37,7 +37,7 @@ using paddle::framework::Tensor;
           "(bool, default false) Set to true for inference only, false " \
           "for training. Some layers may run faster when this is true.") \
           .SetDefault(false);                                            \
-      AddComment(#OP_COMMENT);                                           \
+      AddComment(OP_COMMENT);                                            \
     }                                                                    \
   }
 
@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
 UNUSED constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator
 
-$$out = \frac{1}{1 + e^{-x}}$$
+$$out = \\frac{1}{1 + e^{-x}}$$
 
 )DOC";
 
@@ -187,14 +187,14 @@ $out = |x|$
 UNUSED constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.
 
-$out = ceil(x)$
+$out = \left \lceil x \right \rceil$
 
 )DOC";
 
 UNUSED constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.
 
-$out = floor(x)$
+$out = \left \lfloor x \right \rfloor$
 
 )DOC";
 
@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$
 UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.
 
-$$out = \frac{x}{1 + |x|}$$
+$$out = \\frac{x}{1 + \|x\|}$$
 
 )DOC";
 

From b1f97a6fa9186266b9a76c8157ab80801e5cf9f0 Mon Sep 17 00:00:00 2001
From: liuwei1031 <liuwei12@baidu.com>
Date: Wed, 13 Feb 2019 04:56:42 +0000
Subject: [PATCH 39/78] fix security issue 27, 38 test=develop

---
 paddle/fluid/framework/ir/infer_clean_graph_pass.cc | 1 +
 paddle/fluid/operators/random_crop_op.h             | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index 7713ed1eab..6607c026a7 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
     std::unordered_set<const Node*> invalid_nodes;
     int valid_op = 0;
     for (auto* node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node);
       if (is_valid_node(node)) {
         invalid_nodes.insert(node);
       } else if (node->IsOp()) {
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index d68ba9d661..ee034b2705 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -121,7 +121,7 @@ struct RandomCropFunctor {
   HOSTDEVICE void operator()(size_t ins_idx) {
     typename Random<DeviceContext>::Engine engine(seed_);
     engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
-    size_t offsets[9];
+    size_t offsets[9] = {};
     for (int i = num_batchsize_dims_; i < rank_; ++i) {
       typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
           0, x_dims_[i] - out_dims_[i]);

From 14fe9219dc9a5769215e471d28b9538b912453bf Mon Sep 17 00:00:00 2001
From: liuwei1031 <liuwei12@baidu.com>
Date: Wed, 13 Feb 2019 05:03:24 +0000
Subject: [PATCH 40/78] reset unexpected changes, test=develop

---
 paddle/fluid/memory/detail/system_allocator.cc | 5 -----
 paddle/fluid/memory/detail/system_allocator.h  | 3 ---
 2 files changed, 8 deletions(-)

diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 3c82c8aa19..197d1c2f21 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -117,11 +117,6 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
   if (result == cudaSuccess) {
     *index = 0;
     gpu_alloc_size_ += size;
-    if (gpu_alloc_size_ > s_memoryMap[gpu_id_]) {
-        s_memoryMap[gpu_id_] = gpu_alloc_size_;
-        VLOG(3) << "device: " << gpu_id_
-          << " maximum memory size : " <<(gpu_alloc_size_ >> 20) << " MiB";
-    }
     return p;
   } else {
     LOG(WARNING)
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 1ac1df6de7..a0386a2dad 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>  // for size_t
-#include <unordered_map>
 
 namespace paddle {
 namespace memory {
@@ -45,8 +44,6 @@ class CPUAllocator : public SystemAllocator {
 #ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
-  std::unordered_map<int, uint64_t> s_memoryMap;
-
   explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
 
   virtual void* Alloc(size_t* index, size_t size);

From 15d7220f9473e1056f988a9a91a5698b55a4eaa9 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 13 Feb 2019 05:51:50 +0000
Subject: [PATCH 41/78] fix jitcode name

test=develop
---
 paddle/fluid/operators/jit/gen/act.h     | 4 ++--
 paddle/fluid/operators/jit/gen/blas.h    | 4 ++--
 paddle/fluid/operators/jit/gen/gru.h     | 4 ++--
 paddle/fluid/operators/jit/gen/hopv.h    | 4 ++--
 paddle/fluid/operators/jit/gen/jitcode.h | 3 ++-
 paddle/fluid/operators/jit/gen/lstm.h    | 4 ++--
 paddle/fluid/operators/jit/gen/matmul.h  | 4 ++--
 paddle/fluid/operators/jit/gen/seqpool.h | 4 ++--
 paddle/fluid/operators/jit/gen_base.h    | 3 ++-
 9 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
index 1664dfa906..13d98577e2 100644
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -268,7 +268,7 @@ class VActJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "VActJitCode";
     switch (type_) {
       case operand_type::RELU:
@@ -292,7 +292,7 @@ class VActJitCode : public VActFunc {
       default:
         break;
     }
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
index e991139266..70312bbe5e 100644
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "VXXJitCode";
     if (scalar_index_ == 1) {
       base += "_Scalar";
@@ -62,7 +62,7 @@ class VXXJitCode : public JitCode {
     }
     base += (with_relu_ ? "_Relu" : "");
     base += "_D" + std::to_string(num_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h
index a4d7222a34..d91f828e6a 100644
--- a/paddle/fluid/operators/jit/gen/gru.h
+++ b/paddle/fluid/operators/jit/gen/gru.h
@@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "GRUJitCode";
     if (id_ == 0) {
       base += "_H1";
@@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc {
     };
     AddTypeStr(act_gate_);
     AddTypeStr(act_cand_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
index c336fe73fe..28d213e5e4 100644
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "VXXJitCode";
     if (type_ == operand_type::MAX) {
       base += "_MAX";
     } else {
       base += "_SUM";
     }
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 91058f6cf6..689df8b1cb 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <string>
 #include <type_traits>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -59,7 +60,7 @@ typedef enum {
 } operand_type;
 
 #define DECLARE_JIT_CODE(codename) \
-  const char* name() const override { return #codename; }
+  std::string name() const override { return #codename; }
 
 class JitCode : public GenBase, public Xbyak::CodeGenerator {
  public:
diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h
index d4753bca23..fa560b6230 100644
--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ b/paddle/fluid/operators/jit/gen/lstm.h
@@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "LSTMJitCode";
     if (use_peephole_) {
       base += "_Peephole";
@@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc {
     AddTypeStr(act_gate_);
     AddTypeStr(act_cand_);
     AddTypeStr(act_cell_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index 7976e3112d..881cea581a 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "MatMulJitCode";
     base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
            std::to_string(k_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index c464c2eac8..4108ee2f46 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "SeqPoolJitCode";
     if (type_ == SeqPoolType::kSum) {
       base += "_Sum";
@@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode {
       base += "_Sqrt";
     }
     base += ("_W" + std::to_string(w_));
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index d808a33247..32a861b209 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -16,6 +16,7 @@
 
 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <string>
 #include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
@@ -28,7 +29,7 @@ namespace jit {
 class GenBase : public Kernel {
  public:
   virtual ~GenBase() = default;
-  virtual const char* name() const = 0;
+  virtual std::string name() const = 0;
   virtual size_t getSize() const = 0;
   virtual const unsigned char* getCodeInternal() = 0;
   template <typename Func>

From ba223e956609fac86e30efaa423dd324e7bc3ecc Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Wed, 13 Feb 2019 15:05:43 +0800
Subject: [PATCH 42/78] doc refine test=develop

---
 python/paddle/fluid/layers/nn.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ea043b0eba..f4c4fc3b65 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8331,6 +8331,8 @@ def stack(x, axis=0):
     If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
     If :code:`axis` is None, it would be replaced with 0.
 
+    For Example:
+
     .. code-block:: text
 
         Case 1:

From fb2a7b230010f194238d557fb9d5fd3f44e98bdf Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 13 Feb 2019 17:31:39 +0800
Subject: [PATCH 43/78] fix aligned-new error in jitkernel (#15626)

* fix aligned-new error in jitkernel

test=develop

* override genbase new to fix mis-align

test=develop
---
 paddle/fluid/operators/jit/gen_base.cc | 17 +++++++++++++++++
 paddle/fluid/operators/jit/gen_base.h  |  5 +++++
 2 files changed, 22 insertions(+)

diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 3cd5f6554b..f3603875ad 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -17,7 +17,13 @@
 #include <iostream>
 #include <sstream>
 #include <vector>
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifndef _WIN32
+#define posix_memalign_free free
+#endif
 
 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
 
@@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }
 
+void* GenBase::operator new(size_t size) {
+  void* ptr;
+  constexpr size_t alignment = 32ul;
+  PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0,
+                    "GenBase Alloc %ld error!", size);
+  PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
+  return ptr;
+}
+
+void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
+
 std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
   int block;
   int max_num_regs;
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index d808a33247..0f85245ba9 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -42,6 +42,11 @@ class GenBase : public Kernel {
     return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
   }
 
+  void* operator new(size_t size);
+  void operator delete(void* ptr);
+  void* operator new[](size_t size) { return operator new(size); }
+  void operator delete[](void* ptr) { operator delete(ptr); }
+
  protected:
   void dumpCode(const unsigned char* code) const;
 };

From 8fc0fc314a51cfea0579d0ac058349b051e688d4 Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrywgz@126.com>
Date: Wed, 13 Feb 2019 09:53:12 +0000
Subject: [PATCH 44/78] support multiple var types for expand op, test=develop

---
 paddle/fluid/operators/expand_op.cc           |  8 +++++--
 paddle/fluid/operators/expand_op.cu           |  8 +++++--
 .../fluid/tests/unittests/test_expand_op.py   | 24 +++++++++++++++++++
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 6aa4c76b9c..44a2f37b66 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -146,7 +146,11 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, bool>);
 REGISTER_OP_CPU_KERNEL(
     expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu
index d95c9b6180..50a506b294 100644
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
@@ -15,7 +15,11 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>);
+    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, bool>);
 REGISTER_OP_CUDA_KERNEL(
     expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
index 67a8d8f072..218fc697f2 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
@@ -109,5 +109,29 @@ class TestExpandOpRank4(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestExpandOpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("int32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestExpandOpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("bool")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == "__main__":
     unittest.main()

From ad61e1b22c1625db7e096d207c4240fdfea9b2b8 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Wed, 13 Feb 2019 05:53:43 -0600
Subject: [PATCH 45/78] fix potential bug (#15688)

test=develop
---
 paddle/fluid/framework/feed_fetch_method.cc          | 1 +
 paddle/fluid/framework/operator.cc                   | 9 ++++++---
 paddle/fluid/memory/allocation/best_fit_allocator.cc | 2 ++
 paddle/fluid/operators/conv_op.cc                    | 4 ++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 6338be75a4..96530b2a3f 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
   // Since we want to fetch LodTensor from a variable, the variable must
   // be created alreadly.
   Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
   PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
                  "Only %s can be invoked by GetFetchVariable",
                  typeid(FeedFetchList).name());
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b22523e0f4..e15c838f4f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -989,11 +989,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& transfer_scope) const {
   for (auto& var_name : inplace_vars) {
     VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    auto* origin_var = scope.FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
+                            var_name);
     auto* original_tensor =
-        GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
     auto* var = transfer_scope.FindVar(var_name);
-    PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
-                   var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
+                            var_name);
     auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
     original_tensor->ShareDataWith(*transformed_tensor);
   }
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 6f3e512fb0..e3d6c2f511 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const {
 }
 void BestFitAllocator::Free(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(bf_allocation,
+                          "The input allocation is not BestFitAllocation.");
   auto chunk_it = bf_allocation->ChunkIterator();
   PADDLE_ENFORCE(!chunk_it->is_free);
   chunk_it->is_free = true;
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index bd788f03e7..fd9f156d07 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -222,7 +222,7 @@ void Conv2DOpMaker::Make() {
       .SetDefault(4096);
   AddAttr<bool>("exhaustive_search",
                 "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
+                "convolution, whether enable exhaustive search "
                 "for cuDNN convolution or not, defalut is False.")
       .SetDefault(false);
   AddComment(R"DOC(
@@ -341,7 +341,7 @@ void Conv3DOpMaker::Make() {
       .SetDefault(4096);
   AddAttr<bool>("exhaustive_search",
                 "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
+                "convolution, whether enable exhaustive search "
                 "for cuDNN convolution or not, defalut is False.")
       .SetDefault(false);
   AddComment(R"DOC(

From 7a8eff36a62944450303f54c39b9830ef37257e5 Mon Sep 17 00:00:00 2001
From: Michal Gallus <michal.gallus@intel.com>
Date: Wed, 13 Feb 2019 10:41:45 +0100
Subject: [PATCH 46/78] Fix old FC backward weights descriptor creation

test=develop
---
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index e595f1a627..3a926a716f 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                                ? mkldnn::inner_product_backward_weights::desc(
                                      src, diff_weights, bias, diff_dst)
                                : mkldnn::inner_product_backward_weights::desc(
-                                     src, diff_weights, bias, diff_dst);
+                                     src, diff_weights, diff_dst);
 
     return mkldnn::inner_product_backward_weights::primitive_desc(
         bwd_weight_desc, engine, pd);

From c47e258ea489a7773eb6a257b969195dec7642a5 Mon Sep 17 00:00:00 2001
From: baojun <32073718+baojun-nervana@users.noreply.github.com>
Date: Wed, 13 Feb 2019 06:06:04 -0800
Subject: [PATCH 47/78] Add ngraph sum, sigmoid, relu_grad and tanh_grad op
 (#15642)

* Added ngraph sum op test=develop

* Added sigmoid, relu_grad and tanh_grad test=develop

* remove duplicates test=develop
---
 .../fluid/operators/ngraph/ngraph_bridge.cc   |  4 ++
 paddle/fluid/operators/ngraph/ngraph_ops.h    |  2 +
 .../operators/ngraph/ops/activation_op.h      | 52 ++++++++++++++++++
 paddle/fluid/operators/ngraph/ops/sum_op.h    | 55 +++++++++++++++++++
 .../ngraph/test_activation_ngraph_op.py       | 12 +---
 .../unittests/ngraph/test_sum_ngraph_op.py    | 19 +++++++
 6 files changed, 133 insertions(+), 11 deletions(-)
 create mode 100644 paddle/fluid/operators/ngraph/ops/activation_op.h
 create mode 100644 paddle/fluid/operators/ngraph/ops/sum_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py

diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index e8b92fc02a..08d72a5b39 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -48,8 +48,12 @@ std::map<std::string,
         {"softmax", NG_OPS::BuildSoftmaxNode},
         {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
         {"scale", NG_OPS::BuildScaleNode},
+        {"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
+        {"sum", NG_OPS::BuildSumNode},
         {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
+        {"relu_grad", NG_OPS::BuildReluGradNode},
         {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
+        {"tanh_grad", NG_OPS::BuildTanhGradNode},
         {"top_k", NG_OPS::BuildTopKNode}};
 
 void NgraphBridge::BuildNgNode(
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
index 438a9c1be9..c7d7392080 100644
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #pragma once
 
 #include "ops/accuracy_op.h"
+#include "ops/activation_op.h"
 #include "ops/batch_norm_op.h"
 #include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
@@ -32,4 +33,5 @@ limitations under the License. */
 #include "ops/pool2d_op.h"
 #include "ops/scale_op.h"
 #include "ops/softmax_op.h"
+#include "ops/sum_op.h"
 #include "ops/top_k_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h
new file mode 100644
index 0000000000..f66080e3aa
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/activation_op.h
@@ -0,0 +1,52 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildReluGradNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto relu_grad = std::make_shared<ngraph::op::ReluBackprop>(out, dout);
+  platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map);
+}
+
+void BuildTanhGradNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto shape = out->get_shape();
+  auto node_const =
+      ngraph::op::Constant::create(ngraph::element::f32, shape, {1});
+  auto result = dout * (node_const - out * out);
+  platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h
new file mode 100644
index 0000000000..97f4ce64aa
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/sum_op.h
@@ -0,0 +1,55 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildSumNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  std::vector<std::string> op_inputs;
+  for (auto& var_name_item : op->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      op_inputs.push_back(var_name);
+      if (ngb_node_map->find(var_name) == ngb_node_map->end()) {
+        PADDLE_THROW("op % input varname %s is not found in var_node_map",
+                     op->Type(), var_name);
+      }
+    }
+  }
+  std::shared_ptr<ngraph::Node>& sum = ngb_node_map->at(op_inputs[0]);
+  for (size_t k = 1; k < op_inputs.size(); ++k) {
+    std::shared_ptr<ngraph::Node>& nodek = ngb_node_map->at(op_inputs[k]);
+    if (nodek->get_element_type() != sum->get_element_type()) {
+      nodek =
+          std::make_shared<ngraph::op::Convert>(nodek, sum->get_element_type());
+    }
+    sum = sum + nodek;
+  }
+  platform::SetOutputNode(op, "Out", sum, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
index 2bd9bf8430..034d7792c1 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
@@ -18,17 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
-
-
-class TestNGRAPHReluDim2(TestRelu):
-    def setUp(self):
-        super(TestNGRAPHReluDim2, self).setUp()
-
-
-class TestNGRAPHTanhDim2(TestTanh):
-    def setUp(self):
-        super(TestNGRAPHTanhDim2, self).setUp()
+from paddle.fluid.tests.unittests.test_activation_op import TestSigmoid, TestRelu, TestTanh
 
 
 class TestNGRAPHReluDim4(TestRelu):
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
new file mode 100644
index 0000000000..ed9fb61802
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
@@ -0,0 +1,19 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+from paddle.fluid.tests.unittests.test_sum_op import TestSumOp, TestSelectedRowsSumOp, TestLoDTensorAndSelectedRowsOp
+
+if __name__ == "__main__":
+    unittest.main()

From a52d5d5095ac7af494400947d4aace90f8309adc Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrywgz@126.com>
Date: Thu, 14 Feb 2019 02:31:39 +0000
Subject: [PATCH 48/78] refine unittest, test=develop

---
 python/paddle/fluid/tests/unittests/test_expand_op.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
index 218fc697f2..690875662e 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
@@ -112,7 +112,10 @@ class TestExpandOpRank4(OpTest):
 class TestExpandOpInteger(OpTest):
     def setUp(self):
         self.op_type = "expand"
-        self.inputs = {'X': np.random.random((2, 4, 5)).astype("int32")}
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int32")
+        }
         self.attrs = {'expand_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
         self.outputs = {'Out': output}
@@ -124,7 +127,7 @@ class TestExpandOpInteger(OpTest):
 class TestExpandOpBoolean(OpTest):
     def setUp(self):
         self.op_type = "expand"
-        self.inputs = {'X': np.random.random((2, 4, 5)).astype("bool")}
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
         self.attrs = {'expand_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
         self.outputs = {'Out': output}

From 5a03b515ae6d7f96c6a7e451fc0607bee5632e00 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Wed, 13 Feb 2019 20:32:05 -0600
Subject: [PATCH 49/78] fix potential bug in async_executor (#15707)

test=develop
---
 paddle/fluid/framework/async_executor.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index 1d9678a1ba..60708bf609 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   auto& block = main_program.Block(0);
   for (auto var_name : fetch_var_names) {
     auto var_desc = block.FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
     auto shapes = var_desc->GetShape();
     PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
                    "var %s: Fetched var has wrong shape, "

From c794ecf641a575e92e2b55ad56b27c42e8b709f5 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Wed, 13 Feb 2019 21:18:09 -0600
Subject: [PATCH 50/78] Remove test_image_classification_resnet from mac CI
 (#15706)

* remove est_image_classification_resnet for mac
test=develop

* increate the timeout
test=develop
---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 4b26bacce9..534411219b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -109,11 +109,12 @@ set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        # change the timeout from 600 to 1200, because in debug mode, this test need more time.
+        set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
+    endif()
 endif()
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # change the timeout from 600 to 900, because in debug mode, this test need more time.
-    set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900)
-endif()
+
 
 if (WITH_NGRAPH)
     add_subdirectory(ngraph)

From f0590947c39ee1e6aabb1245149dc400a8d5c147 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 13 Feb 2019 10:01:24 +0800
Subject: [PATCH 51/78] fix enforce test=develop

---
 paddle/fluid/platform/enforce.h | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 142d38f060..d32f9c8667 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -233,9 +233,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
 #endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA
 
-#define PADDLE_THROW(...)                  \
-  throw ::paddle::platform::EnforceNotMet( \
-      ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
+#define PADDLE_THROW(...)                                            \
+  do {                                                               \
+    throw ::paddle::platform::EnforceNotMet(                         \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
+  } while (0)
 
 #define PADDLE_ENFORCE(COND, ...)                                         \
   do {                                                                    \
@@ -270,23 +272,25 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
  *    extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
  */
-#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
-  do {                                                       \
-    if (UNLIKELY(nullptr == (__VAL))) {                      \
-      PADDLE_THROW(#__VAL " should not be null\n%s",         \
-                   paddle::string::Sprintf("" __VA_ARGS__)); \
-    }                                                        \
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                 \
+  do {                                                      \
+    if (UNLIKELY(nullptr == (__VAL))) {                     \
+      PADDLE_THROW(#__VAL " should not be null\n%s",        \
+                   ::paddle::string::Sprintf(__VA_ARGS__)); \
+    }                                                       \
   } while (0)
 
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
   do {                                                                  \
-    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+    auto __cond1__ = (__VAL0);                                          \
+    auto __cond2__ = (__VAL1);                                          \
+    if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) {                     \
       PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
                    " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
                    #__VAL0, #__VAL1, #__VAL0,                           \
-                   paddle::string::to_string(__VAL0), #__VAL1,          \
-                   paddle::string::to_string(__VAL1),                   \
-                   paddle::string::Sprintf("" __VA_ARGS__));            \
+                   ::paddle::string::to_string(__cond1__), #__VAL1,     \
+                   ::paddle::string::to_string(__cond2__),              \
+                   ::paddle::string::Sprintf(__VA_ARGS__));             \
     }                                                                   \
   } while (0)
 

From c00ed19df2e84ceba337e6f91f5833a1a94bed59 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Thu, 14 Feb 2019 13:27:12 +0800
Subject: [PATCH 52/78] add more comment (#15603)

---
 paddle/fluid/inference/api/paddle_api.h | 62 +++++++++++++++++++------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 8ac8bc5291..f90a74b910 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -16,6 +16,12 @@
 /*! \file paddle_api.h
  */
 
+/*! \mainpage Paddle Inference APIs
+ * \section intro_sec Introduction
+ * The Paddle inference library aims to offer an high performance inference SDK
+ * for Paddle users.
+ */
+
 #include <cassert>
 #include <memory>
 #include <string>
@@ -34,26 +40,49 @@ enum PaddleDType {
 };
 
 /**
- *\brief Memory menager for PaddleTensor.
+ * \brief Memory manager for `PaddleTensor`.
  *
- *The PaddleBuf holds a buffer for data input or output. The memory can be
- *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- *should be reused for better performance.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
  *
- *For user allocated memory, the following API can be used:
- *- PaddleBuf(void* data, size_t length) to set an external memory by
- *specifying
- *  the memory address and length.
- *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ * specifying the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
  *memory.
- *ATTENTION, for user allocated memory, deallocation should be done by users
+ * ATTENTION, for user allocated memory, deallocation should be done by users
  *externally after the program finished. The PaddleBuf won't do any allocation
  *or deallocation.
  *
- *To have the PaddleBuf allocate and manage the memory:
- *- PaddleBuf(size_t length) will allocate a memory of size `length`.
- *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ * To have the PaddleBuf allocate and manage the memory:
+ * - PaddleBuf(size_t length) will allocate a memory of size `length`.
+ * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
  *  if the allocated memory is larger than `length`, nothing will done.
+ *
+ * Usage:
+ *
+ * Let PaddleBuf manage the memory internally.
+ * \code{cpp}
+ * const int num_elements = 128;
+ * PaddleBuf buf(num_elements * sizeof(float));
+ * \endcode
+ *
+ * Or
+ * \code{cpp}
+ * PaddleBuf buf;
+ * buf.Resize(num_elements * sizeof(float));
+ * \endcode
+ * Works the exactly the same.
+ *
+ * One can also make the `PaddleBuf` use the external memory.
+ * \code{cpp}
+ * PaddleBuf buf;
+ * void* external_memory = new float[num_elements];
+ * buf.Reset(external_memory, num_elements*sizeof(float));
+ * ...
+ * delete[] external_memory; // manage the memory lifetime outside.
+ * \endcode
  */
 class PaddleBuf {
  public:
@@ -78,7 +107,7 @@ class PaddleBuf {
   /** Tell whether the buffer is empty.
    */
   bool empty() const { return length_ == 0; }
-  /** Get the memory address.
+  /** Get the data's memory address.
    */
   void* data() const { return data_; }
   /** Get the memory length.
@@ -110,7 +139,8 @@ struct PaddleTensor {
 };
 
 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-/** Tensor without copy, currently only supports AnalysisPredictor.
+
+/** Tensor without copy, currently only supports `AnalysisPredictor`.
  */
 class ZeroCopyTensor {
  public:
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
  *
  * Usage:
  *
+ * \code{.cpp}
  * NativeConfig config;
  * ... // change the configs.
  * auto native_predictor = CreatePaddlePredictor(config);
+ * \endcode
  *
  * FOR EXTENSION DEVELOPER:
  * Different predictors are designated by config type. Similar configs can be

From daac6a05f590e33d4d50d71a97378fe57331f33e Mon Sep 17 00:00:00 2001
From: Wojciech Uss <wojciech.uss@intel.com>
Date: Thu, 14 Feb 2019 08:19:20 +0100
Subject: [PATCH 53/78] Removed duplicated code

This also fixes linking to libpaddle_fluid.so built in debug mode

test=develop
---
 .../analysis/ir_passes/subgraph_detector.cc   | 71 -------------------
 .../analysis/ir_passes/subgraph_detector.h    | 27 +------
 2 files changed, 1 insertion(+), 97 deletions(-)

diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index a64f85ee9a..96befe7f8a 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
   return node.inputs.size() == n;
 }
 
-NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      if (Agent(p).deleted()) {
-        visited.insert(p);
-        to_visit.erase(p);
-      }
-
-      inlink_visited.clear();
-
-      std::copy_if(p->inputs.begin(), p->inputs.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) -> bool { return visited.count(x) != 0; });
-
-      if (inlink_visited.size() == p->inputs.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outputs) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
index ea88edd042..5d11c217b6 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
@@ -30,6 +30,7 @@ namespace inference {
 namespace analysis {
 
 using framework::ir::Graph;
+using framework::ir::NodesTSIterator;
 
 const char kIsFunctionNode[] = "__is_function_node__";
 const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
@@ -132,32 +133,6 @@ struct Agent {
   framework::ir::Node *x_;
 };
 
-// Topological sorting iterator on nodes.
-struct NodesTSIterator
-    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
-  NodesTSIterator() = default;
-  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
-  NodesTSIterator(NodesTSIterator &&other)
-      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-    other.cursor_ = 0;
-  }
-  NodesTSIterator(const NodesTSIterator &other);
-
-  framework::ir::Node &operator*();
-  NodesTSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesTSIterator &operator=(const NodesTSIterator &other);
-  bool operator==(const NodesTSIterator &other);
-  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-  framework::ir::Node *operator->();
-
- private:
-  std::vector<framework::ir::Node *> sorted_;
-  size_t cursor_{0};
-};
-
 // The nodes those have no input will be treated as start points.
 static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
   std::vector<framework::ir::Node *> result;

From 3a5d6e5e64140a1b84363010a9f077c1fd8fb6e1 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Thu, 14 Feb 2019 15:38:15 +0800
Subject: [PATCH 54/78] move passes to src to avoid different behavior in
 deployment (#15705)

---
 .../inference/api/paddle_pass_builder.cc      | 46 ++++++++++++++++++
 .../fluid/inference/api/paddle_pass_builder.h | 47 +------------------
 2 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 039389a4cf..f9c13c2fa8 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
+GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
+  passes_.assign({
+    "infer_clean_graph_pass",                        //
+        "identity_scale_op_clean_pass",              //
+        "conv_affine_channel_fuse_pass",             //
+        "conv_eltwiseadd_affine_channel_fuse_pass",  //
+        "conv_bn_fuse_pass",                         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+        "conv_elementwise_add_act_fuse_pass",   //
+        "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
+#endif
+  });
+
+  for (int i = 6; i >= 3; i--) {
+    passes_.push_back("transpose_flatten" + std::to_string(i) +
+                      "_concat_fuse_pass");
+  }
+  use_gpu_ = true;
+}
+
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
   analysis_passes_.push_back(pass);
 }
 
+CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
+  // NOTE the large fusions should be located in the front, so that they will
+  // not be damaged by smaller ones.
+  passes_.assign({
+      "infer_clean_graph_pass",         //
+      "attention_lstm_fuse_pass",       //
+      "seqpool_concat_fuse_pass",       //
+      "seqconv_eltadd_relu_fuse_pass",  //
+      // "embedding_fc_lstm_fuse_pass", //
+      "fc_lstm_fuse_pass",             //
+      "mul_lstm_fuse_pass",            //
+      "fc_gru_fuse_pass",              //
+      "mul_gru_fuse_pass",             //
+      "seq_concat_fc_fuse_pass",       //
+      "fc_fuse_pass",                  //
+      "repeated_fc_relu_fuse_pass",    //
+      "squared_mat_sub_fuse_pass",     //
+      "conv_bn_fuse_pass",             //
+      "conv_eltwiseadd_bn_fuse_pass",  //
+      "is_test_pass",                  //
+      "identity_scale_op_clean_pass",  //
+  });
+  use_gpu_ = false;
+}
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index aa353f12ca..2524d89fcd 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
  */
 class CpuPassStrategy : public PassStrategy {
  public:
-  CpuPassStrategy() : PassStrategy({}) {
-    // NOTE the large fusions should be located in the front, so that they will
-    // not be damaged by smaller ones.
-    passes_.assign({
-        "infer_clean_graph_pass",         //
-        "attention_lstm_fuse_pass",       //
-        "seqpool_concat_fuse_pass",       //
-        "seqconv_eltadd_relu_fuse_pass",  //
-        // "embedding_fc_lstm_fuse_pass", //
-        "fc_lstm_fuse_pass",             //
-        "mul_lstm_fuse_pass",            //
-        "fc_gru_fuse_pass",              //
-        "mul_gru_fuse_pass",             //
-        "seq_concat_fc_fuse_pass",       //
-        "fc_fuse_pass",                  //
-        "repeated_fc_relu_fuse_pass",    //
-        "squared_mat_sub_fuse_pass",     //
-        "conv_bn_fuse_pass",             //
-        "conv_eltwiseadd_bn_fuse_pass",  //
-        "is_test_pass",                  //
-        "identity_scale_op_clean_pass",  //
-    });
-    use_gpu_ = false;
-  }
+  CpuPassStrategy();
 
   explicit CpuPassStrategy(const CpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {}
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
  */
 class GpuPassStrategy : public PassStrategy {
  public:
-  GpuPassStrategy() : PassStrategy({}) {
-    passes_.assign({
-      "infer_clean_graph_pass",                        //
-          "identity_scale_op_clean_pass",              //
-          "conv_affine_channel_fuse_pass",             //
-          "conv_eltwiseadd_affine_channel_fuse_pass",  //
-          "conv_bn_fuse_pass",                         //
-#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
-                           // guaranteed at least v7
-          "conv_elementwise_add_act_fuse_pass",   //
-          "conv_elementwise_add2_act_fuse_pass",  //
-          "conv_elementwise_add_fuse_pass",       //
-#endif
-    });
-
-    for (int i = 6; i >= 3; i--) {
-      passes_.push_back("transpose_flatten" + std::to_string(i) +
-                        "_concat_fuse_pass");
-    }
-    use_gpu_ = true;
-  }
+  GpuPassStrategy();
 
   explicit GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {

From d453b0dcf72d50501bc59309a971c172ef148e31 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Thu, 14 Feb 2019 16:02:38 +0800
Subject: [PATCH 55/78] add details. test=develop

---
 .../paddle/fluid/transpiler/memory_optimization_transpiler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 52c1aea288..047e0832bc 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -355,6 +355,10 @@ class ControlFlowGraph(object):
                                                  is_forward).dtype()
                         cache_dtype = self._find_var(block_desc, cache_var,
                                                      is_forward).dtype()
+                        if x_dtype != cache_dtype:
+                            if PRINT_LOG:
+                                print("x_dtype and cache_dtyp are different")
+                            continue
 
                         if not compare_shape(x_shape, cache_shape, level):
                             continue

From 869f00ffc6697bdac73271ecbd7257f6937245c2 Mon Sep 17 00:00:00 2001
From: liuhongyu <phliuhongyu@126.com>
Date: Thu, 14 Feb 2019 16:20:37 +0800
Subject: [PATCH 56/78] set lstm lstmp unsed pointer to null

---
 paddle/fluid/operators/lstm_op.h  | 4 ++++
 paddle/fluid/operators/lstmp_op.h | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 7d62d2d020..289f50f52e 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
       }
 
+      // lstm_value.output_value not used in bp, set to null
+      // lstm_grad.state_active_grad not used in bp, set to null
+      lstm_value.output_value = nullptr;
+      lstm_grad.state_active_grad = nullptr;
       int cur_batch_size = bend - bstart;
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 370dd04d14..05ecd3c1ae 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       }
 
       int cur_batch_size = bend - bstart;
+      // lstm_value.output_value not used in bp, set to null
+      // lstm_grad.state_active_grad not used in bp, set to null
+      lstm_value.output_value = nullptr;
+      lstm_grad.state_active_grad = nullptr;
+
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
           gate_act, cell_act, cand_act);

From 393fa6021e78d111d9a76e52fbdd97c4e152e65d Mon Sep 17 00:00:00 2001
From: liuhongyu <phliuhongyu@126.com>
Date: Thu, 14 Feb 2019 16:25:29 +0800
Subject: [PATCH 57/78] set lstm lstmp unsed pointer to nullptr; test=develop

---
 paddle/fluid/operators/lstm_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 289f50f52e..3f110024b2 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -311,8 +311,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
       }
 
-      // lstm_value.output_value not used in bp, set to null
-      // lstm_grad.state_active_grad not used in bp, set to null
+      // lstm_value.output_value not used in bp, set to nullptr
+      // lstm_grad.state_active_grad not used in bp, set to nullptr
       lstm_value.output_value = nullptr;
       lstm_grad.state_active_grad = nullptr;
       int cur_batch_size = bend - bstart;

From 84f067be9405d45436a4326b474f3984ce44d021 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Thu, 14 Feb 2019 17:15:00 +0800
Subject: [PATCH 58/78] update. test=develop

test=develop
---
 .../paddle/fluid/transpiler/memory_optimization_transpiler.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 047e0832bc..ee8cde441f 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -357,7 +357,7 @@ class ControlFlowGraph(object):
                                                      is_forward).dtype()
                         if x_dtype != cache_dtype:
                             if PRINT_LOG:
-                                print("x_dtype and cache_dtyp are different")
+                                print("x_dtype and cache_dtype are different")
                             continue
 
                         if not compare_shape(x_shape, cache_shape, level):

From 029be5fda9b973ec798444b959e7b83e03ade7f1 Mon Sep 17 00:00:00 2001
From: liuhongyu <phliuhongyu@126.com>
Date: Thu, 14 Feb 2019 17:23:20 +0800
Subject: [PATCH 60/78] fix lstmp bug; test=develop

---
 paddle/fluid/operators/lstmp_op.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 05ecd3c1ae..1f11e57dcb 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -405,10 +405,10 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       }
 
       int cur_batch_size = bend - bstart;
-      // lstm_value.output_value not used in bp, set to null
-      // lstm_grad.state_active_grad not used in bp, set to null
-      lstm_value.output_value = nullptr;
-      lstm_grad.state_active_grad = nullptr;
+      // lstmp_value.output_value not used in bp, set to null
+      // lstmp_grad.state_active_grad not used in bp, set to null
+      lstmp_value.output_value = nullptr;
+      lstmp_grad.state_active_grad = nullptr;
 
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,

From 31287cdb4351a7896a6836a868d159c2b29935c2 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Thu, 14 Feb 2019 21:04:25 +0800
Subject: [PATCH 61/78] remove legace v2 code in python/paddle/utils

---
 python/paddle/utils/dump_config.py        |  45 ---
 python/paddle/utils/dump_v2_config.py     |  62 ----
 python/paddle/utils/image_multiproc.py    | 278 ----------------
 python/paddle/utils/make_model_diagram.py | 140 --------
 python/paddle/utils/merge_model.py        |  73 -----
 python/paddle/utils/predefined_net.py     | 381 ----------------------
 6 files changed, 979 deletions(-)
 delete mode 100644 python/paddle/utils/dump_config.py
 delete mode 100644 python/paddle/utils/dump_v2_config.py
 delete mode 100644 python/paddle/utils/image_multiproc.py
 delete mode 100644 python/paddle/utils/make_model_diagram.py
 delete mode 100644 python/paddle/utils/merge_model.py
 delete mode 100644 python/paddle/utils/predefined_net.py

diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
deleted file mode 100644
index 6a96a0a78f..0000000000
--- a/python/paddle/utils/dump_config.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config
-from paddle.proto import TrainerConfig_pb2
-import sys
-
-__all__ = []
-
-if __name__ == '__main__':
-    whole_conf = False
-    binary = False
-    if len(sys.argv) == 2:
-        conf = parse_config(sys.argv[1], '')
-    elif len(sys.argv) == 3:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-    elif len(sys.argv) == 4:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-        if sys.argv[3] == '--whole':
-            whole_conf = True
-        elif sys.argv[3] == '--binary':
-            binary = True
-    else:
-        raise RuntimeError()
-
-    assert isinstance(conf, TrainerConfig_pb2.TrainerConfig)
-
-    if whole_conf:
-        print(conf)
-    else:
-        if binary:
-            sys.stdout.write(conf.model_config.SerializeToString())
-        else:
-            print(conf.model_config)
diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py
deleted file mode 100644
index 5dc2111e37..0000000000
--- a/python/paddle/utils/dump_v2_config.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.layer import parse_network
-from paddle.proto import TrainerConfig_pb2
-
-__all__ = ["dump_v2_config"]
-
-
-def dump_v2_config(topology, save_path, binary=False):
-    """ Dump the network topology to a specified file.
-
-    This function is only used to dump network defined by using PaddlePaddle V2
-    APIs. This function will NOT dump configurations related to PaddlePaddle
-    optimizer.
-
-    :param topology: The output layers (can be more than one layers given in a
-                     Python List or Tuple) of the entire network. Using the
-                     specified layers (if more than one layer is given) as root,
-                     traversing back to the data layer(s), all the layers
-                     connected to the specified output layers will be dumped.
-                     Layers not connceted to the specified will not be dumped.
-    :type topology: LayerOutput|List|Tuple
-    :param save_path: The path to save the dumped network topology.
-    :type save_path: str
-    :param binary: Whether to dump the serialized network topology or not.
-                   The default value is false. NOTE that, if you call this
-                   function to generate network topology for PaddlePaddle C-API,
-                   a serialized version of network topology is required. When
-                   using PaddlePaddle C-API, this flag MUST be set to True.
-    :type binary: bool
-    """
-
-    if isinstance(topology, LayerOutput):
-        topology = [topology]
-    elif isinstance(topology, collections.Sequence):
-        for out_layer in topology:
-            assert isinstance(out_layer, LayerOutput), (
-                "The type of each element in the parameter topology "
-                "should be LayerOutput.")
-    else:
-        raise RuntimeError("Error input type for parameter topology.")
-
-    model_str = parse_network(topology)
-    with open(save_path, "w") as fout:
-        if binary:
-            fout.write(model_str.SerializeToString())
-        else:
-            fout.write(str(model_str))
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
deleted file mode 100644
index d1bbda3fd3..0000000000
--- a/python/paddle/utils/image_multiproc.py
+++ /dev/null
@@ -1,278 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from PIL import Image
-import six
-from six.moves import cStringIO as StringIO
-import multiprocessing
-import functools
-import itertools
-
-from paddle.utils.image_util import *
-from paddle.trainer.config_parser import logger
-
-try:
-    import cv2
-except ImportError:
-    logger.warning("OpenCV2 is not installed, using PIL to process")
-    cv2 = None
-
-__all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]
-
-
-class CvTransformer(ImageTransformer):
-    """
-    CvTransformer used python-opencv to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.shape[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = cv2.resize(im, (new_row, new_col), interpolation=cv2.INTER_CUBIC)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        im: (H x W x K) ndarrays
-        """
-        row, col = im.shape[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        if self.is_color:
-            im = im[start_h:end_h, start_w:end_w, :]
-        else:
-            im = im[start_h:end_h, start_w:end_w]
-        if (self.is_train) and (np.random.randint(2) == 0):
-            if self.is_color:
-                im = im[:, ::-1, :]
-            else:
-                im = im[:, ::-1]
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        # transpose, swap channel, sub mean
-        im = im.astype('float32')
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imdecode(np.fromstring(data, np.uint8), flag)
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imread(file, flag)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-class PILTransformer(ImageTransformer):
-    """
-    PILTransformer used PIL to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.size[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = im.resize((new_row, new_col), Image.ANTIALIAS)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        """
-        row, col = im.size[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        im = im.crop((start_h, start_w, end_h, end_w))
-        if (self.is_train) and (np.random.randint(2) == 0):
-            im = im.transpose(Image.FLIP_LEFT_RIGHT)
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        im = np.array(im, dtype=np.float32)  # convert to numpy.array
-        # transpose, swap channel, sub mean
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        im = Image.open(StringIO(data))
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        im = Image.open(file)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-def job(is_img_string, transformer, data_label_pack):
-    (data, label) = data_label_pack
-    if is_img_string:
-        return transformer.transform_from_string(data), label
-    else:
-        return transformer.transform_from_file(data), label
-
-
-class MultiProcessImageTransformer(object):
-    def __init__(self,
-                 procnum=10,
-                 resize_size=None,
-                 crop_size=None,
-                 transpose=(2, 0, 1),
-                 channel_swap=None,
-                 mean=None,
-                 is_train=True,
-                 is_color=True,
-                 is_img_string=True):
-        """
-        Processing image with multi-process. If it is used in PyDataProvider,
-        the simple usage for CNN is as follows:
-
-        .. code-block:: python
-
-            def hool(settings, is_train,  **kwargs):
-                settings.is_train = is_train
-                settings.mean_value = np.array([103.939,116.779,123.68], dtype=np.float32)
-                settings.input_types = [
-                    dense_vector(3 * 224 * 224),
-                    integer_value(1)]
-                settings.transformer = MultiProcessImageTransformer(
-                    procnum=10,
-                    resize_size=256,
-                    crop_size=224,
-                    transpose=(2, 0, 1),
-                    mean=settings.mean_values,
-                    is_train=settings.is_train)
-
-
-            @provider(init_hook=hook, pool_size=20480)
-            def process(settings, file_list):
-                with open(file_list, 'r') as fdata:
-                    for line in fdata:
-                        data_dic = np.load(line.strip()) # load the data batch pickled by Pickle.
-                        data = data_dic['data']
-                        labels = data_dic['label']
-                        labels = np.array(labels, dtype=np.float32)
-                        for im, lab in settings.dp.run(data, labels):
-                            yield [im.astype('float32'), int(lab)]
-
-        :param procnum: processor number.
-        :type procnum: int
-        :param resize_size: the shorter edge size of image after resizing.
-        :type resize_size: int
-        :param crop_size: the croping size.
-        :type crop_size: int
-        :param transpose: the transpose order, Paddle only allow C * H * W order.
-        :type transpose: tuple or list
-        :param channel_swap: the channel swap order, RGB or BRG.
-        :type channel_swap: tuple or list
-        :param mean: the mean values of image, per-channel mean or element-wise mean.
-        :type mean: array, The dimension is 1 for per-channel mean.
-                    The dimension is 3 for element-wise mean.
-        :param is_train: training peroid or testing peroid.
-        :type is_train: bool.
-        :param is_color: the image is color or gray.
-        :type is_color: bool.
-        :param is_img_string: The input can be the file name of image or image string.
-        :type is_img_string: bool.
-        """
-
-        self.procnum = procnum
-        self.pool = multiprocessing.Pool(procnum)
-        self.is_img_string = is_img_string
-        if cv2 is not None:
-            self.transformer = CvTransformer(resize_size, crop_size, transpose,
-                                             channel_swap, mean, is_train,
-                                             is_color)
-        else:
-            self.transformer = PILTransformer(resize_size, crop_size, transpose,
-                                              channel_swap, mean, is_train,
-                                              is_color)
-
-    def run(self, data, label):
-        fun = functools.partial(job, self.is_img_string, self.transformer)
-        return self.pool.imap_unordered(
-            fun, six.moves.zip(data, label), chunksize=100 * self.procnum)
diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py
deleted file mode 100644
index 52759d3ad2..0000000000
--- a/python/paddle/utils/make_model_diagram.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Generate dot diagram file for the given paddle model config
-# The generated file can be viewed using Graphviz (http://graphviz.org)
-
-from __future__ import print_function
-
-import six
-import sys
-import traceback
-
-from paddle.trainer.config_parser import parse_config
-
-
-def make_layer_label(layer_config):
-    label = '%s type=%s' % (layer_config.name, layer_config.type)
-    if layer_config.reversed:
-        label += ' <=='
-
-    label2 = ''
-    if layer_config.active_type:
-        label2 += 'act=%s ' % layer_config.active_type
-    if layer_config.bias_parameter_name:
-        label2 += 'bias=%s ' % layer_config.bias_parameter_name
-
-    if label2:
-        label += '\l' + label2
-    return label
-
-
-def make_diagram(config_file, dot_file, config_arg_str):
-    config = parse_config(config_file, config_arg_str)
-    make_diagram_from_proto(config.model_config, dot_file)
-
-
-def make_diagram_from_proto(model_config, dot_file):
-    # print >> sys.stderr, config
-    name2id = {}
-    f = open(dot_file, 'w')
-    submodel_layers = set()
-
-    def make_link(link):
-        return 'l%s -> l%s;' % (name2id[link.layer_name],
-                                name2id[link.link_name])
-
-    def make_mem(mem):
-        s = ''
-        if mem.boot_layer_name:
-            s += 'l%s -> l%s;\n' % (name2id[mem.boot_layer_name],
-                                    name2id[mem.layer_name])
-        s += 'l%s -> l%s [style=dashed];' % (name2id[mem.layer_name],
-                                             name2id[mem.link_name])
-        return s
-
-    print('digraph graphname {', file=f)
-    print('node [width=0.375,height=0.25];', file=f)
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        name2id[l.name] = i
-
-    i = 0
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        print('subgraph cluster_%s {' % i, file=f)
-        print('style=dashed;', file=f)
-        label = '%s ' % sub_model.name
-        if sub_model.reversed:
-            label += '<=='
-        print('label = "%s";' % label, file=f)
-        i += 1
-        submodel_layers.add(sub_model.name)
-        for layer_name in sub_model.layer_names:
-            submodel_layers.add(layer_name)
-            lid = name2id[layer_name]
-            layer_config = model_config.layers[lid]
-            label = make_layer_label(layer_config)
-            print('l%s [label="%s", shape=box];' % (lid, label), file=f)
-        print('}', file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        if l.name not in submodel_layers:
-            label = make_layer_label(l)
-            print('l%s [label="%s", shape=box];' % (i, label), file=f)
-
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        for link in sub_model.in_links:
-            print(make_link(link), file=f)
-        for link in sub_model.out_links:
-            print(make_link(link), file=f)
-        for mem in sub_model.memories:
-            print(make_mem(mem), file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        for l in model_config.layers[i].inputs:
-            print(
-                'l%s -> l%s [label="%s"];' % (name2id[l.input_layer_name], i,
-                                              l.input_parameter_name),
-                file=f)
-
-    print('}', file=f)
-    f.close()
-
-
-def usage():
-    print(
-        ("Usage: python show_model_diagram.py" +
-         " CONFIG_FILE DOT_FILE [config_str]"),
-        file=sys.stderr)
-    exit(1)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3 or len(sys.argv) > 4:
-        usage()
-
-    config_file = sys.argv[1]
-    dot_file = sys.argv[2]
-    config_arg_str = sys.argv[3] if len(sys.argv) == 4 else ''
-
-    try:
-        make_diagram(config_file, dot_file, config_arg_str)
-    except:
-        traceback.print_exc()
-        raise
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
deleted file mode 100644
index b74649e936..0000000000
--- a/python/paddle/utils/merge_model.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gzip
-import struct
-import os
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.parameters import Parameters
-from paddle.proto import ModelConfig_pb2
-from paddle.v2.topology import Topology
-
-
-def merge_v2_model(net, param_file, output_file):
-    '''Merge the model config and parameters into one file.
-
-    The model configuration file describes the model structure which
-    ends with .py. The parameters file stores the parameters of the model
-    which ends with .tar.gz.
-
-    @param  net            The output layer of the network for inference.
-    @param  param_file     Path of the parameters (.tar.gz) which is stored by
-                           v2 api.
-    @param  output_file    Path of the merged file which will be generated.
-
-    Usage:
-
-        from paddle.utils.merge_model import merge_v2_model
-        # import your network configuration
-        from example_net import net_conf
-
-        net = net_conf(is_predict=True)
-        param_file = './param_pass_00000.tar.gz'
-        output_file = './output.paddle'
-
-        merge_v2_model(net, param_file, output_file)
-
-    '''
-
-    assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network for inference"
-    assert os.path.exists(param_file), \
-            "The model parameters file %s does not exists " % (param_file)
-
-    model_proto = Topology(net).proto()
-    assert isinstance(model_proto, ModelConfig_pb2.ModelConfig)
-
-    with gzip.open(param_file) as f:
-        params = Parameters.from_tar(f)
-
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    with open(output_file, 'w') as f:
-        param_names = [param.name for param in model_proto.parameters]
-        conf_str = model_proto.SerializeToString()
-        f.write(struct.pack('q', len(conf_str)))
-        f.write(conf_str)
-        for pname in param_names:
-            params.serialize(pname, f)
-
-    print('Generate  %s  success!' % (output_file))
diff --git a/python/paddle/utils/predefined_net.py b/python/paddle/utils/predefined_net.py
deleted file mode 100644
index 2801f4877c..0000000000
--- a/python/paddle/utils/predefined_net.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import six
-import os
-from paddle.trainer.config_parser import *
-from paddle.utils.preprocess_img import \
-    ImageClassificationDatasetCreater
-from paddle.trainer_config_helpers import *
-
-
-def image_data(data_dir,
-               processed_image_size,
-               overwrite=False,
-               color=True,
-               train_list="batches/train.list",
-               test_list="batches/test.list",
-               meta_file="batches/batches.meta",
-               use_jpeg=1):
-    """
-    Predefined image data provider for image classification.
-    train_list: a text file containing a list of training batches.
-    test_list: a text file containing a list of test batches.
-    processed_image_size: all the input images will be resized into this size.
-       If the image is not square. Then the shorter edge will be resized into
-       this size, and the aspect ratio is kept the same.
-    color: whether the images are color or gray.
-    meta_path: the path of the meta file that stores the mean image file and
-               other dataset information, such as the size of images,
-               the size of the mean image, the number of classes.
-    async_load_data: whether to load image data asynchronuously.
-    """
-    data_creator = ImageClassificationDatasetCreater(
-        data_dir, processed_image_size, color)
-    batch_data_dir = data_dir
-    train_list = os.path.join(batch_data_dir, train_list)
-    test_list = os.path.join(batch_data_dir, test_list)
-    meta_path = os.path.join(batch_data_dir, meta_file)
-    image_size = processed_image_size
-    conf = np.load(meta_path)
-    mean_image_size = conf["mean_image_size"]
-    is_color = conf["color"]
-    num_classes = conf["num_classes"]
-    color_string = "color" if is_color else "gray"
-
-    args = {
-        'meta': meta_path,
-        'mean_img_size': mean_image_size,
-        'img_size': image_size,
-        'num_classes': num_classes,
-        'use_jpeg': use_jpeg != 0,
-        'color': color_string
-    }
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module='image_provider',
-        obj='processData',
-        args=args)
-    return {
-        "image_size": image_size,
-        "num_classes": num_classes,
-        "is_color": is_color
-    }
-
-
-def get_extra_layer_attr(drop_rate):
-    if drop_rate == 0:
-        return None
-    else:
-        return ExtraLayerAttribute(drop_rate=drop_rate)
-
-
-def image_data_layers(image_size, num_classes, is_color=False,
-                      is_predict=False):
-    """
-    Data layers for image classification.
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    is_predict: whether the network is used for prediction.
-    """
-    num_image_channels = 3 if is_color else 1
-    data_input = data_layer("input",
-                            image_size * image_size * num_image_channels)
-    if is_predict:
-        return data_input, None, num_image_channels
-    else:
-        label_input = data_layer("label", 1)
-        return data_input, label_input, num_image_channels
-
-
-def simple_conv_net(data_conf, is_color=False):
-    """
-    A Wrapper for a simple network for MNIST digit recognition.
-    It contains two convolutional layers, one fully conencted layer, and
-    one softmax layer.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    filter_sizes = [5, 5]
-    num_channels = [32, 64]
-    strides = [1, 1]
-    fc_dims = [500]
-    conv_bn_pool1 = img_conv_bn_pool(
-        name="g1",
-        input=data_input,
-        filter_size=filter_sizes[0],
-        num_channel=num_image_channels,
-        num_filters=num_channels[0],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    conv_bn_pool2 = img_conv_bn_pool(
-        name="g2",
-        input=conv_bn_pool1,
-        filter_size=filter_sizes[1],
-        num_channel=num_channels[0],
-        num_filters=num_channels[1],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    fc3 = fc_layer(
-        name="fc3", input=conv_bn_pool2, dim=fc_dims[0], act=ReluActivation())
-    fc3_dropped = dropout_layer(name="fc3_dropped", input=fc3, dropout_rate=0.5)
-    output = fc_layer(
-        name="output",
-        input=fc3_dropped,
-        dim=fc_dims[0],
-        act=SoftmaxActivation())
-    if is_predict:
-        end_of_network(output)
-    else:
-        cost = classify(name="cost", input=output, label=label_input)
-        end_of_network(cost)
-
-
-def conv_layer_group(prefix_num,
-                     num_layers,
-                     input,
-                     input_channels,
-                     output_channels,
-                     drop_rates=[],
-                     strides=[],
-                     with_bn=[]):
-    """
-    A set of convolution layers, and batch normalization layers,
-    followed by one pooling layer.
-    It is utilized in VGG network for image classifcation.
-    prefix_num: the prefix number of the layer names.
-                For example, if prefix_num = 1, the first convolutioal layer's
-                name will be conv_1_1.
-    num_layers: number of the convolutional layers.
-    input: the name of the input layer.
-    input_channels: the number of channels of the input feature map.
-    output_channels: the number of channels of the output feature map.
-    drop_rates: the drop rates of the BN layers. It will be all zero by default.
-    strides: the stride of the convolution for the layers.
-             It will be all 1 by  default.
-    with_bn: whether to use Batch Normalization for Conv layers.
-             By default,  it is all false.
-    """
-    if len(drop_rates) == 0: drop_rates = [0] * num_layers
-    if len(strides) == 0: strides = [1] * num_layers
-    if len(with_bn) == 0: with_bn = [False] * num_layers
-    assert (len(drop_rates) == num_layers)
-    assert (len(strides) == num_layers)
-
-    for i in range(1, num_layers + 1):
-        if i == 1:
-            i_conv_in = input
-        else:
-            i_conv_in = group_output
-        i_channels_conv = input_channels if i == 1 else output_channels
-        conv_act = LinearActivation() if with_bn[i - 1] else ReluActivation()
-        conv_output = img_conv_layer(
-            name="conv%d_%d" % (prefix_num, i),
-            input=i_conv_in,
-            filter_size=3,
-            num_channels=i_channels_conv,
-            num_filters=output_channels,
-            stride=strides[i - 1],
-            padding=1,
-            act=conv_act)
-        if with_bn[i - 1]:
-            bn = batch_norm_layer(
-                name="conv%d_%d_bn" % (prefix_num, i),
-                input=conv_output,
-                num_channels=output_channels,
-                act=ReluActivation(),
-                layer_attr=get_extra_layer_attr(drop_rate=drop_rates[i - 1]))
-            group_output = bn
-        else:
-            group_output = conv_output
-    pool = img_pool_layer(
-        name="pool%d" % prefix_num,
-        input=group_output,
-        pool_size=2,
-        num_channels=output_channels,
-        stride=2)
-    return pool
-
-
-def vgg_conv_net(image_size,
-                 num_classes,
-                 num_layers,
-                 channels,
-                 strides,
-                 with_bn,
-                 fc_dims,
-                 drop_rates,
-                 drop_rates_fc=[],
-                 is_color=True,
-                 is_predict=False):
-    """
-    A Wrapper for a VGG network for image classification.
-    It is a set of convolutional groups followed by several fully
-    connected layers, and a cross-entropy classifiation loss.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    num_layers: the number of layers for all the convolution groups.
-    channels: the number of output filters for all the convolution groups.
-    with_bn: whether each layer of a convolution group is followed by a
-    batch normalization.
-    drop_rates: the dropout rates for all the convolutional layers.
-    fc_dims: the dimension for all the fully connected layers.
-    is_color: whether the input images are color.
-    """
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    assert (len(num_layers) == len(channels))
-    assert (len(num_layers) == len(strides))
-    assert (len(num_layers) == len(with_bn))
-    num_fc_layers = len(fc_dims)
-    assert (num_fc_layers + 1 == len(drop_rates_fc))
-
-    for i in range(len(num_layers)):
-        input_layer = data_input if i == 0 else group_output
-        input_channels = 3 if i == 0 else channels[i - 1]
-        group_output = conv_layer_group(
-            prefix_num=i + 1,
-            num_layers=num_layers[i],
-            input=input_layer,
-            input_channels=input_channels,
-            output_channels=channels[i],
-            drop_rates=drop_rates[i],
-            strides=strides[i],
-            with_bn=with_bn[i])
-    conv_output_name = group_output
-    if drop_rates_fc[0] != 0.0:
-        dropped_pool_name = "pool_dropped"
-        conv_output_name = dropout_layer(
-            name=dropped_pool_name,
-            input=conv_output_name,
-            dropout_rate=drop_rates_fc[0])
-    for i in range(len(fc_dims)):
-        input_layer_name = conv_output_name if i == 0 else fc_output
-        active_type = LinearActivation() if i == len(
-            fc_dims) - 1 else ReluActivation()
-        drop_rate = 0.0 if i == len(fc_dims) - 1 else drop_rates_fc[i + 1]
-        fc_output = fc_layer(
-            name="fc%d" % (i + 1),
-            input=input_layer_name,
-            size=fc_dims[i],
-            act=active_type,
-            layer_attr=get_extra_layer_attr(drop_rate))
-    bn = batch_norm_layer(
-        name="fc_bn",
-        input=fc_output,
-        num_channels=fc_dims[len(fc_dims) - 1],
-        act=ReluActivation(),
-        layer_attr=get_extra_layer_attr(drop_rate=drop_rates_fc[-1]))
-    output = fc_layer(
-        name="output", input=bn, size=num_classes, act=SoftmaxActivation())
-    if is_predict:
-        outputs(output)
-    else:
-        cost = classification_cost(name="cost", input=output, label=label_input)
-        outputs(cost)
-
-
-def vgg16_conv_net(image_size, num_classes, is_color=True, is_predict=False):
-    """
-    A Wrapper for a 16 layers VGG network for image classification.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    """
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3, 3],
-                 channels=[64, 128, 256, 512, 512],
-                 strides=[[], [], [], [], []],
-                 with_bn=[[False, True], [False, True], [False, False, True], \
-                          [False, False, True], [False, False, True]],
-                 drop_rates=[[]] * 5,
-                 drop_rates_fc=[0.0, 0.5, 0.5],
-                 fc_dims=[4096, 4096],
-                 is_predict=is_predict)
-
-
-def small_vgg(data_conf, is_predict=False):
-    """
-    A Wrapper for a small VGG network for CIFAR-10 image classification.
-    The detailed architecture of the paper can be found here:
-      92.45% on CIFAR-10 in Torch
-      http://torch.ch/blog/2015/07/30/cifar.html
-    Due to the constraints of CuDNN, it only has four convolutional groups
-    rather than five.
-    Thus, it only achieves 91.2% test accuracy and 98.1% training accuracy.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3],
-                 channels=[64, 128, 256, 512],
-                 strides=[[], [], [], []],
-                 with_bn=[[True, True], [True, True], [True, True, True], \
-                          [True, True, True]],
-                 drop_rates=[[0.3, 0.0], [0.4, 0.0],
-                             [0.4, 0.4, 0.0], [0.4, 0.4, 0.0]],
-                 drop_rates_fc=[0.5, 0.5],
-                 fc_dims=[512],
-                 is_predict=is_predict)
-
-
-def training_settings(learning_rate=0.1,
-                      batch_size=128,
-                      algorithm="sgd",
-                      momentum=0.9,
-                      decay_rate=0.001):
-    """
-    Training settings.
-    learning_rate: learning rate of the training.
-    batch_size: the size of each training batch.
-    algorithm: training algorithm, can be
-       - sgd
-       - adagrad
-       - adadelta
-       - rmsprop
-    momentum: momentum of the training algorithm.
-    decay_rate: weight decay rate.
-    """
-    Settings(
-        algorithm=algorithm,
-        batch_size=batch_size,
-        learning_rate=learning_rate / float(batch_size))
-    default_momentum(momentum)
-    default_decay_rate(decay_rate * batch_size)

From 031d995080ae40b65dfa3f548c0387f7000fb2a4 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Thu, 14 Feb 2019 21:09:07 +0800
Subject: [PATCH 62/78] remove legacy v2 codes in benchmark

---
 benchmark/IntelOptimizedPaddle.md             | 112 --------
 benchmark/README.md                           | 168 ------------
 benchmark/fluid/Dockerfile                    |   3 -
 .../{paddle/image => fluid}/check_env.sh      |   0
 benchmark/paddle/image/alexnet.py             |  93 -------
 benchmark/paddle/image/googlenet.py           | 245 ------------------
 benchmark/paddle/image/plotlog.py             | 114 --------
 benchmark/paddle/image/provider.py            |  47 ----
 benchmark/paddle/image/resnet.py              | 230 ----------------
 benchmark/paddle/image/run.sh                 |  53 ----
 benchmark/paddle/image/run_mkl_infer.sh       |  89 -------
 benchmark/paddle/image/run_mkl_train.sh       |  54 ----
 benchmark/paddle/image/run_openblas_infer.sh  |  71 -----
 benchmark/paddle/image/run_openblas_train.sh  |  43 ---
 .../paddle/image/smallnet_mnist_cifar.py      |  49 ----
 benchmark/paddle/image/vgg.py                 | 119 ---------
 benchmark/paddle/rnn/imdb.py                  |  60 -----
 benchmark/paddle/rnn/provider.py              |  86 ------
 benchmark/paddle/rnn/rnn.py                   |  38 ---
 benchmark/paddle/rnn/run.sh                   |  52 ----
 20 files changed, 1726 deletions(-)
 delete mode 100644 benchmark/IntelOptimizedPaddle.md
 delete mode 100644 benchmark/README.md
 rename benchmark/{paddle/image => fluid}/check_env.sh (100%)
 delete mode 100644 benchmark/paddle/image/alexnet.py
 delete mode 100644 benchmark/paddle/image/googlenet.py
 delete mode 100644 benchmark/paddle/image/plotlog.py
 delete mode 100644 benchmark/paddle/image/provider.py
 delete mode 100644 benchmark/paddle/image/resnet.py
 delete mode 100755 benchmark/paddle/image/run.sh
 delete mode 100755 benchmark/paddle/image/run_mkl_infer.sh
 delete mode 100755 benchmark/paddle/image/run_mkl_train.sh
 delete mode 100755 benchmark/paddle/image/run_openblas_infer.sh
 delete mode 100755 benchmark/paddle/image/run_openblas_train.sh
 delete mode 100644 benchmark/paddle/image/smallnet_mnist_cifar.py
 delete mode 100644 benchmark/paddle/image/vgg.py
 delete mode 100755 benchmark/paddle/rnn/imdb.py
 delete mode 100644 benchmark/paddle/rnn/provider.py
 delete mode 100755 benchmark/paddle/rnn/rnn.py
 delete mode 100755 benchmark/paddle/rnn/run.sh

diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
deleted file mode 100644
index 8b7dc5b7db..0000000000
--- a/benchmark/IntelOptimizedPaddle.md
+++ /dev/null
@@ -1,112 +0,0 @@
-# Benchmark
-
-Machine:
-
-- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop: TBD
-
-System: CentOS release 6.3 (Final), Docker 1.12.1.
-
-PaddlePaddle:
-- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
-  - MKL-DNN tag v0.11
-  - MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
-  - OpenBLAS v0.2.20
-	 
-On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
-
-## Benchmark Model
-
-### Server
-
-#### Training
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
-
-Input image size - 3 * 224 * 224, Time: images/second
-
-- VGG-19
-
-| BatchSize    | 64    | 128  | 256     |
-|--------------|-------| -----| --------|
-| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
-| MKLML        | 12.12 | 13.70 | 16.18  |
-| MKL-DNN      | 28.46 | 29.83 | 30.44  |
-
-<img src="figs/vgg-cpu-train.png" width="500">
-
- - ResNet-50
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
-| MKLML        | 32.52 | 31.89 | 33.12  |
-| MKL-DNN      | 81.69 | 82.35 | 84.08  |
-
-<img src="figs/resnet-cpu-train.png" width="500">
-
- - GoogLeNet
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
-| MKLML        | 128.46| 137.89| 158.63 |
-| MKL-DNN      | 250.46| 264.83| 269.50 |
-
-<img src="figs/googlenet-cpu-train.png" width="500">
-
-- AlexNet
-
-| BatchSize    | 64     | 128    | 256    |
-|--------------|--------| ------ | -------|
-| OpenBLAS     | 45.62  | 72.79  | 107.22 | 
-| MKLML        | 66.37  | 105.60 | 144.04 |
-| MKL-DNN      | 399.00 | 498.94 | 626.53 | 
-
-<img src="figs/alexnet-cpu-train.png" width="500">
-
-#### Inference
-Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-- VGG-19
-
-| BatchSize | 1     | 2     | 4     | 8     | 16    |
-|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
-| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
-| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
-
-<img src="figs/vgg-cpu-infer.png" width="500">
-
-- ResNet-50
-
-| BatchSize | 1     | 2      | 4      | 8      | 16     |
-|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS  | 3.31  | 6.72   | 11.59  | 13.17  | 9.27   |
-| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
-| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
-
-<img src="figs/resnet-cpu-infer.png" width="500">
-
-- GoogLeNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
-| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
-| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
-
-<img src="figs/googlenet-cpu-infer.png" width="500">
-
-- AlexNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
-| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
-| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
-
-<img src="figs/alexnet-cpu-infer.png" width="500">
-
-### Laptop
-TBD
diff --git a/benchmark/README.md b/benchmark/README.md
deleted file mode 100644
index 367013f045..0000000000
--- a/benchmark/README.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# Benchmark
-
-Machine: 
-
-- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
-- GPU: Tesla K40m
-- cuDNN: v5.1
-- system: Docker 1.12.1, all platforms are tested in docker environment.
-
-Platforms: 
-
-- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 
-- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
-- Caffe: kaixhin/cuda-caffe
-
-Several convolutional neural networks and recurrent neural networks are used to test.
-
-## Image
-
-### Benchmark Model
-
-AlexNet, GoogleNet and a small network used in Caffe.
-
-- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
-
-- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
-
-- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
-
-
-### Single-GPU
-
-- AlexNet:  input - 3 * 227 * 227,  Time: ms/batch
-
-| BatchSize    | 64  | 128  | 256   | 512  |
-|--------------|-----| -----| ------| -----|
-| PaddlePaddle | 195 | 334  | 602   | 1629 |
-| TensorFlow   | 223 | 364  | 645   | 1235 |
-| Caffe        | 324 | 627  | 1232  | 2513 |
- 
-**Notation**
-
-All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- 
-- GoogletNet:  input - 3 * 224 * 224, Time: ms/batch
-
-
-| BatchSize    | 64    |   128  | 256     |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 613   | 1149   | 2348    |
-| TensorFlow   | 644   | 1176   | 2219    |
-| Caffe        | 694   | 1364   | out of memory   |
-
-- SmallNet: input - 3 * 32 * 32, Time ms/batch
-
-| BatchSize    | 64     |   128    | 256     | 512     |
-|--------------|--------| -------- | --------|---------|
-| PaddlePaddle | 10.463 | 18.184   | 33.113  |  63.039 |
-| TensorFlow   | 9     | 15       | 28      | 59       |
-| Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
-
-**Notation**
-
-All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
-
-In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
-
-### Multi-GPU: 4 GPUs
-
-- AlexNet,  ms / batch
-
-| total-BatchSize | 128 * 4  | 256 * 4    |
-|------------------|----------| -----------|
-| PaddlePaddle     | 347      | 622        |
-| TensorFlow       | 377      | 675        |
-| Caffe            | 1229     | 2435       |
-
-For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
-
-```
-  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
-= (334 * 4)/347 
-= 3.85
-``` 
-
-<img src="figs/alexnet-4gpu.png" width="420">
-
-
-- GoogleNet, ms / batch
-
-| total-BatchSize  | 128 * 4      |  256 * 4    |
-|-------------------|--------------| ----------- |
-| PaddlePaddle      | 1178         | 2367        |
-| TensorFlow        | 1210         | 2292        |
-| Caffe             | 2007         | out of memory  |
-
-<img src="figs/googlenet-4gpu.png" width="420">
-
-
-## RNN
-We use lstm network for text classfication to test benchmark.
-
-### Dataset
--  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
-- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
-- Dictionary size=30000 
-- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
-
-### Single-GPU
-
-#### LSTM in Text Classification
-
-Testing `2 lstm layer + fc` network with different hidden size and batch size.
-  
-- Batch size = 64, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 83    | 184    | 641     |
-| TensorFlow   | 175   | 280    | 818     |
-
-- Batch size = 128, ms / batch
- 
-| hidden_size  | 256    | 512    |  1280   |
-|--------------|------- | -------| --------|
-| PaddlePaddle | 110    | 261    | 1007    |
-| TensorFlow   | 181    | 361    | 1237    |
-
-
-- Batch size = 256, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 170   | 414    | 1655    |
-| TensorFlow   | 238   | 536    | 1905    |
-
-<img src="figs/rnn_lstm_cls.png" width="600">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
- 
-
-### Multi GPU: 4 GPUs
-
-#### LSTM in Text Classification
-
-- hidden_size = 256, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 90     | 118     |
-| TensorFlow   | 226    | 118     |
-
-
-- hidden_size = 512, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 189    | 268     |
-| TensorFlow   | 297    | 383     |
-
-
-<img src="figs/rnn_lstm_4gpus.png" width="420">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index 2e1e0d3768..81ea870050 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
 
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
 RUN pip uninstall -y paddlepaddle && mkdir /workspace
 
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
diff --git a/benchmark/paddle/image/check_env.sh b/benchmark/fluid/check_env.sh
similarity index 100%
rename from benchmark/paddle/image/check_env.sh
rename to benchmark/fluid/check_env.sh
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
deleted file mode 100644
index 9efc3f0494..0000000000
--- a/benchmark/paddle/image/alexnet.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-height = 227
-width = 227
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-gp = get_config_arg('layer_num', int, 1)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=11,
-    num_channels=3,
-    num_filters=96,
-    stride=4,
-    padding=1)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
-# conv4
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
-
-# conv5
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(net)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=net, label=lab)
-    outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
deleted file mode 100644
index 2a850ccb7f..0000000000
--- a/benchmark/paddle/image/googlenet.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-use_gpu = get_config_arg('use_gpu', bool, True)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-conv_projection = conv_projection if use_gpu else img_conv_layer
-
-def inception2(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    conv1 = name + '_1'
-    conv3r = name + '_3r'
-    conv3 = name + '_3'
-    conv5r = name + '_5r'
-    conv5 = name + '_5'
-    maxpool = name + '_max'
-    convproj = name + '_proj'
-
-    cov1 = img_conv_layer(
-        name=conv1,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=conv3r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = img_conv_layer(
-        name=conv3,
-        input=cov3r,
-        filter_size=3,
-        num_filters=filter3,
-        stride=1,
-        padding=1)
-
-    cov5r = img_conv_layer(
-        name=conv5r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = img_conv_layer(
-        name=conv5,
-        input=cov5r,
-        filter_size=5,
-        num_filters=filter5,
-        stride=1,
-        padding=2)
-
-    pool1 = img_pool_layer(
-        name=maxpool,
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = img_conv_layer(
-        name=convproj,
-        input=pool1,
-        filter_size=1,
-        num_filters=proj,
-        stride=1,
-        padding=0)
-
-    cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
-    return cat
-
-def inception(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    cov1 = conv_projection(
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=name + '_3r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = conv_projection(
-        input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
-
-    cov5r = img_conv_layer(
-        name=name + '_5r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = conv_projection(
-        input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
-
-    pool1 = img_pool_layer(
-        name=name + '_max',
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = conv_projection(
-        input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
-
-    cat = concat_layer(
-        name=name,
-        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True if use_gpu else False,
-        act=ReluActivation())
-    return cat
-
-
-data = data_layer(name="input", size=3 * height * width)
-
-# stage 1
-conv1 = img_conv_layer(
-    name="conv1",
-    input=data,
-    filter_size=7,
-    num_channels=3,
-    num_filters=64,
-    stride=2,
-    padding=3)
-pool1 = img_pool_layer(
-    name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
-
-# stage 2
-conv2_1 = img_conv_layer(
-    name="conv2_1",
-    input=pool1,
-    filter_size=1,
-    num_filters=64,
-    stride=1,
-    padding=0)
-conv2_2 = img_conv_layer(
-    name="conv2_2",
-    input=conv2_1,
-    filter_size=3,
-    num_filters=192,
-    stride=1,
-    padding=1)
-pool2 = img_pool_layer(
-    name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
-
-# stage 3
-ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
-ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
-pool3 = img_pool_layer(
-    name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
-
-# stage 4
-ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
-ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
-ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
-ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
-ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
-pool4 = img_pool_layer(
-    name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
-
-# stage 5
-ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
-ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
-pool5 = img_pool_layer(
-    name="pool5",
-    input=ince5b,
-    num_channels=1024,
-    pool_size=7,
-    stride=7,
-    pool_type=AvgPooling())
-
-# We remove loss1 and loss2 for all system when testing benchmark
-# output 1
-# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
-# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
-# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-# out1 = fc_layer(name="output1", input=fc_o1,  size=1000, act=SoftmaxActivation())
-# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3) 
-
-# output 2
-#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
-#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
-#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
-#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3) 
-
-# output 3
-dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
-out3 = fc_layer(
-    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(out3)
-else:
-    lab = data_layer(name="label", size=num_class)
-    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-    outputs(loss3)
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
deleted file mode 100644
index 8679d4f272..0000000000
--- a/benchmark/paddle/image/plotlog.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import argparse
-import matplotlib.pyplot as plt
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Parse Log')
-    parser.add_argument(
-        '--file_path', '-f', type=str, help='the path of the log file')
-    parser.add_argument(
-        '--sample_rate',
-        '-s',
-        type=float,
-        default=1.0,
-        help='the rate to take samples from log')
-    parser.add_argument(
-        '--log_period', '-p', type=int, default=1, help='the period of log')
-
-    args = parser.parse_args()
-    return args
-
-
-def parse_file(file_name):
-    loss = []
-    error = []
-    with open(file_name) as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if not line.startswith('pass'):
-                continue
-            line_split = line.split(' ')
-            if len(line_split) != 5:
-                continue
-
-            loss_str = line_split[2][:-1]
-            cur_loss = float(loss_str.split('=')[-1])
-            loss.append(cur_loss)
-
-            err_str = line_split[3][:-1]
-            cur_err = float(err_str.split('=')[-1])
-            error.append(cur_err)
-
-    accuracy = [1.0 - err for err in error]
-
-    return loss, accuracy
-
-
-def sample(metric, sample_rate):
-    interval = int(1.0 / sample_rate)
-    if interval > len(metric):
-        return metric[:1]
-
-    num = len(metric) / interval
-    idx = [interval * i for i in range(num)]
-    metric_sample = [metric[id] for id in idx]
-    return metric_sample
-
-
-def plot_metric(metric,
-                batch_id,
-                graph_title,
-                line_style='b-',
-                line_label='y',
-                line_num=1):
-    plt.figure()
-    plt.title(graph_title)
-    if line_num == 1:
-        plt.plot(batch_id, metric, line_style, label=line_label)
-    else:
-        for i in range(line_num):
-            plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
-    plt.xlabel('batch')
-    plt.ylabel(graph_title)
-    plt.legend()
-    plt.savefig(graph_title + '.jpg')
-    plt.close()
-
-
-def main():
-    args = parse_args()
-    assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
-
-    loss, accuracy = parse_file(args.file_path)
-    batch = [args.log_period * i for i in range(len(loss))]
-
-    batch_sample = sample(batch, args.sample_rate)
-    loss_sample = sample(loss, args.sample_rate)
-    accuracy_sample = sample(accuracy, args.sample_rate)
-
-    plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
-    plot_metric(
-        accuracy_sample,
-        batch_sample,
-        'accuracy',
-        line_style='g-',
-        line_label='accuracy')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
deleted file mode 100644
index 6ad817ccef..0000000000
--- a/benchmark/paddle/image/provider.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-from paddle.trainer.PyDataProvider2 import *
-
-
-def initHook(settings, height, width, color, num_class, **kwargs):
-    settings.height = height
-    settings.width = width
-    settings.color = color
-    settings.num_class = num_class
-    if settings.color:
-        settings.data_size = settings.height * settings.width * 3
-    else:
-        settings.data_size = settings.height * settings.width
-    settings.is_infer = kwargs.get('is_infer', False)
-    settings.num_samples = kwargs.get('num_samples', 2560)
-    if settings.is_infer:
-        settings.slots = [dense_vector(settings.data_size)]
-    else:
-        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_list):
-    for i in xrange(settings.num_samples):
-        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        if settings.is_infer:
-            yield img.astype('float32')
-        else:
-            lab = random.randint(0, settings.num_class - 1)
-            yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
deleted file mode 100644
index 2846e4763f..0000000000
--- a/benchmark/paddle/image/resnet.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg("layer_num", int, 50)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-
-#######################Network Configuration #############
-def conv_bn_layer(name,
-                  input,
-                  filter_size,
-                  num_filters,
-                  stride,
-                  padding,
-                  channels=None,
-                  active_type=ReluActivation()):
-    """
-    A wrapper for conv layer with batch normalization layers.
-    Note:
-    conv layer has no activation.
-    """
-
-    tmp = img_conv_layer(
-        name=name + "_conv",
-        input=input,
-        filter_size=filter_size,
-        num_channels=channels,
-        num_filters=num_filters,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=False)
-    return batch_norm_layer(
-        name=name + "_bn",
-        input=tmp,
-        act=active_type,
-        use_global_stats=is_infer)
-
-
-def bottleneck_block(name, input, num_filters1, num_filters2):
-    """
-    A wrapper for bottlenect building block in ResNet.
-    Last conv_bn_layer has no activation.
-    Addto layer has activation of relu.
-    """
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=1,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[input, last_name], act=ReluActivation())
-
-
-def mid_projection(name, input, num_filters1, num_filters2, stride=2):
-    """
-    A wrapper for middile projection in ResNet.
-    projection shortcuts are used for increasing dimensions,
-    and other shortcuts are identity
-    branch1: projection shortcuts are used for increasing
-    dimensions, has no activation.
-    branch2x: bottleneck building block, shortcuts are identity.
-    """
-    # stride = 2
-    branch1 = conv_bn_layer(
-        name=name + '_branch1',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=stride,
-        padding=0,
-        active_type=LinearActivation())
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=stride,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
-
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
-    """
-    A wrapper for 50,101,152 layers of ResNet.
-    res2_num: number of blocks stacked in conv2_x
-    res3_num: number of blocks stacked in conv3_x
-    res4_num: number of blocks stacked in conv4_x
-    res5_num: number of blocks stacked in conv5_x
-    """
-    # For ImageNet
-    # conv1: 112x112
-    tmp = conv_bn_layer(
-        "conv1",
-        input=img,
-        filter_size=7,
-        channels=3,
-        num_filters=64,
-        stride=2,
-        padding=3)
-    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
-
-    # conv2_x: 56x56
-    tmp = mid_projection(
-        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
-    for i in xrange(2, res2_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
-
-    # conv3_x: 28x28
-    tmp = mid_projection(
-        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
-    for i in xrange(2, res3_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res3_" + str(i),
-            input=tmp,
-            num_filters1=128,
-            num_filters2=512)
-
-    # conv4_x: 14x14
-    tmp = mid_projection(
-        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
-    for i in xrange(2, res4_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res4_" + str(i),
-            input=tmp,
-            num_filters1=256,
-            num_filters2=1024)
-
-    # conv5_x: 7x7
-    tmp = mid_projection(
-        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
-    for i in xrange(2, res5_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res5_" + str(i),
-            input=tmp,
-            num_filters1=512,
-            num_filters2=2048)
-
-    tmp = img_pool_layer(
-        name='avgpool',
-        input=tmp,
-        pool_size=7,
-        stride=1,
-        pool_type=AvgPooling())
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 50:
-    resnet = deep_res_net(3, 4, 6, 3)
-elif layer_num == 101:
-    resnet = deep_res_net(3, 4, 23, 3)
-elif layer_num == 152:
-    resnet = deep_res_net(3, 8, 36, 3)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(resnet)
-else:
-    lbl = data_layer(name="label", size=num_class)
-    loss = cross_entropy(name='loss', input=resnet, label=lbl)
-    outputs(loss)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
deleted file mode 100755
index 5b58a8d773..0000000000
--- a/benchmark/paddle/image/run.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  bz=$3
-  args="batch_size=$3"
-  prefix=$4
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=True \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
-}
-
-if [ ! -d "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-#========single-gpu=========#
-# alexnet
-train alexnet.py 1 64 alexnet
-train alexnet.py 1 128 alexnet
-train alexnet.py 1 256 alexnet
-train alexnet.py 1 512 alexnet
-
-# googlenet
-train googlenet.py 1 64 googlenet
-train googlenet.py 1 128 googlenet
-train googlenet.py 1 256 googlenet
-
-# smallnet
-train smallnet_mnist_cifar.py 1 64 smallnet
-train smallnet_mnist_cifar.py 1 128 smallnet
-train smallnet_mnist_cifar.py 1 256 smallnet
-train smallnet_mnist_cifar.py 1 512 smallnet
-
-
-############################
-#========multi-gpus=========#
-train alexnet.py 4 512 alexnet
-train alexnet.py 4 1024 alexnet
-
-train googlenet.py 4 512 googlenet 
-train googlenet.py 4 1024 googlenet
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
deleted file mode 100755
index 0fad5e04cc..0000000000
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    if [ $thread -gt $bs ]; then
-      thread=$bs
-    fi
-    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "Training model ${topology}_${layer_num}"
-    paddle train --job=train \
-      --config="${topology}.py" \
-      --use_mkldnn=True \
-      --use_gpu=False \
-      --trainer_count=1 \
-      --num_passes=1 \
-      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
-      > /dev/null 2>&1
-    echo "Done"
-  fi
-  log_period=$((256 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 1280 samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-if [ ! -d "models" ]; then
-  mkdir -p models
-fi
-
-# inference benchmark
-for use_mkldnn in True False; do
-  for batchsize in 1 2 4 8 16; do
-    infer vgg 19 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer alexnet 2 $batchsize $use_mkldnn
-  done
-done
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
deleted file mode 100755
index 1583bf134a..0000000000
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    # each trainer_count use only 1 core to avoid conflict
-    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for use_mkldnn in True False; do
-  for batchsize in 64 128 256; do
-    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50 $batchsize $use_mkldnn
-    train googlenet v1 $batchsize $use_mkldnn
-    train alexnet 2 $batchsize $use_mkldnn
-  done
-done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
deleted file mode 100755
index 987381cabc..0000000000
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  export OPENBLAS_MAIN_FREE=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  trainers=`nproc`
-  if [ $trainers -gt $bs ]; then
-    trainers=$bs
-  fi
-  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
-  threads=$((`nproc` / trainers))
-  if [ $threads -eq 0 ]; then
-    threads=1
-  fi
-  export OPENBLAS_NUM_THREADS=$threads
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "./run_mkl_infer.sh to save the model first"
-    exit 0
-  fi
-  log_period=$((32 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$trainers \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 160(=32*5) samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# inference benchmark
-for batchsize in 1 2 4 8 16; do
-  infer vgg 19 $batchsize
-  infer resnet 50 $batchsize 
-  infer googlenet v1 $batchsize
-  infer alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
deleted file mode 100755
index cc64e1d09d..0000000000
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  export OPENBLAS_NUM_THREADS=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  thread=`nproc`
-  # each trainer_count use only 1 core to avoid conflict
-  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=3 \
-    --test_period=30 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for batchsize in 64 128 256; do
-  train vgg 19 $batchsize
-  train resnet 50 $batchsize
-  train googlenet v1 $batchsize
-  train alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py
deleted file mode 100644
index 58879c454f..0000000000
--- a/benchmark/paddle/image/smallnet_mnist_cifar.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-
-height = 32
-width = 32
-num_class = 10
-
-batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=5,
-    num_channels=3,
-    num_filters=32,
-    stride=1,
-    padding=2)
-net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=32, stride=1, padding=2)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=64, stride=1, padding=1)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-net = fc_layer(input=net, size=64, act=ReluActivation())
-net = fc_layer(input=net, size=10, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
deleted file mode 100644
index ca0a6798fb..0000000000
--- a/benchmark/paddle/image/vgg.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg('layer_num', int, 19)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.001 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def vgg_network(vgg_num=3):
-    tmp = img_conv_group(
-        input=img,
-        num_channels=3,
-        conv_padding=1,
-        conv_num_filter=[64, 64],
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_size=2,
-        pool_stride=2,
-        pool_type=MaxPooling())
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[128, 128],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    channels = []
-    for i in range(vgg_num):
-        channels.append(256)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    channels = []
-    for i in range(vgg_num):
-        channels.append(512)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 16:
-    vgg = vgg_network(3)
-elif layer_num == 19:
-    vgg = vgg_network(4)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(vgg)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=vgg, label=lab)
-    outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
deleted file mode 100755
index 2a67f9b0cf..0000000000
--- a/benchmark/paddle/rnn/imdb.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import six.moves.cPickle as pickle
-import gzip
-import os
-import numpy
-
-
-def get_dataset_file(dataset, default_dataset, origin):
-    data_dir, data_file = os.path.split(dataset)
-    if (not os.path.isfile(dataset)) and data_file == default_dataset:
-        from six.moves import urllib
-        print('Downloading data from %s' % origin)
-        urllib.request.urlretrieve(origin, dataset)
-
-    return dataset
-
-
-def create_data(path="imdb.pkl"):
-
-    if (not os.path.isfile('imdb.train.pkl')):
-        path = get_dataset_file(
-            path, "imdb.pkl",
-            "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
-
-        if path.endswith(".gz"):
-            f = gzip.open(path, 'rb')
-        else:
-            f = open(path, 'rb')
-
-        train_set = pickle.load(f)
-        test_set = pickle.load(f)
-        f.close()
-
-        pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
-        pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
-
-    if (not os.path.isfile('train.list')):
-        file('train.list', 'w').write('imdb.train.pkl\n')
-
-
-def main():
-    create_data('imdb.pkl')
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
deleted file mode 100644
index 23cc0c44a9..0000000000
--- a/benchmark/paddle/rnn/provider.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-import six.moves.cPickle as pickle
-from paddle.trainer.PyDataProvider2 import *
-
-
-def remove_unk(x, n_words):
-    return [[1 if w >= n_words else w for w in sen] for sen in x]
-
-
-# ==============================================================
-#  tensorflow uses fixed length, but PaddlePaddle can process
-#  variable-length. Padding is used in benchmark in order to
-#  compare with other platform. 
-# ==============================================================
-def pad_sequences(sequences,
-                  maxlen=None,
-                  dtype='int32',
-                  padding='post',
-                  truncating='post',
-                  value=0.):
-    lengths = [len(s) for s in sequences]
-
-    nb_samples = len(sequences)
-    if maxlen is None:
-        maxlen = np.max(lengths)
-
-    x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
-    for idx, s in enumerate(sequences):
-        if len(s) == 0:
-            continue  # empty list was found
-        if truncating == 'pre':
-            trunc = s[-maxlen:]
-        elif truncating == 'post':
-            trunc = s[:maxlen]
-        else:
-            raise ValueError("Truncating type '%s' not understood" % padding)
-
-        if padding == 'post':
-            x[idx, :len(trunc)] = trunc
-        elif padding == 'pre':
-            x[idx, -len(trunc):] = trunc
-        else:
-            raise ValueError("Padding type '%s' not understood" % padding)
-    return x
-
-
-def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
-    settings.vocab_size = vocab_size
-    settings.pad_seq = pad_seq
-    settings.maxlen = maxlen
-    settings.input_types = [
-        integer_value_sequence(vocab_size), integer_value(2)
-    ]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file):
-    f = open(file, 'rb')
-    train_set = pickle.load(f)
-    f.close()
-    x, y = train_set
-
-    # remove unk, namely remove the words out of dictionary
-    x = remove_unk(x, settings.vocab_size)
-    if settings.pad_seq:
-        x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
-
-    for i in range(len(y)):
-        yield map(int, x[i]), int(y[i])
diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py
deleted file mode 100755
index 83eb3e5654..0000000000
--- a/benchmark/paddle/rnn/rnn.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-import imdb
-
-num_class = 2
-vocab_size = 30000
-fixedlen = 100
-batch_size = get_config_arg('batch_size', int, 128)
-lstm_num = get_config_arg('lstm_num', int, 1)
-hidden_size = get_config_arg('hidden_size', int, 128)
-# whether to pad sequence into fixed length
-pad_seq = get_config_arg('pad_seq', bool, True)
-imdb.create_data('imdb.pkl')
-
-args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-net = data_layer('data', size=vocab_size)
-net = embedding_layer(input=net, size=128)
-
-for i in xrange(lstm_num):
-    net = simple_lstm(input=net, size=hidden_size)
-
-net = last_seq(input=net)
-net = fc_layer(input=net, size=2, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
deleted file mode 100755
index f99a562b3f..0000000000
--- a/benchmark/paddle/rnn/run.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=1 \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --num_passes=1 \
-    --feed_data=1 \
-    --config_args=$args \
-    >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-## padding, single gpu
-#-----config--gpu--lstm_num--padding--hidden_size--batch_size
-## lstm_num=2, batch_size=64
-train rnn.py 1 2 1 256 64 
-train rnn.py 1 2 1 512 64 
-train rnn.py 1 2 1 1280 64 
-
-## lstm_num=2, batch_size=128
-train rnn.py 1 2 1 256 128 
-train rnn.py 1 2 1 512 128 
-train rnn.py 1 2 1 1280 128 
-
-## lstm_num=4, batch_size=256
-train rnn.py 1 2 1 256 256 
-train rnn.py 1 2 1 512 256 
-train rnn.py 1 2 1 1280 256 
-
-
-#==================multi gpus=====================#
-# hidden_size=256, lstm_num=2, different batch size
-train rnn.py 4 2 1 256 128 
-train rnn.py 4 2 1 256 256 
-train rnn.py 4 2 1 256 512 
-
-# hidden_size=512, lstm_num=4, different batch size
-train rnn.py 4 2 1 512 128 
-train rnn.py 4 2 1 512 256 
-train rnn.py 4 2 1 512 512 

From b9999435b7244f0477c3c1902ca8793b5f6c2ace Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Thu, 14 Feb 2019 21:29:01 +0800
Subject: [PATCH 63/78] remove 'import paddle.v2' in read data

test=develop
---
 benchmark/tensorflow/machine_translation.py                    | 2 --
 benchmark/tensorflow/mnist.py                                  | 1 -
 benchmark/tensorflow/resnet.py                                 | 1 -
 benchmark/tensorflow/stacked_dynamic_lstm.py                   | 2 --
 benchmark/tensorflow/vgg.py                                    | 1 -
 .../fluid/tests/demo/file_reader/convert_data_to_recordio.py   | 1 -
 python/paddle/fluid/tests/demo/pyreader.py                     | 3 +--
 7 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
index 8f77dce983..7837669edc 100644
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
@@ -35,8 +35,6 @@ import os
 import argparse
 import time
 
-import paddle.v2 as paddle
-
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--embedding_dim",
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
index 7140eed6ea..03d533fecf 100644
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@@ -21,7 +21,6 @@ import time
 import numpy as np
 
 import tensorflow as tf
-import paddle.v2 as paddle
 
 DTYPE = tf.float32
 
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
index c432fa8d59..fdb0441957 100644
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
@@ -27,7 +27,6 @@ import argparse
 import time
 import numpy as np
 
-import paddle.v2 as paddle
 import tensorflow as tf
 
 DTYPE = tf.float32
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
index 5285033005..1f532dc2fa 100644
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -21,8 +21,6 @@ import argparse
 import time
 import tensorflow as tf
 
-import paddle.v2 as paddle
-
 
 def parse_args():
     parser = argparse.ArgumentParser("LSTM model benchmark.")
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
index fba5ec71a4..d32c835bd7 100644
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """VGG16 benchmark in TensorFlow"""
 import tensorflow as tf
-import paddle.v2 as paddle
 import numpy as np
 import argparse
 import time
diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
index 45a104ec96..b00af91a9d 100644
--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -16,7 +16,6 @@ from __future__ import print_function
 
 import sys
 import paddle.fluid as fluid
-import paddle.v2 as paddle
 
 
 def load_vocab(filename):
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
index ec61e0ebae..bbcef4c3ff 100644
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -20,7 +20,6 @@ import six
 import paddle
 import paddle.dataset.mnist as mnist
 import paddle.fluid as fluid
-import paddle.v2
 
 
 def network(is_train):
@@ -72,7 +71,7 @@ def main():
         use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
 
     train_reader.decorate_paddle_reader(
-        paddle.v2.reader.shuffle(
+        paddle.reader.shuffle(
             paddle.batch(mnist.train(), 512), buf_size=8192))
 
     test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))

From abcefe721117010277fbffb1e159acf2228f08dc Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Thu, 14 Feb 2019 22:39:55 +0800
Subject: [PATCH 64/78] Fix debug mode in fake_quantize_op (#15693)

* Fix debug mode in fake_quantize_op
* Remove template specialization
---
 paddle/fluid/operators/fake_quantize_op.cc | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 8aff911141..d51eb054a9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -21,26 +21,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVectorArrayMap =
-    Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using ConstEigenVectorArrayMap =
-    Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
+template <typename T>
+struct Compare {
+ public:
+  bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); }
+};
 
 template <typename T>
 struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx, const T* in,
                   const int num, T* out) {
-    Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
-    Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
-    Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
-    Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
-
-    out_e = in_e.abs().maximum();
+    *out = *(std::max_element(in + 0, in + num, Compare<T>()));
   }
 };
 

From 989138378d6de5a0c4bd47dc028209565848f202 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 28 Jan 2019 17:39:10 +0800
Subject: [PATCH 65/78] add sugar for fetching parameters

test=develop
---
 paddle/fluid/imperative/layer.cc              |  2 +-
 python/paddle/fluid/imperative/layers.py      | 53 +++++++++++++------
 python/paddle/fluid/imperative/nn.py          |  3 --
 .../tests/unittests/test_imperative_gan.py    |  7 ---
 4 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 47488d4dea..8f20f0c06e 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -207,7 +207,7 @@ framework::LoDTensor& VarBase::GradValue() {
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
+    VLOG(3) << "op with no grad: " << op_desc_->Type();
     return {};
   }
 
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 71ff95bdea..2641ec4cdd 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import contextlib
 import sys
 import numpy as np
@@ -30,25 +31,13 @@ class Layer(core.Layer):
     def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
         self._built = False
         self._dtype = dtype
+        self._parameters = collections.OrderedDict()
+        self._sub_layers = collections.OrderedDict()
 
     def parameters(self):
-        params = []
-        for key in self.__dict__.keys():
-            value = self.__dict__[key]
-            if isinstance(value, framework.Parameter):
-                params.append(value)
-            elif isinstance(value, core.Layer):
-                params.extend(value.parameters())
-            elif isinstance(value, collections.Container):
-                if len(value) == 0:
-                    continue
-                if isinstance(value[0], framework.Parameter):
-                    params.extend(value)
-                elif isinstance(value[0], core.Layer):
-                    for v in value:
-                        params.extend(v.parameters())
-
-        return params
+        """Returns an OrderedDict with parameters from current and sub-layers.
+        """
+        return self._parameters
 
     def clear_gradients(self):
         for p in self.parameters():
@@ -71,6 +60,36 @@ class Layer(core.Layer):
     def backward(self, *inputs):
         raise ValueError("Layer shouldn't implement backward")
 
+    def __getattr__(self, name):
+        if name in self._parameters:
+            return self._parameters[name]
+        elif name in self._sub_layers:
+            return self._sub_layers[name]
+
+    def __setattr__(self, name, value):
+        if isinstance(value, framework.Parameter):
+            params = self.__dict__.get('_parameters', None)
+            if params is None:
+                raise ValueError(
+                    "super(YourLayer, self).__init__() should be called first")
+            params[name] = value
+        elif isinstance(value, core.Layer):
+            layers = self.__dict__.get('_sub_layers', None)
+            if layers is None:
+                raise ValueError(
+                    "super(YourLayer, self).__init__() should be called first")
+            layers[name] = value
+        else:
+            object.__setattr__(self, name, value)
+
+    def __delattr__(self, name):
+        if name in self._parameters:
+            del self._parameters[name]
+        elif name in self._sub_layers:
+            del self._sub_layers[name]
+        else:
+            object.__delattr__(self, name)
+
 
 class PyLayer(core.PyLayer):
     """Layers composed of user-defined python codes."""
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 6c5961cc63..1b0a60df8b 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -225,9 +225,6 @@ class FC(layers.Layer):
             act=act,
             name=name)
 
-    def parameters(self):
-        return [self._w, self._b]
-
     def _build_once(self, input):
         input_shape = input.shape
         param_shape = [
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 681661bfc6..33c196d1ab 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -33,9 +33,6 @@ class Discriminator(fluid.imperative.Layer):
         self._fc1 = FC(size=32, act='elu', name="d_fc1")
         self._fc2 = FC(size=1, name="d_fc2")
 
-    def parameters(self):
-        return self._fc1.parameters() + self._fc2.parameters()
-
     def forward(self, inputs):
         x = self._fc1(inputs)
         return self._fc2(x)
@@ -48,10 +45,6 @@ class Generator(fluid.imperative.Layer):
         self._fc2 = FC(size=64, act='elu', name="g_fc2")
         self._fc3 = FC(size=1, name="g_fc3")
 
-    def parameters(self):
-        return self._fc1.parameters() + self._fc2.parameters(
-        ) + self._fc3.parameters()
-
     def forward(self, inputs):
         x = self._fc1(inputs)
         x = self._fc2(x)

From 408a9bb2e7705adeb7d8fc21e50d580cf65dcd05 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 29 Jan 2019 12:54:18 +0800
Subject: [PATCH 66/78] polish

test=develop
---
 python/paddle/fluid/imperative/layers.py      | 13 ++-
 .../fluid/tests/unittests/test_base_layer.py  | 92 +++++++++++++++++++
 2 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_base_layer.py

diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 2641ec4cdd..da8233fe39 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -34,16 +34,21 @@ class Layer(core.Layer):
         self._parameters = collections.OrderedDict()
         self._sub_layers = collections.OrderedDict()
 
-    def parameters(self):
-        """Returns an OrderedDict with parameters from current and sub-layers.
+    def parameters(self, include_sublayers=True):
+        """Returns a list of Parameters from current and sub-layers.
         """
-        return self._parameters
+        ret = [p for p in self._parameters.values()]
+        if include_sublayers:
+            for l in self._sub_layers.values():
+                for p in l.parameters(include_sublayers):
+                    ret.append(p)
+        return ret
 
     def clear_gradients(self):
         for p in self.parameters():
             p._clear_gradient()
 
-    def _build_once(self, inputs):
+    def _build_once(self, *args):
         pass
 
     def __call__(self, *inputs):
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
new file mode 100644
index 0000000000..fe6cb7b213
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import sys
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+
+
+class L1(fluid.imperative.Layer):
+    def __init__(self):
+        super(L1, self).__init__()
+        self._helper = LayerHelper(
+            'MyLayer',
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.1)))
+
+        self.w1 = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[2, 2],
+            dtype='float32',
+            is_bias=False)
+        self.w2 = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[2, 2],
+            dtype='float32',
+            is_bias=False)
+
+    def forward(self):
+        return self.w1 + self.w2
+
+
+class L2(fluid.imperative.Layer):
+    def __init__(self):
+        super(L2, self).__init__()
+        self.layer1 = L1()
+        self.layer2 = L1()
+
+    def forward(self):
+        return self.layer1() + self.layer2()
+
+
+class L3(fluid.imperative.Layer):
+    def __init__(self):
+        super(L3, self).__init__()
+        self.layer1 = L2()
+        self.layer2 = L2()
+
+    def forward(self):
+        return self.layer1() + self.layer2()
+
+
+class TestBaseLayer(unittest.TestCase):
+    def test_one_level(self):
+        with fluid.imperative.guard():
+            l = L1()
+            ret = l()
+            self.assertEqual(l.w1.name, "MyLayer_0.w_0")
+            self.assertEqual(l.w2.name, "MyLayer_0.w_1")
+            self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
+            sys.stderr.write(
+                '%s %s %s %s\n' %
+                (ret._numpy(), l.w1.name, l.w2.name, l._sub_layers))
+
+    def test_three_level(self):
+        with fluid.imperative.guard():
+            l = L3()
+            ret = l()
+            sys.stderr.write('%s\n' % ret._numpy())
+
+            for p in l.parameters():
+                sys.stderr.write('%s\n' % p.name)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 286823255629ef8e337b3797ced223a6f7672a8a Mon Sep 17 00:00:00 2001
From: Dang Qingqing <dangqingqing@baidu.com>
Date: Fri, 15 Feb 2019 14:33:02 +0800
Subject: [PATCH 67/78] Fix row_conv doc

test=develop
---
 paddle/fluid/operators/row_conv_op.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 10b1b0c899..d283bddbe9 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
 unidirectional recurrent neural networks. The row convolution operator is 
 different from the 1D sequence convolution, and is computed as follows:
 
-Given an input sequence $in$ of length $t$ and input dimension $d$, 
-and a filter ($W$) of size $context \times d$, 
+Given an input sequence $X$ of length $t$ and input dimension $D$, 
+and a filter ($W$) of size $context \times D$,
 the output sequence is convolved as:
 
 $$
-out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
+out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
 $$
 
 In the above equation:
 
 * $Out_{i}$: The i-th row of output variable with shape [1, D].
 
-* $\\tau$: Future context size.
+* $context$: Future context size.
 
 * $X_{j}$: The j-th row of input variable with shape [1, D].
 
-* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].
 
 More details about row_conv please refer to
 the design document

From 48a5cccbcdc72d350e47271abd7b105b48829d84 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 15 Feb 2019 15:24:08 +0800
Subject: [PATCH 68/78] Fix debug mode in prior_box_op (#15702)

* Fix debug mode in prior_box_op
* Refine code
---
 .../detection/density_prior_box_op.h          | 13 ++--
 .../fluid/operators/detection/prior_box_op.h  | 69 ++++++++-----------
 2 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 3591681fc3..42137215e2 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
-    for (int i = 0; i < fixed_ratios.size(); i++) {
+    for (size_t i = 0; i < fixed_ratios.size(); i++) {
       sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
     }
 
@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       }
     }
     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }
     framework::Tensor var_t;
     var_t.mutable_data<T>(
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #pragma omp parallel for collapse(2)
 #endif
     for (int i = 0; i < box_num; ++i) {
-      for (int j = 0; j < variances.size(); ++j) {
+      for (size_t j = 0; j < variances.size(); ++j) {
         e_vars(i, j) = variances[j];
       }
     }
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index 4e226abbb5..f844056645 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
   }
 }
 
-template <typename T>
-struct ClipFunctor {
-  HOSTDEVICE inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
-
 template <typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
  public:
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    T* b_t = boxes->data<T>();
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
         T center_x = (w + offset) * step_width;
         T center_y = (h + offset) * step_height;
         T box_width, box_height;
-        int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
           if (min_max_aspect_ratios_order) {
             box_width = box_height = min_size / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
+            b_t[0] = (center_x - box_width) / img_width;
+            b_t[1] = (center_y - box_height) / img_height;
+            b_t[2] = (center_x + box_width) / img_width;
+            b_t[3] = (center_y + box_height) / img_height;
+            b_t += 4;
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             // priors with different aspect ratios
             for (size_t r = 0; r < aspect_ratios.size(); ++r) {
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               }
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           } else {
             // priors with different aspect ratios
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               float ar = aspect_ratios[r];
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           }
         }
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     }
 
     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }
 
     framework::Tensor var_t;

From e4b9fcdbd2fa5fc5267ab6c6b9dde7cf3af6fb01 Mon Sep 17 00:00:00 2001
From: Dun <randonlang@gmail.com>
Date: Fri, 15 Feb 2019 15:28:01 +0800
Subject: [PATCH 69/78] More restrict check load_combine_op. (#15479)

* fix && test=develop

* fix && test=develop

* test=develop
---
 paddle/fluid/operators/load_combine_op.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index c4a2282e16..f5c802986e 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
 
       // Get data from fin to tensor
       DeserializeFromStream(*buffer, tensor, dev_ctx);
@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->ShareDataWith(fp16_tensor);
       }
     }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
   }
 };
 

From f7b768d3648c5d6d69c2996904712c642ad2e0c8 Mon Sep 17 00:00:00 2001
From: Dun <randonlang@gmail.com>
Date: Fri, 15 Feb 2019 16:24:19 +0800
Subject: [PATCH 70/78] fix group_norm (#15727)

* fix group_norm

* test=develop
---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 46ce58fd2d..586eac7fd6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3236,7 +3236,7 @@ def group_norm(input,
     # create output
     mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
     variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
-    group_norm_out = helper.create_variable(dtype)
+    group_norm_out = helper.create_variable(dtype=dtype)
 
     helper.append_op(
         type="group_norm",

From 54f4d58553afc2f326a4d9dda168a8c4a13ccb8e Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Feb 2019 19:13:47 +0800
Subject: [PATCH 71/78] make parameter and layer access easier

test=develop
---
 python/paddle/fluid/imperative/layers.py      | 51 +++++++++++++++++++
 python/paddle/fluid/imperative/nn.py          |  3 --
 .../fluid/tests/unittests/test_imperative.py  | 12 +++++
 .../unittests/test_imperative_ptb_rnn.py      | 16 ------
 .../tests/unittests/test_imperative_resnet.py | 26 +++++-----
 5 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index da8233fe39..59fe6bbf74 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -36,6 +36,12 @@ class Layer(core.Layer):
 
     def parameters(self, include_sublayers=True):
         """Returns a list of Parameters from current and sub-layers.
+
+        Args:
+            include_sublayers: If true, also include the parameters from
+            sublayers.
+
+        Returns a list of Parameters.
         """
         ret = [p for p in self._parameters.values()]
         if include_sublayers:
@@ -44,6 +50,21 @@ class Layer(core.Layer):
                     ret.append(p)
         return ret
 
+    def sublayers(self, include_sublayers=True):
+        """Returns a list of sub layers.
+
+        Args:
+            include_sublayers: If true, also include the layers from sublayers.
+
+        Returns a list of sub layers.
+        """
+        ret = [l for l in self._sub_layers.values()]
+        if include_sublayers:
+            for l in self._sub_layers.values():
+                for sub_l in l.sublayers(include_sublayers):
+                    ret.append(sub_l)
+        return ret
+
     def clear_gradients(self):
         for p in self.parameters():
             p._clear_gradient()
@@ -65,6 +86,36 @@ class Layer(core.Layer):
     def backward(self, *inputs):
         raise ValueError("Layer shouldn't implement backward")
 
+    def add_sublayer(self, name, sublayer):
+        """Adds a sub Layer instance.
+
+          Added sublayer can be access like self.name.
+
+        Args:
+            name: name of this sublayer.
+            sublayer: an instance of Layer.
+        Returns:
+            the sublayer passed in.
+        """
+        assert isinstance(sublayer, core.Layer)
+        self._sub_layers[name] = sublayer
+        return sublayer
+
+    def add_parameter(self, name, parameter):
+        """Adds a Parameter instance.
+
+          Added parameter can be access like self.name.
+
+        Args:
+            name: name of this sublayer.
+            parameter: an instance of Parameter.
+        Returns:
+            the parameter passed in.
+        """
+        assert isinstance(parameter, framework.Parameter)
+        self._parameters[name] = parameter
+        return parameter
+
     def __getattr__(self, name):
         if name in self._parameters:
             return self._parameters[name]
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 1b0a60df8b..c86a373ae4 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -475,9 +475,6 @@ class Embedding(layers.Layer):
             dtype=self._dtype,
             is_bias=False)
 
-    def parameters(self):
-        return [self._w]
-
     def forward(self, input):
         out = self._helper.create_variable_for_type_inference(self._dtype)
         self._helper.append_op(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index baaddf9f2e..c54e998ea8 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -333,6 +333,18 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_out, static_out))
         self.assertTrue(np.allclose(dy_grad, static_grad))
 
+        params = mlp.parameters(True)
+        self.assertEqual("FC_0.w_0", params[0].name)
+        self.assertEqual("FC_0.b_0", params[1].name)
+        self.assertEqual("FC_1.w_0", params[2].name)
+        self.assertEqual("FC_1.b_0", params[3].name)
+        self.assertEqual(len(params), 4)
+
+        sublayers = mlp.sublayers(True)
+        self.assertEqual(mlp._fc1, sublayers[0])
+        self.assertEqual(mlp._fc2, sublayers[1])
+        self.assertEqual(len(sublayers), 2)
+
     def test_rnn(self):
         np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
                            [10.0, 11.0, 12.0]])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index afe990e74f..82aff18b72 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -75,16 +75,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
-    def parameters(self):
-        parameters = list()
-        for param in self.weight_1_arr:
-            parameters.append(param)
-        for param in self.weight_2_arr:
-            parameters.append(param)
-        for bias in self.bias_arr:
-            parameters.append(bias)
-        return parameters
-
     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         res = []
         for index in range(self._num_steps):
@@ -177,12 +167,6 @@ class PtbModel(fluid.imperative.Layer):
     def _build_once(self, input, label, init_hidden, init_cell):
         pass
 
-    def parameters(self):
-        parameters = self.simple_lstm_rnn.parameters() + [
-            self.softmax_weight, self.softmax_bias
-        ] + self.embedding.parameters()
-        return parameters
-
     def forward(self, input, label, init_hidden, init_cell):
 
         init_h = fluid.layers.reshape(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index c27fd0b802..128d18621d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -21,7 +21,6 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.imperative.base import to_variable
 from test_imperative_base import new_program_scope
@@ -173,11 +172,13 @@ class ResNet(fluid.imperative.Layer):
         for block in range(len(depth)):
             shortcut = False
             for i in range(depth[block]):
-                bottleneck_block = BottleneckBlock(
-                    num_channels=num_channels,
-                    num_filters=num_filters[block],
-                    stride=2 if i == 0 and block != 0 else 1,
-                    shortcut=shortcut)
+                bottleneck_block = self.add_sublayer(
+                    'bb_%d_%d' % (block, i),
+                    BottleneckBlock(
+                        num_channels=num_channels,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        shortcut=shortcut))
                 num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
@@ -223,8 +224,7 @@ class TestImperativeResnet(unittest.TestCase):
                 batch_size=batch_size)
 
             dy_param_init_value = {}
-            for param in fluid.default_main_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 dy_param_init_value[param.name] = param._numpy()
 
             for batch_id, data in enumerate(train_reader()):
@@ -247,16 +247,14 @@ class TestImperativeResnet(unittest.TestCase):
                 dy_out = avg_loss._numpy()
 
                 if batch_id == 0:
-                    for param in fluid.default_main_program().global_block(
-                    ).all_parameters():
+                    for param in resnet.parameters():
                         if param.name not in dy_param_init_value:
                             dy_param_init_value[param.name] = param._numpy()
 
                 avg_loss._backward()
 
                 dy_grad_value = {}
-                for param in fluid.default_main_program().global_block(
-                ).all_parameters():
+                for param in resnet.parameters():
                     if not param.stop_gradient:
                         np_array = np.array(param._ivar._grad_ivar().value()
                                             .get_tensor())
@@ -267,8 +265,7 @@ class TestImperativeResnet(unittest.TestCase):
                 resnet.clear_gradients()
 
                 dy_param_value = {}
-                for param in fluid.default_main_program().global_block(
-                ).all_parameters():
+                for param in resnet.parameters():
                     dy_param_value[param.name] = param._numpy()
 
         with new_program_scope():
@@ -349,6 +346,7 @@ class TestImperativeResnet(unittest.TestCase):
         self.assertTrue(np.allclose(static_out, dy_out))
 
         self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
+
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
             self.assertTrue(np.isfinite(value.all()))

From 792719fb7ec941b863ab8a2db9dbf39508a86322 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Feb 2019 19:53:30 +0800
Subject: [PATCH 72/78] polish test

test=develop
---
 .../paddle/fluid/tests/unittests/test_base_layer.py  | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index fe6cb7b213..bf00698d63 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -12,13 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import contextlib
 import unittest
 import numpy as np
-import six
-import sys
 
-import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layer_helper import LayerHelper
 
@@ -74,18 +70,12 @@ class TestBaseLayer(unittest.TestCase):
             self.assertEqual(l.w1.name, "MyLayer_0.w_0")
             self.assertEqual(l.w2.name, "MyLayer_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
-            sys.stderr.write(
-                '%s %s %s %s\n' %
-                (ret._numpy(), l.w1.name, l.w2.name, l._sub_layers))
 
     def test_three_level(self):
         with fluid.imperative.guard():
             l = L3()
             ret = l()
-            sys.stderr.write('%s\n' % ret._numpy())
-
-            for p in l.parameters():
-                sys.stderr.write('%s\n' % p.name)
+            self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
 
 
 if __name__ == '__main__':

From e5d3d7c63d6c536b72210a4e4d1e3ae437d4c1cb Mon Sep 17 00:00:00 2001
From: "Zhang, Guoming" <guoming.zhang@intel.com>
Date: Sat, 16 Feb 2019 00:07:37 +0800
Subject: [PATCH 73/78] resolve #15724 1.Remove the code for setting mkldnn
 environment in the test_calibration.py; 2.Update the cmake file for MKLDNN
 environment enabling; 3.Update the INT8 inference doc.

test=develop
---
 python/paddle/fluid/contrib/int8_inference/README.md  | 4 ++--
 python/paddle/fluid/contrib/tests/CMakeLists.txt      | 6 +++++-
 python/paddle/fluid/contrib/tests/test_calibration.py | 4 ----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md
index a9691dad44..460ae393f1 100644
--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -63,10 +63,10 @@ Notes:
 ## 4. How to reproduce the results
 * Small dataset
 ```bash
-python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
 
 * Full dataset
 ```bash
-DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index 81aee1233d..a2c5941646 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
 endif()
 
 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
+    if(src MATCHES "test_calibration")
+        py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
+    else()
+        py_test(${src} SRCS ${src}.py)
+    endif()
 endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index 424ea245a0..b9f938bebe 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
 
     def run_program(self, model_path, generate_int8=False, algo='direct'):
         image_shape = [3, 224, 224]
-        os.environ['FLAGS_use_mkldnn'] = 'True'
 
         fluid.memory_optimize(fluid.default_main_program())
 
@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
             label = label.reshape([-1, 1])
             running_program = calibrator.sampling_program.clone(
             ) if generate_int8 else infer_program.clone()
-            for op in running_program.current_block().ops:
-                if op.has_attr("use_mkldnn"):
-                    op._set_attr("use_mkldnn", True)
 
             t1 = time.time()
             _, acc1, _ = exe.run(

From 1e46ab2e3ebbee882aa229dd0a8793415e18f3f3 Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Fri, 15 Feb 2019 18:57:21 +0800
Subject: [PATCH 74/78] follow comment test=develop

---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f4c4fc3b65..3183a49794 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5938,7 +5938,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
         inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
                        are the same variable, otherwise, the input and output of
                        ``layers.reshape`` are different variables. Note that if :attr:`x`
-                       is more than one layers' input, ``inplace`` must be :attr:`False`.
+                       is more than one layer's input, ``inplace`` must be :attr:`False`.
         name (str): The name of this layer. It is optional.
 
     Returns:

From 8666902b9d2c9ae79daca93802b4fab974d27ced Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Mon, 18 Feb 2019 09:37:56 +0800
Subject: [PATCH 75/78] fix test_transpiler random fail test=develop (#15736)

---
 .../fluid/tests/unittests/test_dist_transpiler.py      | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 3566fed215..12132477d2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -22,6 +22,9 @@ import six
 import unittest
 import numpy as np
 
+import gc
+gc.set_debug(gc.DEBUG_COLLECTABLE)
+
 import paddle.fluid as fluid
 
 
@@ -99,6 +102,12 @@ class TranspilerTest(unittest.TestCase):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
                 self.transpiler_test_impl()
+        # NOTE: run gc.collect to eliminate pybind side objects to
+        # prevent random double-deallocate when inherited in python.
+        del self.transpiler
+        del main
+        del startup
+        gc.collect()
 
 
 class TestBasicModel(TranspilerTest):
@@ -797,6 +806,7 @@ class TestNCCL2Transpile(TranspilerTest):
             print([op.type for op in startup.global_block().ops])
             self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
             self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
+            gc.collect()
         else:
             pass
 

From 077d12b93951d48117011472ea1917e4760f14ef Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 18 Feb 2019 11:31:26 +0800
Subject: [PATCH 76/78] fix scale cleaner (#15742)

---
 .../fluid/framework/ir/identity_scale_op_clean_pass.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index 3b738aa159..5bdc0c5fae 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
                       ->assert_is_op("scale")
                       ->assert_op_attr<float>("scale", 1.)
                       ->assert_op_attr<float>("bias", 0.);
-  auto scale_out = detector.mutable_pattern()
-                       ->NewNode("scale_out")
-                       ->assert_is_op_output("scale");
+  auto scale_out =
+      detector.mutable_pattern()
+          ->NewNode("scale_out")
+          ->assert_is_op_output("scale")
+          // scale's output var should has only one consumer, or it can't be
+          // removed.
+          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
 
   pre_op->LinksTo({scale_in});
   scale_op->LinksFrom({scale_in}).LinksTo({scale_out});

From 5e6834d891252723961efb4de4b89e189745fd12 Mon Sep 17 00:00:00 2001
From: Dun <randonlang@gmail.com>
Date: Mon, 18 Feb 2019 15:21:55 +0800
Subject: [PATCH 77/78] inplace group_norm (#15754)

* inplace group

* test=develop
---
 paddle/fluid/operators/group_norm_op.cc | 39 +++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index e18d9841bb..cbdffa0db8 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
+class GroupNormInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{"X", "Y"}};
+  }
+};
+
+class GroupNormGradInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
+  }
+};
+
+class GroupNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return {{"X", /*->*/ "Y"}};
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
-                  ops::GroupNormGradMaker);
-REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
+                  ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
+                  ops::GroupNormInplaceInToOut);
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
+                  ops::GroupNormGradInplaceInToOut);
 REGISTER_OP_CPU_KERNEL(
     group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);

From 685a20ef5683100aa139177a566d2d3758a5def4 Mon Sep 17 00:00:00 2001
From: Yihua Xu <yihuaxu@hotmail.com>
Date: Mon, 18 Feb 2019 18:29:32 +0800
Subject: [PATCH 78/78] Add JIT CRF_decoding and Layer_norm unit-test (#15699)

* Add the CRFDecoding and LayerNorm's test case

test=develop

* Fix the size checking issue

test=develop

* Remove the remnant code

test=develop

* Add TestAllImpls and double support

test=develop

* Clean Code

test=develop

* Add benchmark test for LayerNorm & CRFDecoding

test=develop
---
 paddle/fluid/operators/jit/benchmark.cc |  75 +++++++++++++
 paddle/fluid/operators/jit/test.cc      | 133 +++++++++++++++++++++++-
 2 files changed, 207 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 97ddf223ae..77a2d04ebf 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -339,6 +339,71 @@ void BenchSoftmaxKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchLayerNormKernel() {
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        int sz = left * right;
+        Tensor x, mean, var, scale, bias, out;
+        x.Resize({n, x_dim_0, x_dim_1});
+        out.Resize({n, x_dim_0, x_dim_1});
+        mean.Resize({n, x_dim_0});
+        var.Resize({n, x_dim_0});
+        scale.Resize({x_dim_1});
+        bias.Resize({x_dim_1});
+
+        RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+        const T* scale_data = scale.data<T>();
+        const T* bias_data = bias.data<T>();
+        T* x_data = x.data<T>();
+        T* mean_data = mean.data<T>();
+        T* var_data = var.data<T>();
+        T* out_data = out.mutable_data<T>(PlaceType());
+
+        BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
+            right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchCRFDecodingKernel() {
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      Tensor x, w, alpha, track;
+      x.Resize({seq_len, tag_num});
+      w.Resize({tag_num + state_trans_base_idx, tag_num});
+      alpha.Resize({seq_len, tag_num});
+      track.Resize({seq_len, tag_num});
+
+      RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+      const T* x_data = x.data<T>();
+      const T* w_data = w.data<T>();
+      T* alpha_data = alpha.mutable_data<T>(PlaceType());
+      int* track_data = track.mutable_data<int>(PlaceType());
+
+      BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
+          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
+    }
+  }
+}
+
 using T = float;
 using CPUPlace = paddle::platform::CPUPlace;
 
@@ -382,6 +447,16 @@ BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
 // softmax
 BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
 
+// layernorm
+BENCH_FP32_CPU(kLayerNorm) {
+  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
+}
+
+// crfdecoding
+BENCH_FP32_CPU(kCRFDecoding) {
+  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index 237e588d35..85b50b79d9 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -292,6 +292,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, int, float, int> {
+  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
+                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
+                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
+                  const std::vector<T>& scale, const std::vector<T>& bias,
+                  int left, const float epsilon, int right) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
+    EXPECT_EQ(var.size(), static_cast<size_t>(left));
+    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
+    EXPECT_EQ(bias.size(), static_cast<size_t>(right));
+    std::vector<T> outtgt(outref.size());
+    const T* scale_data = scale.data();
+    const T* bias_data = bias.data();
+    T* x_data = x.data();
+    T* mean_data = mean.data();
+    T* var_data = var.data();
+    T* outref_data = outref.data();
+    T* outtgt_data = outtgt.data();
+
+    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
+        epsilon, right);
+    ExpectEQ<T>(outtgt_data, outref_data, left * right);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<int>,
+                         int> {
+  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
+                  const int seq_len, const std::vector<T>& x,
+                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
+                  std::vector<int>& trackref, int tag_num) {          // NOLINT
+    constexpr int state_trans_base_idx = 2;
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(w.size(),
+              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
+    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
+    std::vector<T> alphatgt(alpharef.size());
+    std::vector<int> tracktgt(trackref.size());
+
+    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
+    tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
+        tracktgt.data(), tag_num);
+    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
+    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
+  }
+};
+
 template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
           typename... Args>
 void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
@@ -640,6 +697,71 @@ void TestNCHW16CMulNCKernel() {
   }
 }
 
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestLayerNormKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        int sz = left * right;
+        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
+            outref(sz);
+        RandomVec<T>(sz, x.data(), -2.f, 2.f);
+        RandomVec<T>(left, mean.data(), -2.f, 2.f);
+        RandomVec<T>(left, var.data(), -2.f, 2.f);
+        RandomVec<T>(right, scale.data(), -2.f, 2.f);
+        RandomVec<T>(right, bias.data(), -2.f, 2.f);
+
+        const T* scale_data = scale.data();
+        const T* bias_data = bias.data();
+        T* x_data = x.data();
+        T* mean_data = mean.data();
+        T* var_data = var.data();
+        T* outref_data = outref.data();
+
+        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+
+        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
+                     std::vector<T>, std::vector<T>, std::vector<T>,
+                     std::vector<T>, std::vector<T>, int, float>(
+            right, x, outref, mean, var, scale, bias, left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestCRFDecodingKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
+      std::vector<int> trackref(x_sz);
+      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
+
+      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
+          trackref.data(), tag_num);
+
+      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
+                   std::vector<T>, std::vector<T>, std::vector<T>,
+                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
+                                          trackref, tag_num);
+    }
+  }
+}
+
 // XYZNTuple
 TEST(JITKernel, kVMul) {
   TestXYZNKernel<jit::kVMul, float, CPUPlace>();
@@ -761,7 +883,16 @@ TEST(JITKernel, kNCHW16CMulNC) {
   TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
 }
 
-// TODO(yihua/TJ): add crf decoding and layer norm unit tests
+TEST(JITKernel, kLayerNorm) {
+  TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
+  TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kCRFDecoding) {
+  TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
+  TestCRFDecodingKernel<jit::kCRFDecoding, double,
+                        paddle::platform::CPUPlace>();
+}
 
 TEST(JITKernel, pool) {
   // TODO(TJ): add some test