From d8e00facf7884c20257d1faa1fc92620be048e7b Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 25 Jul 2018 23:46:48 +0800
Subject: [PATCH 01/40] reuse im_size

---
 paddle/fluid/operators/math/im2col.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index bb55ce21b0..101e046acb 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -54,12 +54,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
         dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) {
       int col_matrix_width = output_width * output_height;
+      int im_size = im_height * im_width;
       size_t copy_size = sizeof(T) * output_width;
       for (int oh = 0; oh < output_height; ++oh) {
         const T* im_data_start = im_data + oh * im_width;
         T* dst_data = col_data + oh * output_width;
         for (int ic = 0; ic < im_channels; ++ic) {
-          const T* src_data = im_data_start + ic * im_height * im_width;
+          const T* src_data = im_data_start + ic * im_size;
           for (int kh = 0; kh < filter_height; ++kh) {
             for (int kw = 0; kw < filter_width; ++kw) {
               std::memcpy(dst_data, src_data + kw, copy_size);

From 660df122ceb80fb522dbf6baf30aa574c13ce5bf Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 25 Jul 2018 23:59:24 +0800
Subject: [PATCH 02/40] enable padding!=0 and fill height padding with 0

---
 paddle/fluid/operators/math/im2col.cc | 64 ++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 101e046acb..311401b3d7 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -48,29 +48,63 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     const T* im_data = im.data<T>();
     T* col_data = col->data<T>();
     // TODO(TJ): change me to template
-    // further optimaze:
-    // 1. padding != 1
-    // 2. could also support stride_h != 1
+    // further optimize: padding == 1 need special
     if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
-        dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) {
+        dilation[1] == 1) {
       int col_matrix_width = output_width * output_height;
       int im_size = im_height * im_width;
-      size_t copy_size = sizeof(T) * output_width;
-      for (int oh = 0; oh < output_height; ++oh) {
-        const T* im_data_start = im_data + oh * im_width;
-        T* dst_data = col_data + oh * output_width;
-        for (int ic = 0; ic < im_channels; ++ic) {
-          const T* src_data = im_data_start + ic * im_size;
-          for (int kh = 0; kh < filter_height; ++kh) {
+      if (padding[0] == 0 && padding[1] == 0) {
+        size_t copy_size = sizeof(T) * output_width;
+        for (int oh = 0; oh < output_height; ++oh) {
+          const T* im_data_start = im_data + oh * im_width;
+          T* dst_data = col_data + oh * output_width;
+          for (int ic = 0; ic < im_channels; ++ic) {
+            const T* src_data = im_data_start + ic * im_size;
+            for (int kh = 0; kh < filter_height; ++kh) {
+              for (int kw = 0; kw < filter_width; ++kw) {
+                std::memcpy(dst_data, src_data + kw, copy_size);
+                dst_data = dst_data + col_matrix_width;
+              }
+              src_data = src_data + im_width;
+            }
+          }
+        }
+        return;
+      } else {
+        int plh = padding[0];
+        // int plw = padding[1];
+        int prh =
+            (output_height - 1) * stride[0] + filter_height - im_height - plh;
+        // int prw =  (output_width - 1) * stride[1] + filter_width - im_width -
+        // plw;
+
+        // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
+        // TODO(TJ): reuse sizes
+        assert(plh == prh);  // because stride_h == 1
+        for (int ph = 0; ph < plh; ++ph) {
+          size_t sz = sizeof(T) * output_width * (plh - ph);
+          T* col_start_l = col_data + ph * filter_width * col_matrix_width;
+          T* col_start_r =
+              col_data +
+              (filter_width - ph - 1) * filter_width * col_matrix_width +
+              col_matrix_width - output_width * (plh - ph);
+          for (int ic = 0; ic < im_channels; ++ic) {
+            T* dst_data_l =
+                col_start_l +
+                ic * filter_width * filter_height * col_matrix_width;
+            T* dst_data_r =
+                col_start_r +
+                ic * filter_width * filter_height * col_matrix_width;
             for (int kw = 0; kw < filter_width; ++kw) {
-              std::memcpy(dst_data, src_data + kw, copy_size);
-              dst_data = dst_data + col_matrix_width;
+              std::memset(dst_data_l, 0, sz);
+              std::memset(dst_data_r, 0, sz);
+              dst_data_l = dst_data_l + col_matrix_width;
+              dst_data_r = dst_data_r + col_matrix_width;
             }
-            src_data = src_data + im_width;
           }
         }
+        return;
       }
-      return;
     }
 
     for (int c = 0; c < channels_col; ++c) {

From 92518c519fd56065ef142b5e21cfe350648d1c47 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 00:25:19 +0800
Subject: [PATCH 03/40] reuse sizes saving time

---
 paddle/fluid/operators/math/im2col.cc | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 311401b3d7..90783ba1c6 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -79,25 +79,22 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
         // plw;
 
         // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
-        // TODO(TJ): reuse sizes
+        // TODO(TJ): refine ph*xxx
         assert(plh == prh);  // because stride_h == 1
+        int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
+        int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
         for (int ph = 0; ph < plh; ++ph) {
-          size_t sz = sizeof(T) * output_width * (plh - ph);
-          T* col_start_l = col_data + ph * filter_width * col_matrix_width;
-          T* col_start_r =
-              col_data +
-              (filter_width - ph - 1) * filter_width * col_matrix_width +
-              col_matrix_width - output_width * (plh - ph);
+          int sz = output_width * (plh - ph);
+          size_t copy_sz = sizeof(T) * sz;
+          T* col_start_l = col_data + ph * col_block_fh;
+          T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh +
+                           col_matrix_width - sz;
           for (int ic = 0; ic < im_channels; ++ic) {
-            T* dst_data_l =
-                col_start_l +
-                ic * filter_width * filter_height * col_matrix_width;
-            T* dst_data_r =
-                col_start_r +
-                ic * filter_width * filter_height * col_matrix_width;
+            T* dst_data_l = col_start_l + ic * col_block_ic;
+            T* dst_data_r = col_start_r + ic * col_block_ic;
             for (int kw = 0; kw < filter_width; ++kw) {
-              std::memset(dst_data_l, 0, sz);
-              std::memset(dst_data_r, 0, sz);
+              std::memset(dst_data_l, 0, copy_sz);
+              std::memset(dst_data_r, 0, copy_sz);
               dst_data_l = dst_data_l + col_matrix_width;
               dst_data_r = dst_data_r + col_matrix_width;
             }

From 5b183557f321c1bb43541fede5091be612cfedf0 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 25 Jul 2018 09:44:44 +0800
Subject: [PATCH 04/40] graph viz pass

---
 paddle/fluid/framework/CMakeLists.txt         |  2 +-
 paddle/fluid/framework/ir/CMakeLists.txt      |  1 +
 paddle/fluid/framework/ir/graph_viz_pass.cc   | 66 +++++++++++++++++++
 paddle/fluid/framework/ir/graph_viz_pass.h    | 44 +++++++++++++
 paddle/fluid/framework/parallel_executor.cc   | 11 ++++
 .../unittests/parallel_executor_test_base.py  |  1 +
 .../unittests/test_parallel_executor_mnist.py | 42 ++++++------
 7 files changed, 147 insertions(+), 20 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.cc
 create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 93ec047c80..f3c1e7c528 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -99,7 +99,7 @@ else()
 endif()
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 6447452ae5..a6bdd12b63 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -2,5 +2,6 @@ cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node)
+cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
new file mode 100644
index 0000000000..c839ebadac
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> GraphVizPass::Apply(
+    std::unique_ptr<ir::Graph> graph) const {
+  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path_));
+  PADDLE_ENFORCE(fout->good());
+  std::ostream& sout = *fout;
+
+  size_t var_id = 0;
+  std::unordered_map<const ir::Node*, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kVariable) continue;
+    size_t cur_var_id = var_id++;
+    vars[n] = cur_var_id;
+
+    sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]"
+         << std::endl;
+  }
+
+  size_t op_id = 0;
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : n->inputs) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : n->outputs) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
new file mode 100644
index 0000000000..08c534f417
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class GraphVizPass : public Pass {
+ public:
+  explicit GraphVizPass(const std::string& graph_viz_path)
+      : graph_viz_path_(graph_viz_path) {}
+
+  std::unique_ptr<ir::Graph> Apply(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+ private:
+  const std::string graph_viz_path_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 02c836bea1..fbd5acc3e5 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
@@ -133,7 +134,17 @@ ParallelExecutor::ParallelExecutor(
   }
   builder_ = builder_factory.Create();
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+  if (!build_strategy.debug_graphviz_path_.empty()) {
+    const std::string origin_graph_path = string::Sprintf(
+        "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph");
+    graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph));
+  }
   graph = builder_->Apply(std::move(graph));
+  if (!build_strategy.debug_graphviz_path_.empty()) {
+    const std::string origin_graph_path = string::Sprintf(
+        "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec");
+    graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph));
+  }
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index fcf86cc583..c5e69e41be 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -71,6 +71,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             exec_strategy.allow_op_delay = allow_op_delay
 
             build_strategy = fluid.BuildStrategy()
+            build_strategy.debug_graphviz_path = "/tmp/graphviz"
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 76389d916f..d740eb5443 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -152,16 +152,6 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             use_reduce=use_reduce)
 
-    def test_simple_fc(self):
-        # use_cuda
-        self.check_simple_fc_convergence(True)
-        self.check_simple_fc_convergence(False)
-
-    def test_simple_fc_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self._compare_reduce_and_allreduce(simple_fc_net, True)
-        self._compare_reduce_and_allreduce(simple_fc_net, False)
-
     def check_simple_fc_parallel_accuracy(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -188,10 +178,6 @@ class TestMNIST(TestParallelExecutorBase):
         for p_l in parallel_last_loss:
             self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
 
-    def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(True)
-        self.check_simple_fc_parallel_accuracy(False)
-
     def check_batchnorm_fc_convergence(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -206,13 +192,31 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda)
 
-    def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(True)
-        self.check_batchnorm_fc_convergence(False)
+    def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        self.check_network_convergence(
+            fc_with_batchnorm, use_cuda=use_cuda, use_reduce=False)
+        """
+        img, label = self._init_data()
+
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+        """
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
+        self.check_batchnorm_fc_convergence_use_reduce(True)
+        # self.check_batchnorm_fc_convergence_use_reduce(False)
 
 
 if __name__ == '__main__':

From 142e832d21715c0ce651e4ac04f10554945e5ad7 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 25 Jul 2018 19:59:24 +0800
Subject: [PATCH 05/40] pass registration

---
 .../details/multi_devices_graph_builder.cc    |  31 ++---
 .../details/multi_devices_graph_builder.h     |  27 ++--
 .../details/ssa_graph_builder_factory.cc      |  33 ++---
 .../framework/details/ssa_graph_checker.h     |  12 +-
 .../framework/details/ssa_graph_printer.h     |  34 ++---
 paddle/fluid/framework/ir/graph_viz_pass.cc   |   6 +-
 paddle/fluid/framework/ir/graph_viz_pass.h    |   6 -
 paddle/fluid/framework/ir/pass.cc             |   9 +-
 paddle/fluid/framework/ir/pass.h              | 117 +++++++++++++++++-
 paddle/fluid/framework/parallel_executor.cc   |  20 ++-
 10 files changed, 191 insertions(+), 104 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 22f0cb20d0..4fad520f40 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -34,30 +34,16 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+void MultiDevSSAGraphBuilder::Init() const {
+  loss_var_name_ = Get<std::string>("loss_var_name");
+  places_ = Get<std::vector<platform::Place>>("places");
+  local_scopes_ = Get<std::vector<Scope *>>("local_scopes");
+  strategy_ = Get<BuildStrategy>("strategy");
 #ifdef PADDLE_WITH_CUDA
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs),
-      strategy_(strategy) {
-#else
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      strategy_(strategy) {
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
-  for (auto &p : params) {
+
+  for (auto &p : Get<std::unordered_set<std::string>>("params")) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
@@ -241,6 +227,7 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
 
 std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
     std::unique_ptr<ir::Graph> graph) const {
+  Init();
   // Give the topology sort order and rebuild the graph structure.
   std::vector<ir::Node *> sorted_ops = SortOpsAndDelayOptimizeOp(*graph);
   auto nodes = graph->ReleaseNodes();
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 55076f227b..c8c1b2a438 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -32,20 +32,6 @@ namespace details {
 
 class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  public:
-#ifdef PADDLE_WITH_CUDA
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          platform::NCCLContextMap *nccl_ctxs,
-                          const BuildStrategy &strategy);
-#else
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          const BuildStrategy &strategy);
-#endif
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override;
   int GetVarDeviceID(const std::string &varname) const override;
@@ -53,15 +39,16 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  private:
   void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                          size_t device_id) const;
+  void Init() const;
 
  private:
-  std::string loss_var_name_;
-  const std::vector<platform::Place> &places_;
-  const std::vector<Scope *> &local_scopes_;
-  std::unordered_set<std::string> grad_names_;
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
 
 #ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nccl_ctxs_;
+  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
   bool IsScaleLossOp(ir::Node *node) const;
@@ -113,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::vector<std::string> &var_names) const;
 
  private:
-  BuildStrategy strategy_;
+  mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
   mutable std::unordered_map<std::string, int> var_name_on_devices_;
   mutable std::vector<int64_t> balance_vars_;
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
index b4b49d3de6..e8d83943ac 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
@@ -22,26 +22,29 @@ namespace paddle {
 namespace framework {
 namespace details {
 std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
-  std::unique_ptr<SSAGraphBuilder> res(
+  std::unique_ptr<SSAGraphBuilder> res(new MultiDevSSAGraphBuilder);
+  res->SetNotOwned<std::vector<platform::Place>>("places", &places_);
+  res->SetNotOwned<std::string>("loss_var_name", &loss_var_name_);
+  res->SetNotOwned<std::unordered_set<std::string>>("params", &param_names_);
+  res->SetNotOwned<std::vector<Scope *>>("local_scopes", &local_scopes_);
+  res->SetNotOwned<BuildStrategy>("strategy", &strategy_);
 #ifdef PADDLE_WITH_CUDA
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, nccl_ctxs_, strategy_)
-#else
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, strategy_)
+  res->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nccl_ctxs_);
 #endif
-          );  // NOLINT
 
   if (!strategy_.debug_graphviz_path_.empty()) {
-    std::unique_ptr<std::ostream> fout(
-        new std::ofstream(strategy_.debug_graphviz_path_));
-    PADDLE_ENFORCE(fout->good());
-    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
-        new GraphvizSSAGraphPrinter());
-    res.reset(new SSAGraghBuilderWithPrinter(
-        std::move(fout), std::move(graphviz_printer), std::move(res)));
+    SSAGraphBuilder *previous_pass = res.release();
+    res.reset(new SSAGraghBuilderWithPrinter);
+    res->Set<SSAGraphBuilder>("previous_pass", previous_pass);
+    res->SetNotOwned<std::string>("debug_graphviz_path",
+                                  &strategy_.debug_graphviz_path_);
+    res->Set<GraphvizSSAGraphPrinter>("graph_printer",
+                                      new GraphvizSSAGraphPrinter);
   }
-  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
+
+  SSAGraphBuilder *previous_pass = res.release();
+  res.reset(new SSAGraghBuilderWithChecker);
+  res->Set<SSAGraphBuilder>("previous_pass", previous_pass);
 
   return res;
 }
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 51ce6e5eca..ae5ad16b0c 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -24,25 +24,19 @@ namespace details {
 
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
-  explicit SSAGraghBuilderWithChecker(
-      std::unique_ptr<SSAGraphBuilder>&& builder)
-      : builder_(std::move(builder)) {}
-
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph = builder_->Apply(std::move(graph));
+    auto new_graph =
+        Get<SSAGraphBuilder>("previous_pass").Apply(std::move(graph));
     PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
     return new_graph;
   }
 
   int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
+    return Get<SSAGraphBuilder>("previous_pass").GetVarDeviceID(var_name);
   }
 
   bool IsValidGraph(const ir::Graph* graph) const;
-
- private:
-  std::unique_ptr<SSAGraphBuilder> builder_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index a77c1bad3f..2a939ef4c9 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -14,7 +14,9 @@
 
 #pragma once
 
+#include <fstream>
 #include <iosfwd>
+#include <ostream>
 #include <string>
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
@@ -35,37 +37,21 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 
 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
  public:
-  SSAGraghBuilderWithPrinter(std::ostream& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ref_(sout) {}
-
-  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ptr_(std::move(sout)),
-        stream_ref_(*stream_ptr_) {}
-
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph = builder_->Apply(std::move(graph));
-    printer_->Print(*new_graph, stream_ref_);
+    auto new_graph =
+        Get<SSAGraphBuilder>("previous_pass").Apply(std::move(graph));
+
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(Get<std::string>("debug_graphviz_path")));
+    PADDLE_ENFORCE(fout->good());
+    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*new_graph, *fout);
     return new_graph;
   }
 
   int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
+    return Get<SSAGraphBuilder>("previous_pass").GetVarDeviceID(var_name);
   }
-
- private:
-  std::unique_ptr<SSAGraphPrinter> printer_;
-  std::unique_ptr<SSAGraphBuilder> builder_;
-  std::unique_ptr<std::ostream> stream_ptr_;
-  std::ostream& stream_ref_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index c839ebadac..7d1cff7178 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -23,7 +23,8 @@ namespace ir {
 
 std::unique_ptr<ir::Graph> GraphVizPass::Apply(
     std::unique_ptr<ir::Graph> graph) const {
-  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path_));
+  const std::string graph_viz_path = Get<std::string>("graph_viz_path");
+  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
   PADDLE_ENFORCE(fout->good());
   std::ostream& sout = *fout;
 
@@ -61,6 +62,9 @@ std::unique_ptr<ir::Graph> GraphVizPass::Apply(
   sout << "}\n";
   return graph;
 }
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 08c534f417..04c0c35d12 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -29,14 +29,8 @@ namespace ir {
 
 class GraphVizPass : public Pass {
  public:
-  explicit GraphVizPass(const std::string& graph_viz_path)
-      : graph_viz_path_(graph_viz_path) {}
-
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override;
-
- private:
-  const std::string graph_viz_path_;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index c05d7d0bb5..0e68ecb56f 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -15,5 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+namespace ir {
+PassRegistry& PassRegistry::Instance() {
+  static PassRegistry g_pass_info_map;
+  return g_pass_info_map;
+}
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index f52ba788d5..9466924262 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -14,9 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
+#include <map>
+#include <string>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
@@ -25,10 +30,120 @@ namespace ir {
 class Pass {
  public:
   Pass() = default;
-  virtual ~Pass() {}
+  virtual ~Pass() {
+    for (auto &attr : attrs_) {
+      if (attr_dels_.find(attr.first) != attr_dels_.end()) {
+        attr_dels_[attr.first]();
+      }
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
 
   virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const {
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+  }
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    attrs_[attr_name] = attr;
+  }
+
+ private:
+  std::map<std::string, boost::any> attrs_;
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+using PassCreator = std::function<std::unique_ptr<Pass>()>;
+
+class Registrar {
+ public:
+  // In our design, each kind of pass has a corresponding registry and
+  // registrar. Registration happens in the constructor of a global registrar
+  // variable. That variable is not referenced by the code that uses the
+  // framework package, so the linker may remove it from the generated binary.
+  // To avoid such removal, every registrar class provides Touch(), and the
+  // USE_PASS macro calls it through the TouchPassRegistrar_<pass_type>()
+  // function defined by REGISTER_PASS. As long as the calling code invokes
+  // USE_PASS, the global registrar variable is kept by the linker.
+  void Touch() {}
 };
+
+class PassRegistry {
+ public:
+  static PassRegistry &Instance();
+
+  bool Has(const std::string &pass_type) const {
+    return map_.find(pass_type) != map_.end();
+  }
+
+  void Insert(const std::string &type, const PassCreator &pass_creator) {
+    PADDLE_ENFORCE(!Has(type), "Pass %s has been registered", type);
+    map_.insert({type, pass_creator});
+  }
+
+  std::unique_ptr<Pass> Get(const std::string &type) const {
+    PADDLE_ENFORCE(Has(type), "Pass %s has not been registered", type);
+    return map_.at(type)();
+  }
+
+ private:
+  PassRegistry() = default;
+  std::unordered_map<std::string, PassCreator> map_;
+
+  DISABLE_COPY_AND_ASSIGN(PassRegistry);
+};
+
+template <typename PassType>
+struct PassRegistrar : public Registrar {
+  explicit PassRegistrar(const char *pass_type) {
+    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
+                   "'%s' is registered more than once.", pass_type);
+    PassRegistry::Instance().Insert(pass_type, []() -> std::unique_ptr<Pass> {
+      return std::unique_ptr<Pass>(new PassType());
+    });
+  }
+};
+
+#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+#define REGISTER_PASS(pass_type, pass_class)                \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                      \
+      __reg_pass__##pass_type,                              \
+      "REGISTER_PASS must be called in global namespace");  \
+  static ::paddle::framework::ir::PassRegistrar<pass_class> \
+      __pass_registrar_##pass_type##__(#pass_type);         \
+  int TouchPassRegistrar_##pass_type() {                    \
+    __pass_registrar_##pass_type##__.Touch();               \
+    return 0;                                               \
+  }
+
+#define USE_PASS(pass_type)                                           \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __use_pass_itself_##pass_type,                                  \
+      "USE_PASS must be called in global namespace");                 \
+  extern int TouchPassRegistrar_##pass_type();                        \
+  static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \
+      TouchPassRegistrar_##pass_type()
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index fbd5acc3e5..ff661d0013 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -132,19 +132,27 @@ ParallelExecutor::ParallelExecutor(
     PADDLE_THROW("Not compiled with CUDA.");
 #endif
   }
-  builder_ = builder_factory.Create();
+
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   if (!build_strategy.debug_graphviz_path_.empty()) {
-    const std::string origin_graph_path = string::Sprintf(
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
         "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph));
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
   }
+
+  builder_ = builder_factory.Create();
   graph = builder_->Apply(std::move(graph));
+
   if (!build_strategy.debug_graphviz_path_.empty()) {
-    const std::string origin_graph_path = string::Sprintf(
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
         "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec");
-    graph = ir::GraphVizPass(origin_graph_path).Apply(std::move(graph));
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
   }
+
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -297,3 +305,5 @@ ParallelExecutor::~ParallelExecutor() {
 
 }  // namespace framework
 }  // namespace paddle
+
+USE_PASS(graph_viz_pass);

From e3131e2d73252280e38b3096ca681951a122dddd Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 10:27:00 +0800
Subject: [PATCH 06/40] enable width padding

---
 paddle/fluid/operators/math/im2col.cc | 64 +++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 90783ba1c6..c29a137319 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -72,11 +72,11 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
         return;
       } else {
         int plh = padding[0];
-        // int plw = padding[1];
+        int plw = padding[1];
         int prh =
             (output_height - 1) * stride[0] + filter_height - im_height - plh;
-        // int prw =  (output_width - 1) * stride[1] + filter_width - im_width -
-        // plw;
+        int prw =
+            (output_width - 1) * stride[1] + filter_width - im_width - plw;
 
         // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
         // TODO(TJ): refine ph*xxx
@@ -100,6 +100,64 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
             }
           }
         }
+
+        // fill width padding
+        assert(plw == prw);  // because stride_w == 1
+        if (plw == 1) {
+          auto pad = static_cast<T>(0);  // padding zero
+          for (int ic = 0; ic < im_channels; ++ic) {
+            // TODO(TJ): use add and reuse stride
+            T* dst_data_ic = col_data + ic * col_block_ic;
+            for (int kh = 0; kh < filter_height; ++kh) {
+              T* dst_data_kh = dst_data_ic + kh * col_block_fh;
+              for (T* dst_data :
+                   {dst_data_kh, dst_data_kh +
+                                     (filter_width - prw) * col_matrix_width +
+                                     output_width - 1}) {
+                // TODO(TJ): from plh, saving repeated assignment
+                for (int oh = 0; oh < output_height; ++oh) {
+                  *dst_data = pad;
+                  dst_data = dst_data + output_width;
+                }
+              }
+            }
+          }
+        } else {
+          // padding_size > 1
+          for (int ic = 0; ic < im_channels; ++ic) {
+            // TODO(TJ): use add and reuse stride
+            T* dst_data_ic =
+                col_data + ic * filter_width * filter_height * col_matrix_width;
+            for (int kh = 0; kh < filter_height; ++kh) {
+              T* dst_data_kh =
+                  dst_data_ic + kh * filter_width * col_matrix_width;
+              for (int kw = 0; kw < plw; ++kw) {
+                // TODO(TJ): reuse array outside this for
+                size_t sz = sizeof(T) * (plw - kw);
+                T* dst_data = dst_data_kh + kw * col_matrix_width;
+                // TODO(TJ): from plh, saving repeated assignment
+                for (int oh = 0; oh < output_height; ++oh) {
+                  std::memset(dst_data, 0, sz);
+                  dst_data = dst_data + output_width;
+                }
+              }
+              // TODO(TJ): use reverse to save cache
+              for (int kw = 0; kw < prw; ++kw) {
+                // TODO(TJ): reuse array outside this for
+                auto num = (prw - kw);
+                size_t sz = sizeof(T) * num;
+                T* dst_data = dst_data_kh +
+                              (filter_width - 1 - kw) * col_matrix_width +
+                              output_width - num;
+                // TODO(TJ): from plh, saving repeated assignment
+                for (int oh = 0; oh < output_height; ++oh) {
+                  std::memset(dst_data, 0, sz);
+                  dst_data = dst_data + output_width;
+                }
+              }
+            }
+          }
+        }
         return;
       }
     }

From e4d7d7ae8fd7198447df21188d3fd85868c8bafa Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Thu, 26 Jul 2018 11:06:12 +0800
Subject: [PATCH 07/40] pass refactoring

---
 .../details/multi_devices_graph_builder.cc    | 46 ++++++---
 .../details/multi_devices_graph_builder.h     |  6 +-
 .../scope_buffered_ssa_graph_executor.h       |  3 +
 .../framework/details/ssa_graph_builder.h     |  4 +-
 .../details/ssa_graph_builder_factory.cc      | 12 +--
 .../details/ssa_graph_builder_factory.h       | 14 +--
 .../framework/details/ssa_graph_checker.cc    |  3 +
 .../framework/details/ssa_graph_checker.h     |  7 +-
 .../framework/details/ssa_graph_executor.h    |  4 +-
 .../framework/details/ssa_graph_printer.cc    |  3 +
 .../framework/details/ssa_graph_printer.h     |  7 +-
 .../details/threaded_ssa_graph_executor.h     |  1 +
 paddle/fluid/framework/ir/graph.h             |  2 +
 paddle/fluid/framework/ir/pass.h              |  2 +
 paddle/fluid/framework/parallel_executor.cc   | 98 ++++++++++++-------
 paddle/fluid/framework/parallel_executor.h    |  1 -
 .../operators/distributed/send_recv.proto     | 97 ++++++++++++++++++
 17 files changed, 229 insertions(+), 81 deletions(-)
 create mode 100644 paddle/fluid/operators/distributed/send_recv.proto

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4fad520f40..d211f02689 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -244,6 +244,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
   result.Set("vars", new GraphVars(places_.size()));
   result.Set("dep_vars", new GraphDepVars);
   result.Set("ops", new GraphOps);
+  result.Set("sharded_var_device", new ShardedVarDevice);
 
   // find send/recv vars so that we can place the distributed training
   // realted op in the place 0
@@ -276,11 +277,12 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
       // the block.
       is_forwarding = false;
     } else {
-      int op_dev_id = GetOpDeviceID(node);
+      int op_dev_id = GetOpDeviceID(result, node);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
         CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
-          var_name_on_devices_.emplace(n->Name(), op_dev_id);
+          graph->Get<ShardedVarDevice>("sharded_var_device")
+              .emplace(n->Name(), op_dev_id);
         }
       } else {
         // This op runs on all devices, and its output may have parameter's
@@ -317,7 +319,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
                   case BuildStrategy::ReduceStrategy::kReduce:
                     cur_device_id = GetAppropriateDeviceID({g_name});
                     CreateReduceOp(&result, g_name, cur_device_id);
-                    var_name_on_devices_.emplace(g_name, cur_device_id);
+                    graph->Get<ShardedVarDevice>("sharded_var_device")
+                        .emplace(g_name, cur_device_id);
                     bcast_var_name_set[cur_device_id].emplace(p_name);
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
@@ -499,7 +502,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
-int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
+                                           ir::Node *node) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
@@ -512,15 +516,17 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
       node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
   PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(param_grad[1]);
+  int dev_id = GetVarDeviceID(graph, param_grad[1]);
   PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
                     node->Op()->Type(), param_grad[0], param_grad[1]);
   return dev_id;
 }
 
-int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
-  auto got = var_name_on_devices_.find(varname);
-  return got == var_name_on_devices_.end() ? -1 : got->second;
+int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
+                                            const std::string &varname) const {
+  auto &sharded_var_device = graph.Get<ShardedVarDevice>("sharded_var_device");
+  auto got = sharded_var_device.find(varname);
+  return got == sharded_var_device.end() ? -1 : got->second;
 }
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
@@ -625,20 +631,23 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   if (node->Op()->Type() == "split_byref" ||
       node->Op()->Type() == "split_selected_rows") {
     // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+        result->Get<ShardedVarDevice>("sharded_var_device")
+            .emplace(varname, op_dev_id);
       }
     }
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>("sharded_var_device")
+          .emplace(varname, op_dev_id);
     }
   } else if (node->Op()->Type() == "concat") {
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>("sharded_var_device")
+          .emplace(varname, op_dev_id);
     }
   } else {
     PADDLE_ENFORCE(
@@ -663,7 +672,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
+    op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name());
     PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
                    "This hack no longer holds, please fix.");
     // the variable name which contains .block means it was splited by
@@ -678,7 +687,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       }
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+        result->Get<ShardedVarDevice>("sharded_var_device")
+            .emplace(varname, op_dev_id);
       }
     }
   } else if (node->Op()->Type() == "recv") {
@@ -688,7 +698,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     }
     op_dev_id = GetAppropriateDeviceID(output_var_names);
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>("sharded_var_device")
+          .emplace(varname, op_dev_id);
     }
   } else {
     // send_barrier and fetch_barrier op can be scheduled on device 0
@@ -730,3 +741,6 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_pass,
+              paddle::framework::details::MultiDevSSAGraphBuilder);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index c8c1b2a438..baea091af3 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -34,7 +34,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  public:
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override;
-  int GetVarDeviceID(const std::string &varname) const override;
 
  private:
   void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
@@ -51,6 +50,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
+  int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const;
+
   bool IsScaleLossOp(ir::Node *node) const;
 
   void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
@@ -84,7 +85,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
-  int GetOpDeviceID(ir::Node *node) const;
+  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
 
   void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
 
@@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  private:
   mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
-  mutable std::unordered_map<std::string, int> var_name_on_devices_;
   mutable std::vector<int64_t> balance_vars_;
 
   void SetCommunicationContext(OpHandleBase *op_handle,
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index cbfbcb1c0c..1b188aec59 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -40,6 +40,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
       ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+
+  const ir::Graph& Graph() const { return underlying_executor_->Graph(); }
+
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
  private:
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 2b4f31f2ff..e0ad027315 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -47,13 +47,13 @@ typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
 // unordered.
 typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
 
+typedef std::unordered_map<std::string, int> ShardedVarDevice;
+
 class SSAGraphBuilder : public ir::Pass {
  public:
   SSAGraphBuilder() {}
   virtual ~SSAGraphBuilder() {}
 
-  virtual int GetVarDeviceID(const std::string &var_name) const = 0;
-
   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
 
  protected:
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
index e8d83943ac..2254a3b41e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
@@ -21,8 +21,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
-std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
-  std::unique_ptr<SSAGraphBuilder> res(new MultiDevSSAGraphBuilder);
+std::unique_ptr<ir::Pass> ParallelExecutorPassManager::Create() {
+  std::unique_ptr<ir::Pass> res(new MultiDevSSAGraphBuilder);
   res->SetNotOwned<std::vector<platform::Place>>("places", &places_);
   res->SetNotOwned<std::string>("loss_var_name", &loss_var_name_);
   res->SetNotOwned<std::unordered_set<std::string>>("params", &param_names_);
@@ -33,18 +33,18 @@ std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
 #endif
 
   if (!strategy_.debug_graphviz_path_.empty()) {
-    SSAGraphBuilder *previous_pass = res.release();
+    ir::Pass *previous_pass = res.release();
     res.reset(new SSAGraghBuilderWithPrinter);
-    res->Set<SSAGraphBuilder>("previous_pass", previous_pass);
+    res->Set<ir::Pass>("previous_pass", previous_pass);
     res->SetNotOwned<std::string>("debug_graphviz_path",
                                   &strategy_.debug_graphviz_path_);
     res->Set<GraphvizSSAGraphPrinter>("graph_printer",
                                       new GraphvizSSAGraphPrinter);
   }
 
-  SSAGraphBuilder *previous_pass = res.release();
+  ir::Pass *previous_pass = res.release();
   res.reset(new SSAGraghBuilderWithChecker);
-  res->Set<SSAGraphBuilder>("previous_pass", previous_pass);
+  res->Set<ir::Pass>("previous_pass", previous_pass);
 
   return res;
 }
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
index 91a119de83..1bfc3e71e8 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
@@ -29,13 +29,13 @@ namespace framework {
 class Scope;
 namespace details {
 
-class SSAGraphBuilderFactory {
+class ParallelExecutorPassManager {
  public:
-  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
-                         const std::string& loss_var_name,
-                         const std::unordered_set<std::string>& param_names,
-                         const std::vector<Scope*>& local_scopes,
-                         const BuildStrategy& strategy)
+  ParallelExecutorPassManager(
+      const std::vector<platform::Place>& places,
+      const std::string& loss_var_name,
+      const std::unordered_set<std::string>& param_names,
+      const std::vector<Scope*>& local_scopes, const BuildStrategy& strategy)
       : places_(places),
         loss_var_name_(loss_var_name),
         param_names_(param_names),
@@ -52,7 +52,7 @@ class SSAGraphBuilderFactory {
   }
 #endif
 
-  std::unique_ptr<SSAGraphBuilder> Create();
+  std::unique_ptr<ir::Pass> Create();
 
  private:
   std::vector<platform::Place> places_;
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
index 0438b09610..2994329f48 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -85,3 +85,6 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_check_pass,
+              paddle::framework::details::SSAGraghBuilderWithChecker);
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index ae5ad16b0c..fb766fb415 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -26,16 +26,11 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph =
-        Get<SSAGraphBuilder>("previous_pass").Apply(std::move(graph));
+    auto new_graph = Get<ir::Pass>("previous_pass").Apply(std::move(graph));
     PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
     return new_graph;
   }
 
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return Get<SSAGraphBuilder>("previous_pass").GetVarDeviceID(var_name);
-  }
-
   bool IsValidGraph(const ir::Graph* graph) const;
 };
 
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 8815ec89b2..96fffb7d94 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -32,7 +32,9 @@ class SSAGraphExecutor {
 
   virtual ~SSAGraphExecutor();
 
-  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
+  virtual const ir::Graph& Graph() const = 0;
+
+  virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
index 20aab14644..95d0641d72 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_print_pass,
+              paddle::framework::details::SSAGraghBuilderWithPrinter);
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index 2a939ef4c9..b7d20aa983 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -39,8 +39,7 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
  public:
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph =
-        Get<SSAGraphBuilder>("previous_pass").Apply(std::move(graph));
+    auto new_graph = Get<ir::Pass>("previous_pass").Apply(std::move(graph));
 
     std::unique_ptr<std::ostream> fout(
         new std::ofstream(Get<std::string>("debug_graphviz_path")));
@@ -48,10 +47,6 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
     Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*new_graph, *fout);
     return new_graph;
   }
-
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return Get<SSAGraphBuilder>("previous_pass").GetVarDeviceID(var_name);
-  }
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 3d67daa45e..82d6b5272a 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -42,6 +42,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                            const std::vector<platform::Place> &places,
                            std::unique_ptr<ir::Graph> &&graph);
 
+  const ir::Graph &Graph() const { return *graph_; }
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 4f59ec82a7..49f39df4b9 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -42,6 +42,8 @@ class Graph {
 
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
+                   "%s attr not registered for graph.", attr_name);
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 9466924262..5ab7f9a1e2 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -44,6 +44,8 @@ class Pass {
 
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
+                   "%s attr not registered for pass.", attr_name);
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ff661d0013..a23fd2a41a 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -33,6 +33,48 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+#ifdef PADDLE_WITH_CUDA
+    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
+#else
+    const BuildStrategy &strategy) {
+#endif
+  details::ParallelExecutorPassManager builder_factory(
+      places, loss_var_name, param_names, local_scopes, strategy);
+  if (use_cuda) {
+#ifdef PADDLE_WITH_CUDA
+    builder_factory.SetNCCLContextMap(nccl_ctxs);
+#else
+    PADDLE_THROW("Not compiled with CUDA.");
+#endif
+  }
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
+  }
+
+  auto builder = builder_factory.Create();
+  graph = builder->Apply(std::move(graph));
+
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy.debug_graphviz_path_.c_str(), "_before_exec");
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
+  }
+  return graph;
+}
+
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -120,38 +162,18 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }
 
-  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
-  // ncclOp
-  details::SSAGraphBuilderFactory builder_factory(
-      member_->places_, loss_var_name, params, member_->local_scopes_,
-      build_strategy);
-  if (member_->use_cuda_) {
+// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+// ncclOp
 #ifdef PADDLE_WITH_CUDA
-    builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy,
+      member_->nccl_ctxs_.get());
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy);
 #endif
-  }
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-  if (!build_strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
-
-  builder_ = builder_factory.Create();
-  graph = builder_->Apply(std::move(graph));
-
-  if (!build_strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", build_strategy.debug_graphviz_path_.c_str(), "_before_exec");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
 
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));
@@ -165,11 +187,18 @@ void ParallelExecutor::BCastParamsToDevices(
   // the initializing bcast, all vars would be bcast from device(0),
   // otherwise
   // bcast from the specified device.
-  bool initializing = builder_.get() == nullptr ? true : false;
-
+  bool initializing = member_->executor_ ? false : true;
   for (auto &var : vars) {
-    int var_dev_id =
-        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
+    int var_dev_id = -1;
+    if (member_->executor_) {
+      auto &sharded_var_device =
+          member_->executor_->Graph().Get<details::ShardedVarDevice>(
+              "sharded_var_device");
+      if (sharded_var_device.find(var) != sharded_var_device.end()) {
+        var_dev_id = sharded_var_device.at(var);
+      }
+    }
+
     if (!initializing && var_dev_id == -1) continue;
 
     framework::Variable *main_var = nullptr;
@@ -307,3 +336,6 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace paddle
 
 USE_PASS(graph_viz_pass);
+USE_PASS(multi_device_pass);
+USE_PASS(multi_device_check_pass);
+USE_PASS(multi_device_print_pass);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ffb9934a2d..d624956acd 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -70,7 +70,6 @@ class ParallelExecutor {
 
  private:
   ParallelExecutorPrivate *member_;
-  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto
new file mode 100644
index 0000000000..d0595ef108
--- /dev/null
+++ b/paddle/fluid/operators/distributed/send_recv.proto
@@ -0,0 +1,97 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
+the Apache License, Version 2.0 (the "License"); you may not use this file
+except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto3";
+package sendrecv;
+
+option cc_generic_services = false;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
+
+  rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// LoDTensor
+// SelectedRows
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// NOTICE(gongwb): don't modify this proto if you are not
+//   familiar with how we serialize it in sendrecvop_utils.h
+//   and deserialize it in variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
+  // tensor data
+  bytes serialized = 8;
+  // selected_rows data
+  bytes rows = 9;
+  // Look up table block execution output variable name.
+  string out_varname = 10;
+  // If 1, the ps server starts profiling. When profile switches
+  // from 1 to 2, the ps server stops profiling and generates a
+  // profile to /tmp/profile_ps_*.
+  int64 profile = 11;
+}
+
+message VoidMessage {}

From aa1085ddc54c5dc40ae91468d5f2a2a7ecdf85cc Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Thu, 26 Jul 2018 12:54:55 +0800
Subject: [PATCH 08/40] all passes add doc

---
 doc/fluid/design/ir/draft.md                  | 38 ++++++++++
 paddle/fluid/framework/CMakeLists.txt         |  2 +-
 paddle/fluid/framework/details/CMakeLists.txt |  3 -
 .../details/multi_devices_graph_builder.cc    | 10 +--
 .../details/ssa_graph_builder_factory.cc      | 53 --------------
 .../details/ssa_graph_builder_factory.h       | 71 -------------------
 .../framework/details/ssa_graph_checker.h     |  5 +-
 .../framework/details/ssa_graph_printer.h     |  8 +--
 paddle/fluid/framework/ir/pass.h              |  5 ++
 paddle/fluid/framework/parallel_executor.cc   | 46 ++++++++----
 10 files changed, 87 insertions(+), 154 deletions(-)
 delete mode 100644 paddle/fluid/framework/details/ssa_graph_builder_factory.cc
 delete mode 100644 paddle/fluid/framework/details/ssa_graph_builder_factory.h

diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md
index a33b5a9c93..65bfaea6a1 100644
--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
@@ -71,6 +71,44 @@ is a `Graph` and its output is also a `Graph`. For example,
 a `Pass` can simply print out the `Graph`. A `Pass`
 can also fuse some `Graph`'s `Node`s.
 
+```cpp
+class Pass {
+ public:
+
+  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+
+  // Get a reference to the attributed previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) ;
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr);
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ public:
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+    // do something.
+    return graph;
+  }
+}
+REGISTER_PASS(my_pass, MyPass);
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
 #### Optimize
 
 `Optimize` contains a series of `Pass` with defined order.
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f3c1e7c528..d822a1c9c4 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -99,7 +99,7 @@ else()
 endif()
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 9df7df1f42..5d652d3730 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -31,9 +31,6 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
-
-cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
-
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index d211f02689..ff90f31cdb 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -35,15 +35,15 @@ namespace framework {
 namespace details {
 
 void MultiDevSSAGraphBuilder::Init() const {
-  loss_var_name_ = Get<std::string>("loss_var_name");
-  places_ = Get<std::vector<platform::Place>>("places");
-  local_scopes_ = Get<std::vector<Scope *>>("local_scopes");
-  strategy_ = Get<BuildStrategy>("strategy");
+  loss_var_name_ = Get<const std::string>("loss_var_name");
+  places_ = Get<const std::vector<platform::Place>>("places");
+  local_scopes_ = Get<const std::vector<Scope *>>("local_scopes");
+  strategy_ = Get<const BuildStrategy>("strategy");
 #ifdef PADDLE_WITH_CUDA
   nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
 
-  for (auto &p : Get<std::unordered_set<std::string>>("params")) {
+  for (auto &p : Get<const std::unordered_set<std::string>>("params")) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
deleted file mode 100644
index 2254a3b41e..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
-#include <fstream>
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
-#include "paddle/fluid/framework/details/ssa_graph_checker.h"
-#include "paddle/fluid/framework/details/ssa_graph_printer.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-std::unique_ptr<ir::Pass> ParallelExecutorPassManager::Create() {
-  std::unique_ptr<ir::Pass> res(new MultiDevSSAGraphBuilder);
-  res->SetNotOwned<std::vector<platform::Place>>("places", &places_);
-  res->SetNotOwned<std::string>("loss_var_name", &loss_var_name_);
-  res->SetNotOwned<std::unordered_set<std::string>>("params", &param_names_);
-  res->SetNotOwned<std::vector<Scope *>>("local_scopes", &local_scopes_);
-  res->SetNotOwned<BuildStrategy>("strategy", &strategy_);
-#ifdef PADDLE_WITH_CUDA
-  res->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nccl_ctxs_);
-#endif
-
-  if (!strategy_.debug_graphviz_path_.empty()) {
-    ir::Pass *previous_pass = res.release();
-    res.reset(new SSAGraghBuilderWithPrinter);
-    res->Set<ir::Pass>("previous_pass", previous_pass);
-    res->SetNotOwned<std::string>("debug_graphviz_path",
-                                  &strategy_.debug_graphviz_path_);
-    res->Set<GraphvizSSAGraphPrinter>("graph_printer",
-                                      new GraphvizSSAGraphPrinter);
-  }
-
-  ir::Pass *previous_pass = res.release();
-  res.reset(new SSAGraghBuilderWithChecker);
-  res->Set<ir::Pass>("previous_pass", previous_pass);
-
-  return res;
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
deleted file mode 100644
index 1bfc3e71e8..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
-#include "paddle/fluid/platform/place.h"
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-class Scope;
-namespace details {
-
-class ParallelExecutorPassManager {
- public:
-  ParallelExecutorPassManager(
-      const std::vector<platform::Place>& places,
-      const std::string& loss_var_name,
-      const std::unordered_set<std::string>& param_names,
-      const std::vector<Scope*>& local_scopes, const BuildStrategy& strategy)
-      : places_(places),
-        loss_var_name_(loss_var_name),
-        param_names_(param_names),
-        local_scopes_(local_scopes),
-        strategy_(strategy) {
-#ifdef PADDLE_WITH_CUDA
-    nccl_ctxs_ = nullptr;
-#endif
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
-    nccl_ctxs_ = nccl_ctxs;
-  }
-#endif
-
-  std::unique_ptr<ir::Pass> Create();
-
- private:
-  std::vector<platform::Place> places_;
-  std::string loss_var_name_;
-  std::unordered_set<std::string> param_names_;
-  std::vector<Scope*> local_scopes_;
-  BuildStrategy strategy_;
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap* nccl_ctxs_;
-#endif
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index fb766fb415..25891cf74d 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -26,9 +26,8 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph = Get<ir::Pass>("previous_pass").Apply(std::move(graph));
-    PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
-    return new_graph;
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
   }
 
   bool IsValidGraph(const ir::Graph* graph) const;
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index b7d20aa983..bd4498c061 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -39,13 +39,11 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
  public:
   std::unique_ptr<ir::Graph> Apply(
       std::unique_ptr<ir::Graph> graph) const override {
-    auto new_graph = Get<ir::Pass>("previous_pass").Apply(std::move(graph));
-
     std::unique_ptr<std::ostream> fout(
-        new std::ofstream(Get<std::string>("debug_graphviz_path")));
+        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
     PADDLE_ENFORCE(fout->good());
-    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*new_graph, *fout);
-    return new_graph;
+    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
+    return graph;
   }
 };
 
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 5ab7f9a1e2..f254ef62df 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,7 @@ class Pass {
 
   virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
 
+  // Get a reference to the attributed previously set.
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
@@ -49,6 +50,7 @@ class Pass {
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
     PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
@@ -59,6 +61,8 @@ class Pass {
     };
   }
 
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
   template <typename AttrType>
   void SetNotOwned(const std::string &attr_name, AttrType *attr) {
     PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
@@ -127,6 +131,7 @@ struct PassRegistrar : public Registrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+// Register a new pass that can be applied on the IR.
 #define REGISTER_PASS(pass_type, pass_class)                \
   STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                      \
       __reg_pass__##pass_type,                              \
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index a23fd2a41a..77bed5c999 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,7 +26,8 @@ limitations under the License. */
 #endif
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -43,16 +44,6 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
 #else
     const BuildStrategy &strategy) {
 #endif
-  details::ParallelExecutorPassManager builder_factory(
-      places, loss_var_name, param_names, local_scopes, strategy);
-  if (use_cuda) {
-#ifdef PADDLE_WITH_CUDA
-    builder_factory.SetNCCLContextMap(nccl_ctxs);
-#else
-    PADDLE_THROW("Not compiled with CUDA.");
-#endif
-  }
-
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   if (!strategy.debug_graphviz_path_.empty()) {
     auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
@@ -62,8 +53,37 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     graph = viz_pass->Apply(std::move(graph));
   }
 
-  auto builder = builder_factory.Create();
-  graph = builder->Apply(std::move(graph));
+  auto multi_device_pass =
+      ir::PassRegistry::Instance().Get("multi_device_pass");
+  multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
+                                                                     &places);
+  multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
+                                                    &loss_var_name);
+  multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
+      "params", &param_names);
+  multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                             &local_scopes);
+  multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+  multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+  graph = multi_device_pass->Apply(std::move(graph));
+
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto multi_device_print_pass =
+        ir::PassRegistry::Instance().Get("multi_device_print_pass");
+    multi_device_print_pass->SetNotOwned<const std::string>(
+        "debug_graphviz_path", &strategy.debug_graphviz_path_);
+    multi_device_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+        "graph_printer", new details::GraphvizSSAGraphPrinter);
+    graph = multi_device_print_pass->Apply(std::move(graph));
+  }
+
+  auto multi_device_check_pass =
+      ir::PassRegistry::Instance().Get("multi_device_check_pass");
+  graph = multi_device_check_pass->Apply(std::move(graph));
 
   if (!strategy.debug_graphviz_path_.empty()) {
     auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");

From 4eeed0b5e48db8bc822cd6de35a30e2a0d7705b4 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 15:01:21 +0800
Subject: [PATCH 09/40] refine width padding and enable core copy

---
 paddle/fluid/operators/math/im2col.cc | 67 +++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index c29a137319..be373c99d1 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -126,11 +126,9 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
           // padding_size > 1
           for (int ic = 0; ic < im_channels; ++ic) {
             // TODO(TJ): use add and resue stride
-            T* dst_data_ic =
-                col_data + ic * filter_width * filter_height * col_matrix_width;
+            T* dst_data_ic = col_data + ic * col_block_ic;
             for (int kh = 0; kh < filter_height; ++kh) {
-              T* dst_data_kh =
-                  dst_data_ic + kh * filter_width * col_matrix_width;
+              T* dst_data_kh = dst_data_ic + kh * col_block_fh;
               for (int kw = 0; kw < plw; ++kw) {
                 // TODO(TJ): reuse array outside this for
                 size_t sz = sizeof(T) * (plw - kw);
@@ -158,6 +156,67 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
             }
           }
         }
+
+        // fill im_data
+        // padding cover two cases:
+        // 1. kw > 2*pw: kw = 3, pw = 1
+        // 0 x x x x ... x x x x 0
+        // 1 1 1             1 1 1
+        // ==>
+        // 0 x ... x x
+        // x x ... x x
+        // x x ... x 0
+        // 2. kw < 2*pw: kw = 3, pw = 2
+        // 0 0 x x x ... x x x 0 0
+        // 1 1 1             1 1 1
+        // ==>
+        // 0 0 x ... x x x
+        // 0 x x ... x x 0
+        // x x x ... x 0 0
+
+        // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+        // (output_width-1)}
+        // length of copy_size is equal kw.
+        if (plw + prw < filter_width) {
+          for (int oh = 0; oh < output_height; ++oh) {
+            const T* im_data_start =
+                im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+            T* dst_data = col_data + oh * output_width;
+            for (int ic = 0; ic < im_channels; ++ic) {
+              const T* src_data = im_data_start + ic * im_size;
+              for (int kh = 0; kh < filter_height; ++kh) {
+                if ((oh < plh && kh < plh) ||
+                    (oh > (output_height - prh - 1) &&
+                     kh > (filter_height - prh - 1))) {
+                  dst_data = dst_data + filter_width * col_matrix_width;
+                  continue;
+                }
+                // TODO(TJ): reuse plw-kw outside this for
+                // try to unify
+                for (int kw = 0; kw < plw; ++kw) {
+                  std::memcpy(dst_data + (plw - kw), src_data,
+                              sizeof(T) * (output_width - (plw - kw)));
+                  dst_data = dst_data + col_matrix_width;
+                }
+                for (int kw = plw; kw < filter_width - prw; ++kw) {
+                  std::memcpy(dst_data, src_data + (kw - plw),
+                              sizeof(T) * output_width);
+                  dst_data = dst_data + col_matrix_width;
+                }
+                int i = 1;
+                for (int kw = filter_width - prw; kw < filter_width;
+                     ++kw, ++i) {
+                  std::memcpy(dst_data, src_data + (kw - plw),
+                              sizeof(T) * (output_width - i));
+                  dst_data = dst_data + col_matrix_width;
+                }
+                src_data = src_data + im_width;
+              }
+            }
+          }
+        } else {
+          LOG(FATAL) << "Not implement yet";
+        }
         return;
       }
     }

From ab72d28a5ec3efd4243df8c7cd3370b9354e009f Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Thu, 26 Jul 2018 15:02:51 +0800
Subject: [PATCH 10/40] clean up and correctness check

---
 doc/fluid/design/ir/draft.md                  |  10 +-
 .../details/multi_devices_graph_builder.cc    | 103 ++++++++++--------
 .../details/multi_devices_graph_builder.h     |   4 +-
 .../framework/details/ssa_graph_builder.cc    |  13 ++-
 .../framework/details/ssa_graph_builder.h     |   4 +
 .../framework/details/ssa_graph_checker.cc    |  12 +-
 .../framework/details/ssa_graph_checker.h     |   4 +-
 .../framework/details/ssa_graph_printer.cc    |   6 +-
 .../framework/details/ssa_graph_printer.h     |   4 +-
 .../details/threaded_ssa_graph_executor.cc    |   8 +-
 paddle/fluid/framework/ir/graph.h             |   8 +-
 paddle/fluid/framework/ir/graph_viz_pass.cc   |   8 +-
 paddle/fluid/framework/ir/graph_viz_pass.h    |   4 +-
 paddle/fluid/framework/ir/pass.cc             |  16 +++
 paddle/fluid/framework/ir/pass.h              |  70 +++++++++---
 paddle/fluid/framework/parallel_executor.cc   |   2 +-
 16 files changed, 184 insertions(+), 92 deletions(-)

diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md
index 65bfaea6a1..e141ce0959 100644
--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
@@ -75,7 +75,12 @@ can also fuse some `Graph`'s `Node`s.
 class Pass {
  public:
 
-  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+    // Some correctness check.
+    auto new_graph = ApplyImpl(std::move(graph));
+    // Some correctness check.
+    return new_graph;
+  }
 
   // Get a reference to the attributed previously set.
   template <typename AttrType>
@@ -89,6 +94,9 @@ class Pass {
   // should delete the attribute.
   template <typename AttrType>
   void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
 };
 
 // In my_pass.cc
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index ff90f31cdb..b63c2f695a 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -34,16 +34,22 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+static const char kLossVarName[] = "loss_var_name";
+static const char kPlaces[] = "places";
+static const char kParams[] = "params";
+static const char kLocalScopes[] = "local_scopes";
+static const char kStrategy[] = "strategy";
+
 void MultiDevSSAGraphBuilder::Init() const {
-  loss_var_name_ = Get<const std::string>("loss_var_name");
-  places_ = Get<const std::vector<platform::Place>>("places");
-  local_scopes_ = Get<const std::vector<Scope *>>("local_scopes");
-  strategy_ = Get<const BuildStrategy>("strategy");
+  loss_var_name_ = Get<const std::string>(kLossVarName);
+  places_ = Get<const std::vector<platform::Place>>(kPlaces);
+  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
+  strategy_ = Get<const BuildStrategy>(kStrategy);
 #ifdef PADDLE_WITH_CUDA
   nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
 
-  for (auto &p : Get<const std::unordered_set<std::string>>("params")) {
+  for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
@@ -58,7 +64,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
                                                 ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
 
@@ -225,7 +231,7 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
   return sorted_ret;
 }
 
-std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
+std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   Init();
   // Give the topology sort order and rebuild the graph structure.
@@ -241,10 +247,10 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
   std::unordered_set<std::string> og_has_been_broadcast;
 
   // We cannot invoke resize. It is a bug of GCC 4.8
-  result.Set("vars", new GraphVars(places_.size()));
-  result.Set("dep_vars", new GraphDepVars);
-  result.Set("ops", new GraphOps);
-  result.Set("sharded_var_device", new ShardedVarDevice);
+  result.Set(kGraphVars, new GraphVars(places_.size()));
+  result.Set(kGraphDepVars, new GraphDepVars);
+  result.Set(kGraphOps, new GraphOps);
+  result.Set(kShardedVarDevice, new ShardedVarDevice);
 
   // find send/recv vars so that we can place the distributed training
   // realted op in the place 0
@@ -281,7 +287,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
       if (op_dev_id != -1) {  // This op only runs on one specific device.
         CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
-          graph->Get<ShardedVarDevice>("sharded_var_device")
+          graph->Get<ShardedVarDevice>(kShardedVarDevice)
               .emplace(n->Name(), op_dev_id);
         }
       } else {
@@ -319,7 +325,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
                   case BuildStrategy::ReduceStrategy::kReduce:
                     cur_device_id = GetAppropriateDeviceID({g_name});
                     CreateReduceOp(&result, g_name, cur_device_id);
-                    graph->Get<ShardedVarDevice>("sharded_var_device")
+                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
                         .emplace(g_name, cur_device_id);
                     bcast_var_name_set[cur_device_id].emplace(p_name);
                     break;
@@ -406,16 +412,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_);
 #endif
-  result->Get<GraphOps>("ops").emplace_back(op_handle);
+  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
   auto *in =
-      result->Get<GraphVars>("vars").at(src_dev_id).at(p_name).back().get();
+      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back().get();
   op_handle->AddInput(in);
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars").at(i).at(p_name);
+    auto &vars = result->Get<GraphVars>(kGraphVars).at(i).at(p_name);
     auto *out_var = new VarHandle(
         result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
         i, p_name, p);
@@ -427,7 +433,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     ir::Node *node,
                                                     int dev_id) const {
-  result->Get<GraphOps>("ops").emplace_back(
+  result->Get<GraphOps>(kGraphOps).emplace_back(
       new ComputationOpHandle(result->CreateOpNode(node->Op()),
                               local_scopes_[dev_id], places_[dev_id]));
   CreateOpHandleIOs(result, node, dev_id);
@@ -436,20 +442,20 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
 void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
                                                 const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars")[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
@@ -465,20 +471,20 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
     ir::Graph *result, const std::vector<std::string> &datas) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
     for (const std::string &d_name : datas) {
-      auto &vars = result->Get<GraphVars>("vars")[i][d_name];
+      auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
       PADDLE_ENFORCE(!vars.empty());
       op_handle->AddInput(vars.back().get());
       auto var = new VarHandle(
@@ -524,7 +530,7 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
 
 int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
                                             const std::string &varname) const {
-  auto &sharded_var_device = graph.Get<ShardedVarDevice>("sharded_var_device");
+  auto &sharded_var_device = graph.Get<ShardedVarDevice>(kShardedVarDevice);
   auto got = sharded_var_device.find(varname);
   return got == sharded_var_device.end() ? -1 : got->second;
 }
@@ -544,7 +550,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
         result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
         local_scopes_.size(), local_scopes_[i], places_[i],
         communication_dev_ctx);
-    result->Get<GraphOps>("ops").emplace_back(op_handle);
+    result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
     // FIXME: Currently ScaleLossGradOp only use device_count as scale
     // factor. So it does not depend on any other operators.
@@ -565,7 +571,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>("ops").emplace_back(
+    result->Get<GraphOps>(kGraphOps).emplace_back(
         new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
     CreateOpHandleIOs(result, node, scope_idx);
   }
@@ -575,25 +581,25 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
                                                    const std::string &og,
                                                    int dst_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars")[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
   }
-  auto &vars = result->Get<GraphVars>("vars")[dst_dev_id][og];
+  auto &vars = result->Get<GraphVars>(kGraphVars)[dst_dev_id][og];
   auto var =
       new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
                     vars.size(), dst_dev_id, og, places_[dst_dev_id]);
@@ -606,11 +612,11 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
 // on it.
 void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
                                         const std::string &prev_op_name) const {
-  for (auto &prev_op : result->Get<GraphOps>("ops")) {
+  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
     if (prev_op->Name() == prev_op_name) {
       auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
       prev_op->AddOutput(dep_var);
-      result->Get<GraphDepVars>("dep_vars").emplace(dep_var);
+      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
       op->AddInput(dep_var);
     }
   }
@@ -635,18 +641,18 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>("sharded_var_device")
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
             .emplace(varname, op_dev_id);
       }
     }
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>("sharded_var_device")
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
           .emplace(varname, op_dev_id);
     }
   } else if (node->Op()->Type() == "concat") {
     op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>("sharded_var_device")
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
           .emplace(varname, op_dev_id);
     }
   } else {
@@ -661,7 +667,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
 
   CreateComputationalOp(result, node, op_dev_id);
   if (node->Op()->Type() == "concat") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
               "fetch_barrier");
   }
 }
@@ -687,7 +693,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       }
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>("sharded_var_device")
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
             .emplace(varname, op_dev_id);
       }
     }
@@ -698,7 +704,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     }
     op_dev_id = GetAppropriateDeviceID(output_var_names);
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>("sharded_var_device")
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
           .emplace(varname, op_dev_id);
     }
   } else {
@@ -709,17 +715,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
   PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                  node->Op()->Type());
 
-  result->Get<GraphOps>("ops").emplace_back(new RPCOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
       result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
       node->Op()->Type(), places_[op_dev_id]));
 
   if (node->Op()->Type() == "send_barrier") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
   } else if (node->Op()->Type() == "recv") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
               "send_barrier");
   } else if (node->Op()->Type() == "fetch_barrier") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "recv");
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
   } else if (node->Op()->Type() == "send") {
     // do nothing
   } else {
@@ -743,4 +749,9 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
 }  // namespace paddle
 
 REGISTER_PASS(multi_device_pass,
-              paddle::framework::details::MultiDevSSAGraphBuilder);
+              paddle::framework::details::MultiDevSSAGraphBuilder)
+    .RequirePassAttr(paddle::framework::details::kLossVarName)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kParams)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes)
+    .RequirePassAttr(paddle::framework::details::kStrategy);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index baea091af3..099dbe5abe 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -31,8 +31,8 @@ class Scope;
 namespace details {
 
 class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
- public:
-  std::unique_ptr<ir::Graph> Apply(
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override;
 
  private:
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 506e7eb35c..575532540a 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       if (name_pair.second.size() <= 1) {
         continue;
@@ -50,7 +50,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
           auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
           read_op->AddOutput(dep_var);
           write_op->AddInput(dep_var);
-          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
+          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
         }
       }
     }
@@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
     ir::Graph *graph, ir::Node *node, const platform::Place &place,
     size_t place_offset) {
-  auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
+  auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
   auto &var_holder = var_holders[node->Name()];
   VarHandle *var = nullptr;
   if (var_holder.empty()) {
@@ -83,7 +83,8 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                                      ir::Node *new_node,
                                      const platform::Place &place,
                                      size_t place_offset) {
-  auto &vars = graph->Get<GraphVars>("vars")[place_offset][new_node->Name()];
+  auto &vars =
+      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
   size_t version = vars.size();
   auto var =
       new VarHandle(new_node, version, place_offset, new_node->Name(), place);
@@ -92,12 +93,12 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
 }
 
 void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
-  for (auto &op : graph->Get<GraphOps>("ops")) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (!op->Outputs().empty()) {
       continue;
     }
     auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<GraphDepVars>("dep_vars").emplace(dummy_leaf);
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
     op->AddOutput(dummy_leaf);
   }
 }
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index e0ad027315..53a4ad003d 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -39,15 +39,19 @@ namespace details {
 typedef std::vector<
     std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
     GraphVars;
+const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
 typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+const char kGraphDepVars[] = "dep_vars";
 
 // all operators. NOTE that even we use a vector here, the operators is
 // unordered.
 typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
+const char kGraphOps[] = "ops";
 
 typedef std::unordered_map<std::string, int> ShardedVarDevice;
+const char kShardedVarDevice[] = "sharded_var_device";
 
 class SSAGraphBuilder : public ir::Pass {
  public:
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
index 2994329f48..b9e1cda1f2 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
     }
   };
 
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
     }
   }
 
-  for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
+  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
     insert_pending_var(var.get());
   }
 
-  for (auto &op : graph->Get<GraphOps>("ops")) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (op->Inputs().empty()) {
       ready_ops.insert(op.get());
     } else {
@@ -87,4 +87,8 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
 }  // namespace paddle
 
 REGISTER_PASS(multi_device_check_pass,
-              paddle::framework::details::SSAGraghBuilderWithChecker);
+              paddle::framework::details::SSAGraghBuilderWithChecker)
+    .RequireGraphAttr(paddle::framework::details::kGraphVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphOps)
+    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 25891cf74d..0e861ecb23 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -23,8 +23,8 @@ namespace framework {
 namespace details {
 
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
- public:
-  std::unique_ptr<ir::Graph> Apply(
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override {
     PADDLE_ENFORCE(IsValidGraph(graph.get()));
     return graph;
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
index 95d0641d72..ec3f31ab8d 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -22,7 +22,7 @@ namespace details {
 
 template <typename Callback>
 static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
-  for (auto &each : graph.Get<GraphVars>("vars")) {
+  for (auto &each : graph.Get<GraphVars>(kGraphVars)) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
         callback(*pair2);
@@ -30,7 +30,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
     }
   }
 
-  for (auto &var : graph.Get<GraphDepVars>("dep_vars")) {
+  for (auto &var : graph.Get<GraphDepVars>(kGraphDepVars)) {
     callback(*var);
   }
 }
@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
   });
 
   size_t op_id = 0;
-  for (auto &op : graph.Get<GraphOps>("ops")) {
+  for (auto &op : graph.Get<GraphOps>(kGraphOps)) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index bd4498c061..5eafd1805c 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -36,8 +36,8 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 };
 
 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
- public:
-  std::unique_ptr<ir::Graph> Apply(
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override {
     std::unique_ptr<std::ostream> fout(
         new std::ofstream(Get<const std::string>("debug_graphviz_path")));
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c19f74476f..eec4050733 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -45,18 +45,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   std::unordered_set<OpHandleBase *> delayed_ops;
 
   // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
       }
     }
   }
-  for (auto &var : graph_->Get<details::GraphDepVars>("dep_vars")) {
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
     InsertPendingVar(&pending_vars, &ready_vars, var.get());
   }
 
-  for (auto &op : graph_->Get<details::GraphOps>("ops")) {
+  for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
@@ -162,7 +162,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
 
   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 49f39df4b9..78094e46fb 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -40,10 +40,14 @@ class Graph {
     attr_dels_.clear();
   }
 
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
-    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
-                   "%s attr not registered for graph.", attr_name);
+    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
+                   attr_name);
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 7d1cff7178..8cb812d138 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -20,10 +20,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+static const char kGraphVizPath[] = "graph_viz_path";
 
-std::unique_ptr<ir::Graph> GraphVizPass::Apply(
+std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
-  const std::string graph_viz_path = Get<std::string>("graph_viz_path");
+  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
   std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
   PADDLE_ENFORCE(fout->good());
   std::ostream& sout = *fout;
@@ -67,4 +68,5 @@ std::unique_ptr<ir::Graph> GraphVizPass::Apply(
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass);
+REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
+    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 04c0c35d12..1fd8c8a26e 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -28,8 +28,8 @@ namespace framework {
 namespace ir {
 
 class GraphVizPass : public Pass {
- public:
-  std::unique_ptr<ir::Graph> Apply(
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override;
 };
 
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index 0e68ecb56f..2ebc3c7430 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -17,6 +17,22 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
+  for (const std::string& attr : required_pass_attrs_) {
+    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
+                   "Required pass atrribute %s not registered.", attr);
+  }
+  for (const std::string& attr : required_graph_attrs_) {
+    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not exist.",
+                   attr);
+  }
+  auto applied_graph = ApplyImpl(std::move(graph));
+  // TODO(panyx0718): Add more verifications.
+  PADDLE_ENFORCE(!HasCircle(*applied_graph),
+                 "Illegal Pass. Generated graph shouldn't has cycle.");
+  return applied_graph;
+}
+
 PassRegistry& PassRegistry::Instance() {
   static PassRegistry g_pass_info_map;
   return g_pass_info_map;
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index f254ef62df..3f65794fab 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/variant.h"
@@ -26,6 +27,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+template <typename PassType>
+struct PassRegistrar;
 
 class Pass {
  public:
@@ -40,7 +43,7 @@ class Pass {
     attr_dels_.clear();
   }
 
-  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
 
   // Get a reference to the attributed previously set.
   template <typename AttrType>
@@ -69,7 +72,25 @@ class Pass {
     attrs_[attr_name] = attr;
   }
 
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(
+      std::unique_ptr<Graph> graph) const = 0;
+
  private:
+  template <typename PassType>
+  friend struct PassRegistrar;
+
+  void RegisterRequiredPassAttrs(const std::unordered_set<std::string> &attrs) {
+    required_pass_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  void RegisterRequiredGraphAttrs(
+      const std::unordered_set<std::string> &attrs) {
+    required_graph_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
   std::map<std::string, boost::any> attrs_;
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };
@@ -119,10 +140,28 @@ struct PassRegistrar : public Registrar {
   explicit PassRegistrar(const char *pass_type) {
     PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                    "'%s' is registered more than once.", pass_type);
-    PassRegistry::Instance().Insert(pass_type, []() -> std::unique_ptr<Pass> {
-      return std::unique_ptr<Pass>(new PassType());
-    });
+    PassRegistry::Instance().Insert(
+        pass_type, [this]() -> std::unique_ptr<Pass> {
+          std::unique_ptr<Pass> pass(new PassType());
+          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
+          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          return pass;
+        });
   }
+
+  PassRegistrar<PassType> &RequirePassAttr(const std::string &attr) {
+    required_pass_attrs_.insert(attr);
+    return *this;
+  }
+
+  PassRegistrar<PassType> &RequireGraphAttr(const std::string &attr) {
+    required_graph_attrs_.insert(attr);
+    return *this;
+  }
+
+ private:
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
 };
 
 #define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
@@ -132,16 +171,19 @@ struct PassRegistrar : public Registrar {
                 msg)
 
 // Register a new pass that can be applied on the IR.
-#define REGISTER_PASS(pass_type, pass_class)                \
-  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                      \
-      __reg_pass__##pass_type,                              \
-      "REGISTER_PASS must be called in global namespace");  \
-  static ::paddle::framework::ir::PassRegistrar<pass_class> \
-      __pass_registrar_##pass_type##__(#pass_type);         \
-  int TouchPassRegistrar_##pass_type() {                    \
-    __pass_registrar_##pass_type##__.Touch();               \
-    return 0;                                               \
-  }
+#define REGISTER_PASS(pass_type, pass_class)                          \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __reg_pass__##pass_type,                                        \
+      "REGISTER_PASS must be called in global namespace");            \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      __pass_registrar_##pass_type##__(#pass_type);                   \
+  int TouchPassRegistrar_##pass_type() {                              \
+    __pass_registrar_##pass_type##__.Touch();                         \
+    return 0;                                                         \
+  }                                                                   \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \
+          __pass_registrar_##pass_type##__
 
 #define USE_PASS(pass_type)                                           \
   STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 77bed5c999..112b48ca31 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices(
     if (member_->executor_) {
       auto &sharded_var_device =
           member_->executor_->Graph().Get<details::ShardedVarDevice>(
-              "sharded_var_device");
+              details::kShardedVarDevice);
       if (sharded_var_device.find(var) != sharded_var_device.end()) {
         var_dev_id = sharded_var_device.at(var);
       }

From 12e9bf6c178aa1eb3d7839a2d28fe0c2a98dbead Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Thu, 26 Jul 2018 15:05:08 +0800
Subject: [PATCH 11/40] clean up

---
 .../operators/distributed/send_recv.proto     | 97 -------------------
 .../unittests/parallel_executor_test_base.py  |  1 -
 .../unittests/test_parallel_executor_mnist.py | 42 ++++----
 3 files changed, 19 insertions(+), 121 deletions(-)
 delete mode 100644 paddle/fluid/operators/distributed/send_recv.proto

diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto
deleted file mode 100644
index d0595ef108..0000000000
--- a/paddle/fluid/operators/distributed/send_recv.proto
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
-the Apache License, Version 2.0 (the "License"); you may not use this file
-except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto3";
-package sendrecv;
-
-option cc_generic_services = false;
-
-service SendRecvService {
-  // For parameter server round-robin like hashing, do not split tensors.
-  // Send and recv only one tensor
-  // TODO(typhoonzero): add streaming API
-  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
-  // Argument VariableMessage for GetVariable should only contain varname.
-  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  // pre-fetch variable by given variable name and Ids
-  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
-
-  rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
-}
-
-// VariableMessage is serialized paddle variable message.
-// It can be:
-// LoDTensor
-// SelectedRows
-enum VarType {
-  LOD_TENSOR = 0;
-  SELECTED_ROWS = 1;
-  NCCL_ID = 2;
-}
-
-// NOTICE(gongwb):don't modify this proto if you are not
-//   not familar with how we serialize in sendrecvop_utils.h
-//   and deserilize it in  variable_response.h.
-message VariableMessage {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-  }
-
-  message LodData { repeated int64 lod_data = 1; }
-  string varname = 1;
-  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
-  VarType type = 2;
-  // bool persistable is not needed for sending.
-  // tensor info:
-  Type data_type = 3;
-  repeated int64 dims = 4;
-
-  // lod details:
-  int64 lod_level = 5;
-  repeated LodData lod = 6;
-  // selected_rows height, aka. original dim0
-  int64 slr_height = 7;
-  // tensor data
-  bytes serialized = 8;
-  // selected_rows data
-  bytes rows = 9;
-  // Look up table block execution output variable name.
-  string out_varname = 10;
-  // If 1, the ps server will start profiling, the ps
-  // server stops profiling and generates a profile to /tmp/profile_ps_*
-  // when profile switches from 1 to 2.
-  int64 profile = 11;
-}
-
-message VoidMessage {}
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index c5e69e41be..fcf86cc583 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -71,7 +71,6 @@ class TestParallelExecutorBase(unittest.TestCase):
             exec_strategy.allow_op_delay = allow_op_delay
 
             build_strategy = fluid.BuildStrategy()
-            build_strategy.debug_graphviz_path = "/tmp/graphviz"
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index d740eb5443..76389d916f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -152,6 +152,16 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             use_reduce=use_reduce)
 
+    def test_simple_fc(self):
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+        self.check_simple_fc_convergence(False)
+
+    def test_simple_fc_with_new_strategy(self):
+        # use_cuda, use_reduce
+        self._compare_reduce_and_allreduce(simple_fc_net, True)
+        self._compare_reduce_and_allreduce(simple_fc_net, False)
+
     def check_simple_fc_parallel_accuracy(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -178,6 +188,10 @@ class TestMNIST(TestParallelExecutorBase):
         for p_l in parallel_last_loss:
             self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
 
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(True)
+        self.check_simple_fc_parallel_accuracy(False)
+
     def check_batchnorm_fc_convergence(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -192,31 +206,13 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda)
 
-    def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-        self.check_network_convergence(
-            fc_with_batchnorm, use_cuda=use_cuda, use_reduce=False)
-        """
-        img, label = self._init_data()
-
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=False)
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=True)
-        """
+    def test_batchnorm_fc(self):
+        self.check_batchnorm_fc_convergence(True)
+        self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self.check_batchnorm_fc_convergence_use_reduce(True)
-        # self.check_batchnorm_fc_convergence_use_reduce(False)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
 
 
 if __name__ == '__main__':

From 507c14304788e3cf5160db354dc23bdc9e725ff0 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 17:27:17 +0800
Subject: [PATCH 12/40] im2col cfo cpu code clean

---
 paddle/fluid/operators/math/im2col.cc        | 203 +-------------
 paddle/fluid/operators/math/im2col_cfo_cpu.h | 265 +++++++++++++++++++
 2 files changed, 270 insertions(+), 198 deletions(-)
 create mode 100644 paddle/fluid/operators/math/im2col_cfo_cpu.h

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index be373c99d1..478900e1c2 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 namespace paddle {
 namespace operators {
@@ -35,210 +36,16 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col->dims().size() == 5);
 
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int output_height = col->dims()[3];
-    int output_width = col->dims()[4];
-
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-    // TODO(TJ): change me to template
-    // further optimize: padding == 1 need special
     if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
         dilation[1] == 1) {
-      int col_matrix_width = output_width * output_height;
-      int im_size = im_height * im_width;
       if (padding[0] == 0 && padding[1] == 0) {
-        size_t copy_size = sizeof(T) * output_width;
-        for (int oh = 0; oh < output_height; ++oh) {
-          const T* im_data_start = im_data + oh * im_width;
-          T* dst_data = col_data + oh * output_width;
-          for (int ic = 0; ic < im_channels; ++ic) {
-            const T* src_data = im_data_start + ic * im_size;
-            for (int kh = 0; kh < filter_height; ++kh) {
-              for (int kw = 0; kw < filter_width; ++kw) {
-                std::memcpy(dst_data, src_data + kw, copy_size);
-                dst_data = dst_data + col_matrix_width;
-              }
-              src_data = src_data + im_width;
-            }
-          }
-        }
-        return;
+        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
       } else {
-        int plh = padding[0];
-        int plw = padding[1];
-        int prh =
-            (output_height - 1) * stride[0] + filter_height - im_height - plh;
-        int prw =
-            (output_width - 1) * stride[1] + filter_width - im_width - plw;
-
-        // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
-        // TODO(TJ): refine ph*xxx
-        assert(plh == prh);  // because stride_h == 1
-        int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
-        int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
-        for (int ph = 0; ph < plh; ++ph) {
-          int sz = output_width * (plh - ph);
-          size_t copy_sz = sizeof(T) * sz;
-          T* col_start_l = col_data + ph * col_block_fh;
-          T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh +
-                           col_matrix_width - sz;
-          for (int ic = 0; ic < im_channels; ++ic) {
-            T* dst_data_l = col_start_l + ic * col_block_ic;
-            T* dst_data_r = col_start_r + ic * col_block_ic;
-            for (int kw = 0; kw < filter_width; ++kw) {
-              std::memset(dst_data_l, 0, copy_sz);
-              std::memset(dst_data_r, 0, copy_sz);
-              dst_data_l = dst_data_l + col_matrix_width;
-              dst_data_r = dst_data_r + col_matrix_width;
-            }
-          }
-        }
-
-        // fill width padding
-        assert(plw == prw);  // because stride_w == 1
-        if (plw == 1) {
-          auto pad = static_cast<T>(0);  // padding zero
-          for (int ic = 0; ic < im_channels; ++ic) {
-            // TODO(TJ): use add and resue stride
-            T* dst_data_ic = col_data + ic * col_block_ic;
-            for (int kh = 0; kh < filter_height; ++kh) {
-              T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-              for (T* dst_data :
-                   {dst_data_kh, dst_data_kh +
-                                     (filter_width - prw) * col_matrix_width +
-                                     output_width - 1}) {
-                // TODO(TJ): from plh, saving repeated assignment
-                for (int oh = 0; oh < output_height; ++oh) {
-                  *dst_data = pad;
-                  dst_data = dst_data + output_width;
-                }
-              }
-            }
-          }
-        } else {
-          // padding_size > 1
-          for (int ic = 0; ic < im_channels; ++ic) {
-            // TODO(TJ): use add and resue stride
-            T* dst_data_ic = col_data + ic * col_block_ic;
-            for (int kh = 0; kh < filter_height; ++kh) {
-              T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-              for (int kw = 0; kw < plw; ++kw) {
-                // TODO(TJ): reuse array outside this for
-                size_t sz = sizeof(T) * (plw - kw);
-                T* dst_data = dst_data_kh + kw * col_matrix_width;
-                // TODO(TJ): from plh, saving repeated assignment
-                for (int oh = 0; oh < output_height; ++oh) {
-                  std::memset(dst_data, 0, sz);
-                  dst_data = dst_data + output_width;
-                }
-              }
-              // TODO(TJ): use reverse to save cache
-              for (int kw = 0; kw < prw; ++kw) {
-                // TODO(TJ): reuse array outside this for
-                auto num = (prw - kw);
-                size_t sz = sizeof(T) * num;
-                T* dst_data = dst_data_kh +
-                              (filter_width - 1 - kw) * col_matrix_width +
-                              output_width - num;
-                // TODO(TJ): from plh, saving repeated assignment
-                for (int oh = 0; oh < output_height; ++oh) {
-                  std::memset(dst_data, 0, sz);
-                  dst_data = dst_data + output_width;
-                }
-              }
-            }
-          }
-        }
-
-        // fill im_data
-        // padding cover two cases:
-        // 1. kw > 2*pw: kw = 3, pw = 1
-        // 0 x x x x ... x x x x 0
-        // 1 1 1             1 1 1
-        // ==>
-        // 0 x ... x x
-        // x x ... x x
-        // x x ... x 0
-        // 2. kw < 2*pw: kw = 3, pw = 2
-        // 0 0 x x x ... x x x 0 0
-        // 1 1 1             1 1 1
-        // ==>
-        // 0 0 x ... x x x
-        // 0 x x ... x x 0
-        // x x x ... x 0 0
-
-        // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
-        // (output_width-1)}
-        // length of copy_size is equal kw.
-        if (plw + prw < filter_width) {
-          for (int oh = 0; oh < output_height; ++oh) {
-            const T* im_data_start =
-                im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
-            T* dst_data = col_data + oh * output_width;
-            for (int ic = 0; ic < im_channels; ++ic) {
-              const T* src_data = im_data_start + ic * im_size;
-              for (int kh = 0; kh < filter_height; ++kh) {
-                if ((oh < plh && kh < plh) ||
-                    (oh > (output_height - prh - 1) &&
-                     kh > (filter_height - prh - 1))) {
-                  dst_data = dst_data + filter_width * col_matrix_width;
-                  continue;
-                }
-                // TODO(TJ): reuse plw-kw outside this for
-                // try to unify
-                for (int kw = 0; kw < plw; ++kw) {
-                  std::memcpy(dst_data + (plw - kw), src_data,
-                              sizeof(T) * (output_width - (plw - kw)));
-                  dst_data = dst_data + col_matrix_width;
-                }
-                for (int kw = plw; kw < filter_width - prw; ++kw) {
-                  std::memcpy(dst_data, src_data + (kw - plw),
-                              sizeof(T) * output_width);
-                  dst_data = dst_data + col_matrix_width;
-                }
-                int i = 1;
-                for (int kw = filter_width - prw; kw < filter_width;
-                     ++kw, ++i) {
-                  std::memcpy(dst_data, src_data + (kw - plw),
-                              sizeof(T) * (output_width - i));
-                  dst_data = dst_data + col_matrix_width;
-                }
-                src_data = src_data + im_width;
-              }
-            }
-          }
-        } else {
-          LOG(FATAL) << "Not implement yet";
-        }
-        return;
-      }
-    }
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < output_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < output_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * output_height + h) * output_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
+        im2col_sh1sw1dh1dw1<T>(im, padding, col);
       }
+      return;
     }
+    im2col_common<T>(im, dilation, stride, padding, col);
   }
 };
 
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
new file mode 100644
index 0000000000..ebdb062b7a
--- /dev/null
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -0,0 +1,265 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * The most common im2col algorithm.
+ * Support dilation, stride and padding.
+ */
+template <typename T>
+inline void im2col_common(const framework::Tensor& im,
+                          const std::vector<int>& dilation,
+                          const std::vector<int>& stride,
+                          const std::vector<int>& padding,
+                          framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+  int channels_col = im_channels * filter_height * filter_width;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % filter_width;
+    int h_offset = (c / filter_width) % filter_height;
+    int c_im = c / (filter_width * filter_height);
+    for (int h = 0; h < output_height; ++h) {
+      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+      for (int w = 0; w < output_width; ++w) {
+        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+        int col_idx = (c * output_height + h) * output_width + w;
+        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
+        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
+                             im_col_idx < 0 || im_col_idx >= im_width)
+                                ? static_cast<T>(0)
+                                : im_data[im_idx];
+      }
+    }
+  }
+}
+
+/*
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 0
+ * */
+template <typename T>
+inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
+                                      framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int col_matrix_width = output_width * output_height;
+  int im_size = im_height * im_width;
+  size_t copy_size = sizeof(T) * output_width;
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* im_data_start = im_data + oh * im_width;
+    T* dst_data = col_data + oh * output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = im_data_start + ic * im_size;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        for (int kw = 0; kw < filter_width; ++kw) {
+          std::memcpy(dst_data, src_data + kw, copy_size);
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+    }
+  }
+}
+
+// further optimize: padding == 1 need special
+template <typename T>
+inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
+                                const std::vector<int>& padding,
+                                framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+  const int sh = 1;
+  const int sw = 1;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int col_matrix_width = output_width * output_height;
+  int im_size = im_height * im_width;
+
+  int plh = padding[0];
+  int plw = padding[1];
+  int prh = (output_height - 1) * sh + filter_height - im_height - plh;
+  int prw = (output_width - 1) * sw + filter_width - im_width - plw;
+
+  // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
+  // TODO(TJ): refine ph*xxx
+  assert(plh == prh);                                  // because stride_h == 1
+  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
+  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
+  for (int ph = 0; ph < plh; ++ph) {
+    int sz = output_width * (plh - ph);
+    size_t copy_sz = sizeof(T) * sz;
+    T* col_start_l = col_data + ph * col_block_fh;
+    T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh +
+                     col_matrix_width - sz;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      T* dst_data_l = col_start_l + ic * col_block_ic;
+      T* dst_data_r = col_start_r + ic * col_block_ic;
+      for (int kw = 0; kw < filter_width; ++kw) {
+        std::memset(dst_data_l, 0, copy_sz);
+        std::memset(dst_data_r, 0, copy_sz);
+        dst_data_l = dst_data_l + col_matrix_width;
+        dst_data_r = dst_data_r + col_matrix_width;
+      }
+    }
+  }
+
+  // fill width padding
+  assert(plw == prw);  // because stride_w == 1
+  if (plw == 1) {
+    auto pad = static_cast<T>(0);  // padding zero
+    for (int ic = 0; ic < im_channels; ++ic) {
+      // TODO(TJ): use add and resue stride
+      T* dst_data_ic = col_data + ic * col_block_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
+        for (T* dst_data :
+             {dst_data_kh, dst_data_kh +
+                               (filter_width - prw) * col_matrix_width +
+                               output_width - 1}) {
+          // TODO(TJ): from plh, saving repeated assignment
+          for (int oh = 0; oh < output_height; ++oh) {
+            *dst_data = pad;
+            dst_data = dst_data + output_width;
+          }
+        }
+      }
+    }
+  } else {
+    // padding_size > 1
+    for (int ic = 0; ic < im_channels; ++ic) {
+      // TODO(TJ): use add and resue stride
+      T* dst_data_ic = col_data + ic * col_block_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
+        for (int kw = 0; kw < plw; ++kw) {
+          // TODO(TJ): reuse array outside this for
+          size_t sz = sizeof(T) * (plw - kw);
+          T* dst_data = dst_data_kh + kw * col_matrix_width;
+          // TODO(TJ): from plh, saving repeated assignment
+          for (int oh = 0; oh < output_height; ++oh) {
+            std::memset(dst_data, 0, sz);
+            dst_data = dst_data + output_width;
+          }
+        }
+        // TODO(TJ): use reverse to save cache
+        for (int kw = 0; kw < prw; ++kw) {
+          // TODO(TJ): reuse array outside this for
+          auto num = (prw - kw);
+          size_t sz = sizeof(T) * num;
+          T* dst_data = dst_data_kh +
+                        (filter_width - 1 - kw) * col_matrix_width +
+                        output_width - num;
+          // TODO(TJ): from plh, saving repeated assignment
+          for (int oh = 0; oh < output_height; ++oh) {
+            std::memset(dst_data, 0, sz);
+            dst_data = dst_data + output_width;
+          }
+        }
+      }
+    }
+  }
+
+  // fill im_data
+  // padding cover two cases:
+  // 1. kw > 2*pw: kw = 3, pw = 1
+  // 0 x x x x ... x x x x 0
+  // 1 1 1             1 1 1
+  // ==>
+  // 0 x ... x x
+  // x x ... x x
+  // x x ... x 0
+  // 2. kw < 2*pw: kw = 3, pw = 2
+  // 0 0 x x x ... x x x 0 0
+  // 1 1 1             1 1 1
+  // ==>
+  // 0 0 x ... x x x
+  // 0 x x ... x x 0
+  // x x x ... x 0 0
+
+  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+  // (output_width-1)}
+  // length of copy_size is equal kw.
+  if (plw + prw < filter_width) {
+    for (int oh = 0; oh < output_height; ++oh) {
+      const T* im_data_start =
+          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+      T* dst_data = col_data + oh * output_width;
+      for (int ic = 0; ic < im_channels; ++ic) {
+        const T* src_data = im_data_start + ic * im_size;
+        for (int kh = 0; kh < filter_height; ++kh) {
+          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                         kh > (filter_height - prh - 1))) {
+            dst_data = dst_data + filter_width * col_matrix_width;
+            continue;
+          }
+          // TODO(TJ): reuse plw-kw outside this for
+          // try to unify
+          for (int kw = 0; kw < plw; ++kw) {
+            std::memcpy(dst_data + (plw - kw), src_data,
+                        sizeof(T) * (output_width - (plw - kw)));
+            dst_data = dst_data + col_matrix_width;
+          }
+          for (int kw = plw; kw < filter_width - prw; ++kw) {
+            std::memcpy(dst_data, src_data + (kw - plw),
+                        sizeof(T) * output_width);
+            dst_data = dst_data + col_matrix_width;
+          }
+          int i = 1;
+          for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
+            std::memcpy(dst_data, src_data + (kw - plw),
+                        sizeof(T) * (output_width - i));
+            dst_data = dst_data + col_matrix_width;
+          }
+          src_data = src_data + im_width;
+        }
+      }
+    }
+  } else {
+    LOG(FATAL) << "Not implement yet";
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle

From 8d6be4fb5f2ad665b5cfa06c7573bd1cc895e3fb Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 18:08:36 +0800
Subject: [PATCH 13/40] refine im2col test and add benchmark

---
 paddle/fluid/operators/math/im2col_test.cc | 140 +++++++++++----------
 1 file changed, 73 insertions(+), 67 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index db61f68db3..b22d21a9a3 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
+#include <sys/time.h>
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 template <typename DeviceContext, typename Place>
 void testIm2col() {
@@ -160,82 +162,86 @@ void testIm2col() {
   delete context;
 }
 
+TEST(math, im2col) {
+  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
+#ifdef PADDLE_WITH_CUDA
+  testIm2col<paddle::platform::CUDADeviceContext,
+             paddle::platform::CUDAPlace>();
+#endif
+}
+
+#define PREPARE_IM2COL_CPU                                                   \
+  paddle::platform::CPUPlace place;                                          \
+  paddle::platform::CPUDeviceContext context(place);                         \
+  paddle::framework::Tensor input;                                           \
+  paddle::framework::Tensor out;                                             \
+  paddle::framework::Tensor ref;                                             \
+  std::vector<int> padding({ph, pw});                                        \
+  std::vector<int> stride({1, 1});                                           \
+  std::vector<int> dilation({1, 1});                                         \
+  float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place);         \
+  for (int i = 0; i < input.numel(); ++i) {                                  \
+    input_ptr[i] = static_cast<float>(i + 1);                                \
+  }                                                                          \
+  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;            \
+  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;             \
+  out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  paddle::operators::math::Im2ColFunctor<                                    \
+      paddle::operators::math::ColFormat::kCFO,                              \
+      paddle::platform::CPUDeviceContext, float>                             \
+      im2col
+
 void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
-  paddle::framework::Tensor input;
-  paddle::framework::Tensor output;
-  paddle::framework::Tensor ref_output;
-  std::vector<int> padding({ph, pw});
-  std::vector<int> stride({1, 1});    // stride_y, stride_x
-  std::vector<int> dilation({1, 1});  // dilation_y, dilation_x
-  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;
-  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;
-  float* input_ptr =
-      input.mutable_data<float>({ic, ih, iw}, paddle::platform::CPUPlace());
-  for (int i = 0; i < input.numel(); ++i) {
-    input_ptr[i] = static_cast<float>(i + 1);
+  PREPARE_IM2COL_CPU;
+
+  im2col(context, input, dilation, stride, padding, &out);
+  paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                padding, &ref);
+
+  float* ref_data = ref.data<float>();
+  float* out_data = out.data<float>();
+  for (int i = 0; i < out.numel(); ++i) {
+    EXPECT_EQ(out_data[i], ref_data[i]);
   }
+}
 
-  paddle::platform::CPUPlace place;
-  paddle::platform::CPUDeviceContext context(place);
-  output.mutable_data<float>({ic, fh, fw, output_height, output_width}, place);
-  ref_output.mutable_data<float>({ic, fh, fw, output_height, output_width},
-                                 place);
-  paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kCFO,
-      paddle::platform::CPUDeviceContext, float>
-      im2col;
-  im2col(context, input, dilation, stride, padding, &output);
-  auto ref_im2col = [&](
-      const paddle::framework::Tensor& im, const std::vector<int>& dilation,
-      const std::vector<int>& stride, const std::vector<int>& padding,
-      paddle::framework::Tensor* col) {
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int output_height = col->dims()[3];
-    int output_width = col->dims()[4];
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const float* im_data = im.data<float>();
-    float* col_data = col->data<float>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < output_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < output_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * output_height + h) * output_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? 0.f
-                                  : im_data[im_idx];
-        }
-      }
-    }
+void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+  constexpr int repeat = 30;
+  auto GetCurrentMs = []() -> double {
+    struct timeval time;
+    gettimeofday(&time, NULL);
+    return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
   };
+  auto t1 = GetCurrentMs();
+  for (int i = 0; i < repeat; ++i) {
+    im2col(context, input, dilation, stride, padding, &out);
+  }
+  auto t2 = GetCurrentMs();
 
-  ref_im2col(input, dilation, stride, padding, &ref_output);
-
-  float* out_cfo_ptr = output.data<float>();
-  float* out_ref_ptr = ref_output.data<float>();
-  for (int i = 0; i < output.numel(); ++i) {
-    EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]);
+  for (int i = 0; i < repeat; ++i) {
+    paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                  padding, &ref);
   }
+  auto t3 = GetCurrentMs();
+
+  LOG(INFO) << "before: " << (t3 - t2) / repeat
+            << ",after: " << (t2 - t1) / repeat;
 }
 
-TEST(math, im2col) {
-  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
-  testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0,
+TEST(math, im2col_cputest) {
+  testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0,
                 /*pw*/ 0);
   testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
                 /*pw*/ 1);
-#ifdef PADDLE_WITH_CUDA
-  testIm2col<paddle::platform::CUDADeviceContext,
-             paddle::platform::CUDAPlace>();
-#endif
+
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
+              /*pw*/ 1);
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0,
+              /*pw*/ 0);
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1,
+              /*pw*/ 1);
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0,
+              /*pw*/ 0);
 }

From 3017f4607642e0f124e6a1f12f310f853a9536b5 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 21:47:10 +0800
Subject: [PATCH 14/40] add more test cases

---
 paddle/fluid/operators/math/im2col_cfo_cpu.h |  4 +-
 paddle/fluid/operators/math/im2col_test.cc   | 40 +++++++++++++++-----
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
index ebdb062b7a..af581f3218 100644
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -108,8 +108,8 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
   int filter_width = col->dims()[2];
   int output_height = col->dims()[3];
   int output_width = col->dims()[4];
-  const int sh = 1;
-  const int sw = 1;
+  constexpr int sh = 1;
+  constexpr int sw = 1;
 
   const T* im_data = im.data<T>();
   T* col_data = col->data<T>();
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index b22d21a9a3..7897be2ec3 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -208,7 +208,7 @@ void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
 
 void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
   PREPARE_IM2COL_CPU;
-  constexpr int repeat = 30;
+  constexpr int repeat = 100;
   auto GetCurrentMs = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -231,17 +231,39 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
 }
 
 TEST(math, im2col_cputest) {
-  testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0,
-                /*pw*/ 0);
-  testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
-                /*pw*/ 1);
+  // padding_h == padding_w
+  for (int p = 0; p < 4; ++p) {
+    // width == height
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
+                  /*pw*/ p);
 
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
-              /*pw*/ 1);
+    // height != width
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+
+    // filter == 1
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+  }
+  // padding_h != padding_w
+  testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
+                /*pw*/ 2);
+
+  // benchmark
+  LOG(INFO) << "padding == 0";
   benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0,
               /*pw*/ 0);
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1,
-              /*pw*/ 1);
   benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0,
               /*pw*/ 0);
+  LOG(INFO) << "padding == 1";
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
+              /*pw*/ 1);
+  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1,
+              /*pw*/ 1);
 }

From 52eb86e30f6739a0d5ca115507b90f48bc8b89fd Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 26 Jul 2018 23:19:54 +0800
Subject: [PATCH 15/40] refine im2col benchmark

---
 paddle/fluid/operators/math/im2col_test.cc | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 7897be2ec3..789d8e684a 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -256,14 +256,11 @@ TEST(math, im2col_cputest) {
                 /*pw*/ 2);
 
   // benchmark
-  LOG(INFO) << "padding == 0";
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 0,
-              /*pw*/ 0);
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 0,
-              /*pw*/ 0);
-  LOG(INFO) << "padding == 1";
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
-              /*pw*/ 1);
-  benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ 5, /*fw*/ 5, /*ph*/ 1,
-              /*pw*/ 1);
+  for (int p : {0, 1, 2}) {
+    for (int k : {3, 5}) {
+      LOG(INFO) << "padding == " << p << ", filter == " << k;
+      benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
+                  /*ph*/ p, /*pw*/ p);
+    }
+  }
 }

From 99c0c20468cb3b816a34e039a8d2f6021d907437 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 27 Jul 2018 09:48:57 +0800
Subject: [PATCH 16/40] add pass test

---
 doc/fluid/design/ir/draft.md                |  58 +++++++++-
 paddle/fluid/framework/ir/CMakeLists.txt    |   8 +-
 paddle/fluid/framework/ir/graph.h           |   3 +-
 paddle/fluid/framework/ir/pass.cc           |   8 +-
 paddle/fluid/framework/ir/pass.h            |  18 ++--
 paddle/fluid/framework/ir/pass_test.cc      | 112 ++++++++++++++++++++
 paddle/fluid/framework/parallel_executor.cc |  14 ++-
 7 files changed, 195 insertions(+), 26 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/pass_test.cc

diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md
index e141ce0959..c29337cba1 100644
--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
@@ -64,6 +64,41 @@ can also contain other things that describe some properties of
 the `Graph` or `Graph` nodes. `Attribute` can be passed
 across `Pass`. However, it should be used with care.
 
+```cpp
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc);
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc);
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it is no different from
+  // any other var as far as dependency analysis is concerned.
+  ir::Node *CreateControlDepVar();
+
+  // A more free-style way of creating a graph node. Mostly used for tests
+  // or to "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
 #### Pass
 
 `Pass` represents a transformation of `Graph`. Its input
@@ -101,13 +136,15 @@ class Pass {
 
 // In my_pass.cc
 class MyPass : public Pass {
- public:
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
     // do something.
     return graph;
   }
 }
-REGISTER_PASS(my_pass, MyPass);
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
 
 
 // To use the pass.
@@ -132,4 +169,17 @@ maintaining the original modeling logic.
 * Graph is transformed from raw model logic to a
 form that is efficient to execute.
 
-Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph));
+// For a more complex Pass, the optimization process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass->SetNotOwned<int>("optimize_level", 1);
+graph = mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_device_pass")->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_device_check_pass")->Apply(std::move(graph));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index a6bdd12b63..bf7d76a8a6 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,7 +1,9 @@
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
-cc_library(pass SRCS pass.cc DEPS graph node)
+cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry)
-cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry)
+
+cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
+cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
+cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 78094e46fb..c9d55fbf52 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -53,7 +53,8 @@ class Graph {
 
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+                   attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
       VLOG(3) << "deleting " << attr_name;
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index 2ebc3c7430..d7158eba62 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -13,23 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
+  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
+  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
   for (const std::string& attr : required_pass_attrs_) {
     PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
-                   "Required pass atrribute %s not registered.", attr);
+                   "Required pass atrribute %s not set.", attr);
   }
   for (const std::string& attr : required_graph_attrs_) {
-    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not exist.",
+    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
                    attr);
   }
   auto applied_graph = ApplyImpl(std::move(graph));
   // TODO(panyx0718): Add more verifications.
   PADDLE_ENFORCE(!HasCircle(*applied_graph),
                  "Illegal Pass. Generated graph shouldn't has cycle.");
+  applied_ = true;
   return applied_graph;
 }
 
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 3f65794fab..0f14083d25 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/variant.h"
@@ -56,7 +55,8 @@ class Pass {
   // Set a pointer to the attribute. Pass takes ownership of the attribute.
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
       VLOG(3) << "deleting " << attr_name;
@@ -89,6 +89,7 @@ class Pass {
     required_graph_attrs_.insert(attrs.begin(), attrs.end());
   }
 
+  mutable bool applied_{false};
   std::unordered_set<std::string> required_pass_attrs_;
   std::unordered_set<std::string> required_graph_attrs_;
   std::map<std::string, boost::any> attrs_;
@@ -118,14 +119,15 @@ class PassRegistry {
     return map_.find(pass_type) != map_.end();
   }
 
-  void Insert(const std::string &type, const PassCreator &pass_creator) {
-    PADDLE_ENFORCE(!Has(type), "Pass %s has been registered", type);
-    map_.insert({type, pass_creator});
+  void Insert(const std::string &pass_type, const PassCreator &pass_creator) {
+    PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type);
+    map_.insert({pass_type, pass_creator});
   }
 
-  std::unique_ptr<Pass> Get(const std::string &type) const {
-    PADDLE_ENFORCE(Has(type), "Pass %s has not been registered", type);
-    return map_.at(type)();
+  std::unique_ptr<Pass> Get(const std::string &pass_type) const {
+    PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered",
+                   pass_type);
+    return map_.at(pass_type)();
   }
 
  private:
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
new file mode 100644
index 0000000000..5b5011412e
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set<int>("copy_test_pass_attr", new int);
+    graph->Set<int>("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 112b48ca31..b5f01a9a2b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -44,7 +44,10 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
 #else
     const BuildStrategy &strategy) {
 #endif
+  // Convert the program to graph.
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  // Apply a graph viz pass to record a graph.
   if (!strategy.debug_graphviz_path_.empty()) {
     auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
     const std::string graph_path = string::Sprintf(
@@ -53,6 +56,7 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     graph = viz_pass->Apply(std::move(graph));
   }
 
+  // Convert graph to run on multi-devices.
   auto multi_device_pass =
       ir::PassRegistry::Instance().Get("multi_device_pass");
   multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
@@ -71,6 +75,7 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
 #endif
   graph = multi_device_pass->Apply(std::move(graph));
 
+  // Apply a graph print pass to record a graph with device info.
   if (!strategy.debug_graphviz_path_.empty()) {
     auto multi_device_print_pass =
         ir::PassRegistry::Instance().Get("multi_device_print_pass");
@@ -81,17 +86,10 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     graph = multi_device_print_pass->Apply(std::move(graph));
   }
 
+  // Verify that the graph is correct for multi-device executor.
   auto multi_device_check_pass =
       ir::PassRegistry::Instance().Get("multi_device_check_pass");
   graph = multi_device_check_pass->Apply(std::move(graph));
-
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", strategy.debug_graphviz_path_.c_str(), "_before_exec");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
   return graph;
 }
 

From 65d418f060507999d74c7adca0575e8b991e60b4 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 27 Jul 2018 15:27:08 +0800
Subject: [PATCH 17/40] complete im2col with padding==1 and speedup filter
 width==1

---
 paddle/fluid/operators/math/im2col.cc        |   8 +-
 paddle/fluid/operators/math/im2col_cfo_cpu.h | 218 +++++++++----------
 paddle/fluid/operators/math/im2col_test.cc   |  12 +-
 3 files changed, 113 insertions(+), 125 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 478900e1c2..1472edbbf4 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -40,10 +40,12 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
         dilation[1] == 1) {
       if (padding[0] == 0 && padding[1] == 0) {
         im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
-      } else {
-        im2col_sh1sw1dh1dw1<T>(im, padding, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
       }
-      return;
+      // TODO(TJ): complete padding >=2
     }
     im2col_common<T>(im, dilation, stride, padding, col);
   }
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
index af581f3218..ac843cdc71 100644
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-/*
+/**
  * The most common im2col algorithm.
  * Support dilation, stride and padding.
  */
@@ -61,9 +61,9 @@ inline void im2col_common(const framework::Tensor& im,
   }
 }
 
-/*
+/**
  * im2col algorithm with strides == 1, dilations == 1, paddings == 0
- * */
+ */
 template <typename T>
 inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
                                       framework::Tensor* col) {
@@ -96,11 +96,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
   }
 }
 
-// further optimize: padding == 1 need special
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 1
+ * and filter_width == 1 have a special implementation
+ */
 template <typename T>
-inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
-                                const std::vector<int>& padding,
-                                framework::Tensor* col) {
+inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
+                                      framework::Tensor* col) {
   int im_channels = im.dims()[0];
   int im_height = im.dims()[1];
   int im_width = im.dims()[2];
@@ -108,119 +110,57 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
   int filter_width = col->dims()[2];
   int output_height = col->dims()[3];
   int output_width = col->dims()[4];
-  constexpr int sh = 1;
-  constexpr int sw = 1;
+
+  constexpr int plh = 1;
+  constexpr int prh = 1;
+  constexpr int plw = 1;
+  constexpr int prw = 1;
 
   const T* im_data = im.data<T>();
   T* col_data = col->data<T>();
-  int col_matrix_width = output_width * output_height;
   int im_size = im_height * im_width;
-
-  int plh = padding[0];
-  int plw = padding[1];
-  int prh = (output_height - 1) * sh + filter_height - im_height - plh;
-  int prw = (output_width - 1) * sw + filter_width - im_width - plw;
-
-  // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
-  // TODO(TJ): refine ph*xxx
-  assert(plh == prh);                                  // because stride_h == 1
+  int col_matrix_width = output_width * output_height;
   int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
   int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
-  for (int ph = 0; ph < plh; ++ph) {
-    int sz = output_width * (plh - ph);
-    size_t copy_sz = sizeof(T) * sz;
-    T* col_start_l = col_data + ph * col_block_fh;
-    T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh +
-                     col_matrix_width - sz;
+
+  // fill height padding
+  {
+    size_t copy_size = sizeof(T) * output_width;
+    T* col_start_l = col_data;
+    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
+                     col_matrix_width - output_width;
     for (int ic = 0; ic < im_channels; ++ic) {
+      // TODO(TJ): move * outside
       T* dst_data_l = col_start_l + ic * col_block_ic;
       T* dst_data_r = col_start_r + ic * col_block_ic;
       for (int kw = 0; kw < filter_width; ++kw) {
-        std::memset(dst_data_l, 0, copy_sz);
-        std::memset(dst_data_r, 0, copy_sz);
+        std::memset(dst_data_l, 0, copy_size);
+        std::memset(dst_data_r, 0, copy_size);
         dst_data_l = dst_data_l + col_matrix_width;
         dst_data_r = dst_data_r + col_matrix_width;
       }
     }
   }
 
-  // fill width padding
-  assert(plw == prw);  // because stride_w == 1
-  if (plw == 1) {
-    auto pad = static_cast<T>(0);  // padding zero
+  auto pad = static_cast<T>(0);
+  if (filter_width == 1) {
+    // fill width padding
     for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): use add and resue stride
+      // TODO(TJ): move * outside
       T* dst_data_ic = col_data + ic * col_block_ic;
       for (int kh = 0; kh < filter_height; ++kh) {
-        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-        for (T* dst_data :
-             {dst_data_kh, dst_data_kh +
-                               (filter_width - prw) * col_matrix_width +
-                               output_width - 1}) {
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            *dst_data = pad;
-            dst_data = dst_data + output_width;
-          }
+        // TODO(TJ): move * outside
+        T* dst_data = dst_data_ic + kh * col_block_fh;
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width - 1;
+          *dst_data = pad;
+          ++dst_data;
         }
       }
     }
-  } else {
-    // padding_size > 1
-    for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): use add and resue stride
-      T* dst_data_ic = col_data + ic * col_block_ic;
-      for (int kh = 0; kh < filter_height; ++kh) {
-        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-        for (int kw = 0; kw < plw; ++kw) {
-          // TODO(TJ): reuse array outside this for
-          size_t sz = sizeof(T) * (plw - kw);
-          T* dst_data = dst_data_kh + kw * col_matrix_width;
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            std::memset(dst_data, 0, sz);
-            dst_data = dst_data + output_width;
-          }
-        }
-        // TODO(TJ): use reverse to save cache
-        for (int kw = 0; kw < prw; ++kw) {
-          // TODO(TJ): reuse array outside this for
-          auto num = (prw - kw);
-          size_t sz = sizeof(T) * num;
-          T* dst_data = dst_data_kh +
-                        (filter_width - 1 - kw) * col_matrix_width +
-                        output_width - num;
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            std::memset(dst_data, 0, sz);
-            dst_data = dst_data + output_width;
-          }
-        }
-      }
-    }
-  }
-
-  // fill im_data
-  // padding cover two cases:
-  // 1. kw > 2*pw: kw = 3, pw = 1
-  // 0 x x x x ... x x x x 0
-  // 1 1 1             1 1 1
-  // ==>
-  // 0 x ... x x
-  // x x ... x x
-  // x x ... x 0
-  // 2. kw < 2*pw: kw = 3, pw = 2
-  // 0 0 x x x ... x x x 0 0
-  // 1 1 1             1 1 1
-  // ==>
-  // 0 0 x ... x x x
-  // 0 x x ... x x 0
-  // x x x ... x 0 0
-
-  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
-  // (output_width-1)}
-  // length of copy_size is equal kw.
-  if (plw + prw < filter_width) {
+    // fill core
+    size_t copy_size = sizeof(T) * (output_width - plw - prw);
     for (int oh = 0; oh < output_height; ++oh) {
       const T* im_data_start =
           im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
@@ -230,33 +170,73 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
         for (int kh = 0; kh < filter_height; ++kh) {
           if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                          kh > (filter_height - prh - 1))) {
-            dst_data = dst_data + filter_width * col_matrix_width;
-            continue;
-          }
-          // TODO(TJ): reuse plw-kw outside this for
-          // try to unify
-          for (int kw = 0; kw < plw; ++kw) {
-            std::memcpy(dst_data + (plw - kw), src_data,
-                        sizeof(T) * (output_width - (plw - kw)));
-            dst_data = dst_data + col_matrix_width;
-          }
-          for (int kw = plw; kw < filter_width - prw; ++kw) {
-            std::memcpy(dst_data, src_data + (kw - plw),
-                        sizeof(T) * output_width);
-            dst_data = dst_data + col_matrix_width;
-          }
-          int i = 1;
-          for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
-            std::memcpy(dst_data, src_data + (kw - plw),
-                        sizeof(T) * (output_width - i));
             dst_data = dst_data + col_matrix_width;
+            continue;
           }
+          std::memcpy(dst_data + plw, src_data, copy_size);
+          dst_data = dst_data + col_matrix_width;
           src_data = src_data + im_width;
         }
       }
     }
-  } else {
-    LOG(FATAL) << "Not implement yet";
+    return;
+  }
+
+  // filter_width != 1
+  // fill width padding
+  for (int ic = 0; ic < im_channels; ++ic) {
+    // TODO(TJ): move * outside
+    T* dst_data_ic = col_data + ic * col_block_ic;
+    for (int kh = 0; kh < filter_height; ++kh) {
+      // TODO(TJ): move * outside
+      T* dst_data_kh = dst_data_ic + kh * col_block_fh;
+      for (T* dst_data :
+           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
+                             output_width - 1}) {
+        // TODO(TJ): from plh, saving repeated assignment
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width;
+        }
+      }
+    }
+  }
+
+  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+  // (output_width-1)}
+  // length of copy_size is equal kw.
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+    T* dst_data = col_data + oh * output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = im_data_start + ic * im_size;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                       kh > (filter_height - prh - 1))) {
+          dst_data = dst_data + filter_width * col_matrix_width;
+          continue;
+        }
+        // TODO(TJ): reuse plw-kw outside this for
+        // try to unify
+        for (int kw = 0; kw < plw; ++kw) {
+          std::memcpy(dst_data + (plw - kw), src_data,
+                      sizeof(T) * (output_width - (plw - kw)));
+          dst_data = dst_data + col_matrix_width;
+        }
+        for (int kw = plw; kw < filter_width - prw; ++kw) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * output_width);
+          dst_data = dst_data + col_matrix_width;
+        }
+        int i = 1;
+        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * (output_width - i));
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+    }
   }
 }
 
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 789d8e684a..ae2c90b33a 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -227,7 +227,8 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
   auto t3 = GetCurrentMs();
 
   LOG(INFO) << "before: " << (t3 - t2) / repeat
-            << ",after: " << (t2 - t1) / repeat;
+            << ",after: " << (t2 - t1) / repeat
+            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
 }
 
 TEST(math, im2col_cputest) {
@@ -244,6 +245,10 @@ TEST(math, im2col_cputest) {
     // height != width
     testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
                   /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
 
     // filter == 1
     testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
@@ -251,13 +256,14 @@ TEST(math, im2col_cputest) {
     testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
                   /*pw*/ p);
   }
+
   // padding_h != padding_w
   testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
                 /*pw*/ 2);
 
   // benchmark
-  for (int p : {0, 1, 2}) {
-    for (int k : {3, 5}) {
+  for (int p : {0, 1}) {
+    for (int k : {1, 3, 5}) {
       LOG(INFO) << "padding == " << p << ", filter == " << k;
       benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
                   /*ph*/ p, /*pw*/ p);

From 2aa732ba34e4456242a597c3f3701c2c47e09b78 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 29 Jul 2018 17:23:59 +0800
Subject: [PATCH 18/40] fix distribute transpiler to optimize selected rows

---
 paddle/fluid/framework/operator.cc                      | 2 ++
 python/paddle/fluid/transpiler/distribute_transpiler.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d1dc5fcd97..7c1c29fd9a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -679,6 +679,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
         CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      } else if (var->IsType<framework::SelectedRows>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
       }
     }
   }
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 4a9ea6af74..4b03bad0dd 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -494,6 +494,7 @@ class DistributeTranspiler(object):
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            optimize_blocks.append(table_opt_block)
             prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
             checkpoint_block_id = self._create_checkpoint_save_block(

From 91f63cd40128dc74e1fe37e0ffaa072af22c10bb Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 29 Jul 2018 19:59:12 +0800
Subject: [PATCH 19/40] fix split_ids_op and add unit test

---
 paddle/fluid/operators/split_ids_op.h         | 12 ++--
 .../tests/unittests/test_split_ids_op.py      | 60 +++++++++++++++++++
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d263426e07..86a3eaa5c4 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -68,9 +69,11 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
       // get rows for outputs
-      for (auto &id : ids_rows) {
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        outs[shard_id]->mutable_rows()->push_back(id);
+      std::map<int64_t, size_t> id_to_index;
+      for (size_t i = 0; i < ids_rows.size(); ++i) {
+        id_to_index[ids_rows[i]] = i;
+        size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
       }
 
       int64_t row_width = ids_dims[1];
@@ -80,7 +83,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
             {static_cast<int64_t>(out->rows().size()), row_width});
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
-          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+          memcpy(output + i * row_width,
+                 ids + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index e9f0a06a56..adf3345f1d 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -15,6 +15,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class TestSplitIdsOp(OpTest):
@@ -31,5 +33,63 @@ class TestSplitIdsOp(OpTest):
         self.check_output()
 
 
+class TestSpliteIds(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4, 9]
+        height = 20
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            for j in range(row_numel):
+                np_array[i, j] = rows[i] + j
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        outs_name = ["out%d" % i for i in xrange(3)]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
+
+        # expected output selected rows
+        expected_out0_rows = [0, 9]
+        expected_out1_rows = [7, 4]
+        expected_out2_rows = [5]
+
+        op = Operator("split_ids", Ids="X", Out=outs_name)
+
+        op.run(scope, place)
+
+        self.assertEqual(outs[0].rows(), expected_out0_rows)
+        self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[2].rows(), expected_out2_rows)
+
+        self.assertAlmostEqual(0.0, np.array(outs[0].get_tensor())[0, 0])
+        self.assertAlmostEqual(1.0, np.array(outs[0].get_tensor())[0, 1])
+        self.assertAlmostEqual(9.0, np.array(outs[0].get_tensor())[1, 0])
+        self.assertAlmostEqual(10.0, np.array(outs[0].get_tensor())[1, 1])
+
+        self.assertAlmostEqual(7.0, np.array(outs[1].get_tensor())[0, 0])
+        self.assertAlmostEqual(8.0, np.array(outs[1].get_tensor())[0, 1])
+        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 0])
+        self.assertAlmostEqual(5.0, np.array(outs[1].get_tensor())[1, 1])
+
+        self.assertAlmostEqual(5.0, np.array(outs[2].get_tensor())[0, 0])
+        self.assertAlmostEqual(6.0, np.array(outs[2].get_tensor())[0, 1])
+
+
 if __name__ == '__main__':
     unittest.main()

From 41693b6ae7460f3c8f5a8bf49287980dfd2da466 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 29 Jul 2018 20:13:03 +0800
Subject: [PATCH 20/40] optimize code

---
 .../tests/unittests/test_split_ids_op.py      | 29 +++++++------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index adf3345f1d..4001877290 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -65,30 +65,21 @@ class TestSpliteIds(unittest.TestCase):
         ]
 
         # expected output selected rows
-        expected_out0_rows = [0, 9]
-        expected_out1_rows = [7, 4]
-        expected_out2_rows = [5]
+        expected_out_rows = [[0, 9], [7, 4], [5]]
 
         op = Operator("split_ids", Ids="X", Out=outs_name)
 
         op.run(scope, place)
 
-        self.assertEqual(outs[0].rows(), expected_out0_rows)
-        self.assertEqual(outs[1].rows(), expected_out1_rows)
-        self.assertEqual(outs[2].rows(), expected_out2_rows)
-
-        self.assertAlmostEqual(0.0, np.array(outs[0].get_tensor())[0, 0])
-        self.assertAlmostEqual(1.0, np.array(outs[0].get_tensor())[0, 1])
-        self.assertAlmostEqual(9.0, np.array(outs[0].get_tensor())[1, 0])
-        self.assertAlmostEqual(10.0, np.array(outs[0].get_tensor())[1, 1])
-
-        self.assertAlmostEqual(7.0, np.array(outs[1].get_tensor())[0, 0])
-        self.assertAlmostEqual(8.0, np.array(outs[1].get_tensor())[0, 1])
-        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 0])
-        self.assertAlmostEqual(5.0, np.array(outs[1].get_tensor())[1, 1])
-
-        self.assertAlmostEqual(5.0, np.array(outs[2].get_tensor())[0, 0])
-        self.assertAlmostEqual(6.0, np.array(outs[2].get_tensor())[0, 1])
+        for i in range(len(outs)):
+            expected_rows = expected_out_rows[i]
+            self.assertEqual(outs[i].rows(), expected_rows)
+            for j in range(len(expected_rows)):
+                row = expected_rows[j]
+                self.assertAlmostEqual(
+                    float(row), np.array(outs[i].get_tensor())[j, 0])
+                self.assertAlmostEqual(
+                    float(row + 1), np.array(outs[i].get_tensor())[j, 1])
 
 
 if __name__ == '__main__':

From 4d2405d851812daded4f2344bb817a61dc804f47 Mon Sep 17 00:00:00 2001
From: Superjomn <yanchunwei@outlook.com>
Date: Mon, 30 Jul 2018 01:26:48 +0000
Subject: [PATCH 21/40] inference analysis support ssa

---
 .../inference/analysis/data_flow_graph.h      |  2 ++
 .../analysis/fluid_to_data_flow_graph_pass.cc | 19 +++++++++++++++----
 .../analysis/fluid_to_data_flow_graph_pass.h  |  2 +-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 1c60d5de21..bc1875f4d8 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -36,6 +36,8 @@ namespace analysis {
 
 /*
  * DataFlowGraph - A container of Value and Function Nodes.
+ *
+ * This is the base graph for any other type of graphs, such as SSA or CFG.
  */
 struct DataFlowGraph {
   NodeMap nodes;
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 496921db9e..88fdf8c9cb 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -40,6 +40,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
   PADDLE_ENFORCE(desc_);
   // insert vars
+  // `var2id` maps a variable's name to its Node-id; the Node-id is updated
+  // to the variable's latest alias Node during graph building.
   std::unordered_map<std::string, size_t> var2id;
   auto &main_block = desc_->blocks(framework::kRootBlockIndex);
   for (int i = 0; i < main_block.vars_size(); i++) {
@@ -51,6 +53,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     var2id[var.name()] = v->id();
   }
 
+  // A variable in an SSA can only be written once, so if a variable is
+  // written multiple times (quite common in our ProgramDesc design), multiple
+  // alias Nodes of this variable are created, each written exactly once.
+
+  // A set that keeps the Nodes of all the variables that have been written
+  // (as outputs). Once an Op's output variable hits the set, a new alias
+  // Node is created and the global alias for this variable is updated. That
+  // makes the Data Flow Graph an SSA.
+  std::unordered_set<Node *> unique_written_vars;
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,33 +73,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
 
     // set inputs and outputs
-    std::unordered_set<Node *> inlinks;
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
-        inlinks.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (inlinks.count(out)) {
+        if (unique_written_vars.count(out)) {
           // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
           auto *out_alias = graph->nodes.Create(Node::Type::kValue);
           out_alias->SetName(out->name());
           out_alias->SetPbDesc(out->pb_desc());
           out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          var2id[out_alias->name()] =
+              out_alias->id();  // update variable's alias Node
           LOG(INFO) << "loop found in graph, create SSA alias node ["
                     << out_alias->repr() << "] for [" << out->repr() << "]";
           out = out_alias;
         }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
+        unique_written_vars.insert(out);
       }
     }
   }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index da8463b63b..fb948bf224 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -30,7 +30,7 @@ namespace inference {
 namespace analysis {
 
 /*
- * Transform a FluidDesc to a data flow graph.
+ * Transform a FluidDesc to a SSA.
  */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
  public:

From d8d2dbcfacfd9e53f9d4d4e6d70ab98bd05e5d38 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Mon, 30 Jul 2018 11:29:17 +0800
Subject: [PATCH 22/40] further optimize im2col using variables

---
 paddle/fluid/operators/math/im2col_cfo_cpu.h | 35 ++++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
index ac843cdc71..0d32bc5bd0 100644
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -80,11 +80,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
   int col_matrix_width = output_width * output_height;
   int im_size = im_height * im_width;
   size_t copy_size = sizeof(T) * output_width;
+  const T* im_data_oh = im_data;
+  T* dst_data_oh = col_data;
   for (int oh = 0; oh < output_height; ++oh) {
-    const T* im_data_start = im_data + oh * im_width;
-    T* dst_data = col_data + oh * output_width;
+    const T* src_data_ic = im_data_oh;
+    T* dst_data = dst_data_oh;
     for (int ic = 0; ic < im_channels; ++ic) {
-      const T* src_data = im_data_start + ic * im_size;
+      const T* src_data = src_data_ic;
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
           std::memcpy(dst_data, src_data + kw, copy_size);
@@ -92,7 +94,10 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
         }
         src_data = src_data + im_width;
       }
+      src_data_ic = src_data_ic + im_size;
     }
+    im_data_oh = im_data_oh + im_width;
+    dst_data_oh = dst_data_oh + output_width;
   }
 }
 
@@ -130,34 +135,36 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
     T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
                      col_matrix_width - output_width;
     for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): move * outside
-      T* dst_data_l = col_start_l + ic * col_block_ic;
-      T* dst_data_r = col_start_r + ic * col_block_ic;
+      T* dst_data_l = col_start_l;
+      T* dst_data_r = col_start_r;
       for (int kw = 0; kw < filter_width; ++kw) {
         std::memset(dst_data_l, 0, copy_size);
         std::memset(dst_data_r, 0, copy_size);
         dst_data_l = dst_data_l + col_matrix_width;
         dst_data_r = dst_data_r + col_matrix_width;
       }
+      col_start_l = col_start_l + col_block_ic;
+      col_start_r = col_start_r + col_block_ic;
     }
   }
 
   auto pad = static_cast<T>(0);
   if (filter_width == 1) {
     // fill width padding
+    T* dst_data_ic = col_data;
     for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): move * outside
-      T* dst_data_ic = col_data + ic * col_block_ic;
+      T* dst_data_kh = dst_data_ic;
       for (int kh = 0; kh < filter_height; ++kh) {
-        // TODO(TJ): move * outside
-        T* dst_data = dst_data_ic + kh * col_block_fh;
+        T* dst_data = dst_data_kh;
         for (int oh = 0; oh < output_height; ++oh) {
           *dst_data = pad;
           dst_data = dst_data + output_width - 1;
           *dst_data = pad;
           ++dst_data;
         }
+        dst_data_kh = dst_data_kh + col_block_fh;
       }
+      dst_data_ic = dst_data_ic + col_block_ic;
     }
     // fill core
     size_t copy_size = sizeof(T) * (output_width - plw - prw);
@@ -184,12 +191,10 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
 
   // filter_width != 1
   // fill width padding
+  T* dst_data_ic = col_data;
   for (int ic = 0; ic < im_channels; ++ic) {
-    // TODO(TJ): move * outside
-    T* dst_data_ic = col_data + ic * col_block_ic;
+    T* dst_data_kh = dst_data_ic;
     for (int kh = 0; kh < filter_height; ++kh) {
-      // TODO(TJ): move * outside
-      T* dst_data_kh = dst_data_ic + kh * col_block_fh;
       for (T* dst_data :
            {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
                              output_width - 1}) {
@@ -199,7 +204,9 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
           dst_data = dst_data + output_width;
         }
       }
+      dst_data_kh = dst_data_kh + col_block_fh;
     }
+    dst_data_ic = dst_data_ic + col_block_ic;
   }
 
   // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *

From 91b114a7870b62764f0a493c520c5fab1d89f6a7 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 30 Jul 2018 11:39:48 +0800
Subject: [PATCH 23/40] change map to unordered_map

---
 paddle/fluid/operators/split_ids_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index 86a3eaa5c4..d6caf626cb 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <map>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -69,7 +69,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
       // get rows for outputs
-      std::map<int64_t, size_t> id_to_index;
+      std::unordered_map<int64_t, size_t> id_to_index;
       for (size_t i = 0; i < ids_rows.size(); ++i) {
         id_to_index[ids_rows[i]] = i;
         size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;

From 147bf00ffee4f02882d852ca433ea9322c156e3d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 30 Jul 2018 12:35:18 +0800
Subject: [PATCH 24/40] clear mutable rows for the output of split_ids_op

---
 paddle/fluid/operators/split_ids_op.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d6caf626cb..c4af5a65fc 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -68,6 +68,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
+      for (auto &out : outs) {
+        out->mutable_rows()->clear();
+      }
       // get rows for outputs
       std::unordered_map<int64_t, size_t> id_to_index;
       for (size_t i = 0; i < ids_rows.size(); ++i) {

From f4c4c6179c9628055e13fc2237851a0fba801702 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 30 Jul 2018 14:18:48 +0800
Subject: [PATCH 25/40] optimize unit test of test_split_ids_op

---
 .../tests/unittests/test_split_ids_op.py      | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index 4001877290..ca78613098 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -69,17 +69,18 @@ class TestSpliteIds(unittest.TestCase):
 
         op = Operator("split_ids", Ids="X", Out=outs_name)
 
-        op.run(scope, place)
-
-        for i in range(len(outs)):
-            expected_rows = expected_out_rows[i]
-            self.assertEqual(outs[i].rows(), expected_rows)
-            for j in range(len(expected_rows)):
-                row = expected_rows[j]
-                self.assertAlmostEqual(
-                    float(row), np.array(outs[i].get_tensor())[j, 0])
-                self.assertAlmostEqual(
-                    float(row + 1), np.array(outs[i].get_tensor())[j, 1])
+        for _ in range(3):
+            op.run(scope, place)
+
+            for i in range(len(outs)):
+                expected_rows = expected_out_rows[i]
+                self.assertEqual(outs[i].rows(), expected_rows)
+                for j in range(len(expected_rows)):
+                    row = expected_rows[j]
+                    self.assertAlmostEqual(
+                        float(row), np.array(outs[i].get_tensor())[j, 0])
+                    self.assertAlmostEqual(
+                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])
 
 
 if __name__ == '__main__':

From f628b1dfad227ffd339fa57a27d13614af6ab958 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 30 Jul 2018 15:23:54 +0800
Subject: [PATCH 26/40] fix_tests_on_gcc482

---
 paddle/fluid/platform/device_tracer.cc | 10 +++++++---
 tools/manylinux1/Dockerfile.x64        |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf..696ed61981 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -273,14 +273,18 @@ class DeviceTracerImpl : public DeviceTracer {
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
+    std::string kernel_name;
     for (const KernelRecord &r : kernel_records_) {
       if (correlations_.find(r.correlation_id) == correlations_.end()) {
-        fprintf(stderr, "cannot relate a kernel activity\n");
-        continue;
+        // fprintf(stderr, "cannot relate a kernel activity\n");
+        // continue;
+        kernel_name = "Unknown";
+      } else {
+        kernel_name = correlations_.at(r.correlation_id);
       }
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      event->set_name(correlations_.at(r.correlation_id));
+      event->set_name(kernel_name);
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index bca0b77ad7..0b72ea323b 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
   bash build_scripts/install_nccl2.sh && rm -r build_scripts

From b7b600225ec954cac89dd40485480567123f750b Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 30 Jul 2018 15:31:39 +0800
Subject: [PATCH 27/40] clean up

---
 paddle/fluid/platform/device_tracer.cc | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 696ed61981..90ab85905d 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -276,15 +276,12 @@ class DeviceTracerImpl : public DeviceTracer {
     std::string kernel_name;
     for (const KernelRecord &r : kernel_records_) {
       if (correlations_.find(r.correlation_id) == correlations_.end()) {
-        // fprintf(stderr, "cannot relate a kernel activity\n");
-        // continue;
-        kernel_name = "Unknown";
-      } else {
-        kernel_name = correlations_.at(r.correlation_id);
+        fprintf(stderr, "cannot relate a kernel activity\n");
+        continue;
       }
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      event->set_name(kernel_name);
+      event->set_name(correlations_.at(r.correlation_id));
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);

From ff97c709dfe25f2470e489505db4293eb0206d95 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 30 Jul 2018 15:32:20 +0800
Subject: [PATCH 28/40] clean up

---
 paddle/fluid/platform/device_tracer.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 90ab85905d..d9e2afadaf 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -273,7 +273,6 @@ class DeviceTracerImpl : public DeviceTracer {
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
-    std::string kernel_name;
     for (const KernelRecord &r : kernel_records_) {
       if (correlations_.find(r.correlation_id) == correlations_.end()) {
         fprintf(stderr, "cannot relate a kernel activity\n");

From f372f27e3f724a55ed221f18d3c30b18b4cde8cf Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 30 Jul 2018 15:36:15 +0800
Subject: [PATCH 29/40] Hide APIs for While, StaticRNN, ParallelDo. (#12332)

* Hide APIs for While, StaticRNN, ParallelDo.
---
 paddle/fluid/API.spec                      |  5 +----
 python/paddle/fluid/layers/control_flow.py | 22 +++++++++++++---------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6efb03dabe..5f3bfa2965 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -170,6 +170,7 @@ paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], var
 paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -201,7 +202,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -225,17 +225,14 @@ paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=
 paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
 paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f05ae6d5d1..3ee1c636ac 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -21,6 +21,7 @@ from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
 import numpy
+import warnings
 
 __all__ = [
     'While',
@@ -280,6 +281,9 @@ class ParallelDo(object):
     """
 
     def __init__(self, places, use_nccl=False, name=None):
+        warnings.warn(
+            "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
+            Warning)
         self.helper = LayerHelper("parallel_do", name=name)
         self.inputs = []
         self.places = places
@@ -338,7 +342,7 @@ class ParallelDo(object):
 
         return [parent_block.var(name) for name in params]
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         current_block = main_program.current_block()
         parent_block = self.parent_block()
@@ -394,7 +398,7 @@ class BlockGuardWithCompletion(BlockGuard):
         if exc_type is not None:
             return False
         self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_op()
+        self.rnn._complete_op()
         return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
                                                               exc_tb)
 
@@ -470,7 +474,7 @@ class StaticRNN(object):
             if shape is None or batch_ref is None:
                 raise ValueError(
                     "if init is None, memory at least need shape and batch_ref")
-            parent_block = self.parent_block()
+            parent_block = self._parent_block()
             var_name = unique_name.generate("@".join(
                 [self.helper.name, "memory_boot"]))
             boot_var = parent_block.create_var(
@@ -527,7 +531,7 @@ class StaticRNN(object):
             outputs={'Out': tmp_o},
             attrs={'dtype': o.dtype})
 
-        out_var = self.parent_block().create_var(
+        out_var = self._parent_block().create_var(
             name=tmp_o.name,
             shape=[self.seq_len] + list(tmp_o.shape),
             dtype=tmp_o.dtype)
@@ -543,7 +547,7 @@ class StaticRNN(object):
             raise TypeError("update memory should take variables")
         self.memories[mem.name].mem = var
 
-    def parent_block(self):
+    def _parent_block(self):
         prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
         assert parent_idx >= 0
@@ -560,10 +564,10 @@ class StaticRNN(object):
         else:
             return self.outputs
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         rnn_block = main_program.current_block()
-        parent_block = self.parent_block()
+        parent_block = self._parent_block()
 
         local_inputs = set()
 
@@ -643,7 +647,7 @@ class WhileGuard(BlockGuard):
         if exc_type is not None:
             return False
         self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op.complete()
+        self.while_op._complete()
         return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
 
 
@@ -690,7 +694,7 @@ class While(object):
     def block(self):
         return WhileGuard(self)
 
-    def complete(self):
+    def _complete(self):
         main_program = self.helper.main_program
         while_block = main_program.current_block()
         parent_block = main_program.block(main_program.current_block()

From 3e4083ed1f7a75e17f67b3a31831db56d01a1bb4 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 30 Jul 2018 19:35:05 +0800
Subject: [PATCH 30/40] Make exception handling of threaded_ssa_graph_executor
 an independent class

---
 .../framework/details/exception_holder.h      | 83 +++++++++++++++++++
 .../details/threaded_ssa_graph_executor.cc    | 27 ++----
 .../details/threaded_ssa_graph_executor.h     |  4 +-
 3 files changed, 90 insertions(+), 24 deletions(-)
 create mode 100644 paddle/fluid/framework/details/exception_holder.h

diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
new file mode 100644
index 0000000000..6e302a2923
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ExceptionHolder {
+ public:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
+  bool ExceptionCatched() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return exception_.get() != nullptr;
+  }
+
+  void Throw() {
+    std::lock_guard<std::mutex> lock(mu_);
+    switch (type_) {
+      case kNone:
+        break;
+      case kEnforceNotMet: {
+        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
+        throw e;
+        break;
+      }
+      case kEOF: {
+        auto e = *static_cast<platform::EOFException*>(exception_.get());
+        throw e;
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown exception.";
+    }
+    exception_.reset();
+    type_ = kNone;
+  }
+
+  void Clear() {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset();
+    type_ = kNone;
+  }
+
+ private:
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  ExceptionType type_{kNone};
+
+  std::unique_ptr<std::exception> exception_;
+  mutable std::mutex mu_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c19f74476f..00f1f262a6 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   // Clean run context
   run_op_futures_.clear();
-  exception_.reset();
+  exception_holder_.Clear();
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      std::unique_lock<std::mutex> l(exception_mu_);
-      if (exception_) {
-        l.unlock();
+      if (exception_holder_.ExceptionCatched()) {
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        l.lock();
-        std::exception *exp = exception_.get();
-        if (dynamic_cast<platform::EOFException *>(exp)) {
-          auto e = *static_cast<platform::EOFException *>(exp);
-          throw e;
-        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
-          auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          throw e;
-        } else {
-          LOG(FATAL) << "Unknown exception.";
-        }
+        exception_holder_.Throw();
       } else {
         continue;
       }
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EOFException ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      // EOFException will not cover up existing EnforceNotMet.
-      if (exception_.get() == nullptr) {
-        exception_.reset(new platform::EOFException(ex));
-      }
+      exception_holder_.Catch(ex);
     } catch (platform::EnforceNotMet ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      exception_.reset(new platform::EnforceNotMet(ex));
+      exception_holder_.Catch(ex);
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 3d67daa45e..4f3e5a6288 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,6 +24,7 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -58,8 +59,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  std::mutex exception_mu_;
-  std::unique_ptr<std::exception> exception_;
+  ExceptionHolder exception_holder_;
   std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,

From 0b861bbca9ca8cf383e8c559a1d88f1e62b3d518 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 30 Jul 2018 22:27:35 +0800
Subject: [PATCH 31/40] add profiler for listen_and_serv op

---
 paddle/fluid/operators/listen_and_serv_op.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 438b44b42a..fb475e3253 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -25,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_int32(listen_and_serv_profile_period);
+DEFINE_int32(listen_and_serv_profile_period, 0,
+             "the period of listen_and_serv to do profile");
+
 namespace paddle {
 namespace operators {
 
@@ -122,7 +126,13 @@ void ListenAndServOp::RunSyncLoop(
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->ResetBarrierCounter();
+
+  int32_t profile_step = 0;
   while (true) {
+    if (FLAGS_listen_and_serv_profile_period > 0 && profile_step == 0) {
+      auto pf_state = paddle::platform::ProfilerState::kCPU;
+      paddle::platform::EnableProfiler(pf_state);
+    }
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
     rpc_service_->SetCond(distributed::kRequestSend);
@@ -164,6 +174,14 @@ void ListenAndServOp::RunSyncLoop(
     // reset received sparse vars to avoid reuse it in the next mini-batch
     dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
+    if (FLAGS_listen_and_serv_profile_period > 0 &&
+        profile_step == FLAGS_listen_and_serv_profile_period) {
+      paddle::platform::DisableProfiler(
+          paddle::platform::EventSortingKey::kTotal, "/dev/null");
+      profile_step = 0;
+    } else {
+      profile_step++;
+    }
   }  // while(true)
 }
 

From c8c8c01a235f4e370073f5946b08122b9bb92c97 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 30 Jul 2018 22:49:54 +0800
Subject: [PATCH 32/40] fix mac build of graph_executor

---
 .../framework/details/scope_buffered_ssa_graph_executor.h     | 4 +++-
 paddle/fluid/framework/details/threaded_ssa_graph_executor.h  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 1b188aec59..5e87e0bf50 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
 
-  const ir::Graph& Graph() const { return underlying_executor_->Graph(); }
+  const ir::Graph& Graph() const override {
+    return underlying_executor_->Graph();
+  }
 
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 82d6b5272a..b0aaf60701 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -42,7 +42,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                            const std::vector<platform::Place> &places,
                            std::unique_ptr<ir::Graph> &&graph);
 
-  const ir::Graph &Graph() const { return *graph_; }
+  const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;

From 91fb0156ca1a97a247f571960440bae993c28a1c Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Tue, 31 Jul 2018 09:09:33 +0800
Subject: [PATCH 33/40] Memory/reshape op (#12414)

* "remove inplace in single op"

* "fix ci"

* "add transpiler case"

* fix conflict

* "fix reshape"

* "delete reshape inplace attr"

* "follow the comments"

* "rerun ci"
---
 paddle/fluid/operators/reshape_op.cc          | 31 +++----------------
 python/paddle/fluid/layers/nn.py              |  9 +++---
 .../test_memory_optimization_transpiler.py    | 24 ++++++++++++++
 .../fluid/tests/unittests/test_reshape_op.py  |  6 ++--
 4 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index a9fd1869c9..a1dfe39c3a 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
-    AddAttr<bool>("inplace",
-                  "(default: false) Change the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
@@ -233,16 +227,9 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    bool inplace = ctx.Attr<bool>("inplace");
+    out->mutable_data(ctx.GetPlace(), in->type());
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);
     out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data(ctx.GetPlace(), in->type());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
   }
 };
 
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
   void operator()(const framework::ExecutionContext &ctx) const {
     auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto in_dims = d_x->dims();
 
     d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    d_x->Resize(in_dims);
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5d7f1eadd9..058acd4a50 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4473,15 +4473,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")
 
     helper = LayerHelper("reshape", **locals())
-    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="reshape",
         inputs=inputs,
-        attrs={"shape": shape,
-               "inplace": inplace},
-        outputs={"Out": reshaped})
+        attrs={"shape": shape},
+        outputs={"Out": out})
 
-    return helper.append_activation(reshaped)
+    return helper.append_activation(out)
 
 
 def lod_reset(x, y=None, target_lod=None):
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index cfd6e63e12..67733807f8 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
         print(str(result_program))
 
 
+class TestMemoryTranspiler2(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.program = program
+
+    def test_inplace_ops(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index f51b5a7e99..2f5558578a 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
 
     def test_check_output(self):
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
 
     def test_check_output(self):

From 0b62f61d2968cc34556c8e74e540469a45b21914 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 31 Jul 2018 09:16:48 +0800
Subject: [PATCH 34/40] add init flag in __init__.py for
 listen_and_serv_profile_period

---
 paddle/fluid/CMakeLists.txt                  | 2 +-
 paddle/fluid/operators/listen_and_serv_op.cc | 3 ++-
 python/paddle/fluid/__init__.py              | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d274d96c29..e2e26fc5d4 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -6,4 +6,4 @@ add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
 # NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
+#add_subdirectory(inference)
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index fb475e3253..760cda60a9 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -19,13 +19,14 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "gflags/gflags.h"
+
 #include "paddle/fluid/operators/detail/macros.h"
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
-DECLARE_int32(listen_and_serv_profile_period);
 DEFINE_int32(listen_and_serv_profile_period, 0,
              "the period of listen_and_serv to do profile");
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index d1d6dd75ee..956e3c4348 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -127,6 +127,7 @@ def __bootstrap__():
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
+        read_env_flags.append('listen_and_serv_profile_period')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [

From d04dca37983c16ede418f8d0f1888394be8a5ec2 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 31 Jul 2018 09:17:33 +0800
Subject: [PATCH 35/40] revert cmakelist

---
 paddle/fluid/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index e2e26fc5d4..d274d96c29 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -6,4 +6,4 @@ add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
 # NOTE: please add subdirectory inference at last.
-#add_subdirectory(inference)
+add_subdirectory(inference)

From 7e46a8d172f808b5731784d1703f1d2fa26b5bde Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 31 Jul 2018 09:35:23 +0800
Subject: [PATCH 36/40] fix logical bug, optimize code

---
 paddle/fluid/operators/listen_and_serv_op.cc | 26 ++++++++++++--------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 760cda60a9..e14b148cc0 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -130,9 +130,14 @@ void ListenAndServOp::RunSyncLoop(
 
   int32_t profile_step = 0;
   while (true) {
-    if (FLAGS_listen_and_serv_profile_period > 0 && profile_step == 0) {
-      auto pf_state = paddle::platform::ProfilerState::kCPU;
-      paddle::platform::EnableProfiler(pf_state);
+    PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period,
+                      "profile_step should not be larger then "
+                      "FLAGS_listen_and_serv_profile_period");
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == 0) {
+        auto pf_state = paddle::platform::ProfilerState::kCPU;
+        paddle::platform::EnableProfiler(pf_state);
+      }
     }
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
@@ -175,13 +180,14 @@ void ListenAndServOp::RunSyncLoop(
     // reset received sparse vars to avoid reuse it in the next mini-batch
     dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
-    if (FLAGS_listen_and_serv_profile_period > 0 &&
-        profile_step == FLAGS_listen_and_serv_profile_period) {
-      paddle::platform::DisableProfiler(
-          paddle::platform::EventSortingKey::kTotal, "/dev/null");
-      profile_step = 0;
-    } else {
-      profile_step++;
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == FLAGS_listen_and_serv_profile_period) {
+        paddle::platform::DisableProfiler(
+            paddle::platform::EventSortingKey::kTotal, "/dev/null");
+        profile_step = 0;
+      } else {
+        profile_step++;
+      }
     }
   }  // while(true)
 }

From 5e6f7bc569b767519881c4224c7d985262492714 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 31 Jul 2018 10:56:28 +0800
Subject: [PATCH 37/40] compress the fluid.tgz

---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0f334b2892..a8bc16f1b5 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -534,7 +534,7 @@ EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
-        tar -cf fluid.tgz fluid
+        tar -czf fluid.tgz fluid
       fi
 }
 

From 6d3da458a77101e2bbbb8142db32e4d81be53ca2 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Tue, 31 Jul 2018 12:20:40 +0800
Subject: [PATCH 38/40] Fix/float16 style (#12446)

* "rewrite the test case"

* "follow comment"
---
 paddle/fluid/platform/cuda_helper_test.cu | 183 +++++++++++++---------
 paddle/fluid/platform/cuda_primitives.h   |  20 +--
 2 files changed, 119 insertions(+), 84 deletions(-)

diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index 4a47ba5cca..ca5ca1caeb 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
-#include <bitset>
 #include <iostream>
 #include <random>
 
@@ -25,13 +24,13 @@
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
 
-#define CUDA_ATOMIC_KERNEL(op, T)                                      \
-  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;       \
-         i += blockDim.x * gridDim.x) {                                \
-      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);         \
-    }                                                                  \
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
   }
+}
 
 template <typename T>
 struct AddFunctor {
@@ -39,80 +38,116 @@ struct AddFunctor {
 };
 
 template <typename T>
-struct SubFunctor {
-  T operator()(const T& a, const T& b) { return a - b; }
-};
-
-// NOTE(dzhwinter): the float16 add has small underflow/overflow
-// so we use EXPECT_NEAR to check the result.
-#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                 \
-  void Test##T##op(size_t num) {                                        \
-    T *in1, *in2, *out;                                                 \
-    T *d_in1, *d_in2;                                                   \
-    size_t size = sizeof(T) * num;                                      \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                 \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                 \
-    in1 = reinterpret_cast<T*>(malloc(size));                           \
-    in2 = reinterpret_cast<T*>(malloc(size));                           \
-    out = reinterpret_cast<T*>(malloc(size));                           \
-    std::minstd_rand engine;                                            \
-    std::uniform_real_distribution<double> dist(0.0, 1.0);              \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      in1[i] = static_cast<T>(dist(engine));                            \
-      in2[i] = static_cast<T>(dist(engine));                            \
-    }                                                                   \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);               \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);               \
-    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);      \
-    cudaDeviceSynchronize();                                            \
-    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);               \
-    cudaDeviceSynchronize();                                            \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      EXPECT_NEAR(static_cast<float>(out[i]),                           \
-                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
-                  0.001);                                               \
-    }                                                                   \
-    free(in1);                                                          \
-    free(in2);                                                          \
-    free(out);                                                          \
-    cudaFree(d_in1);                                                    \
-    cudaFree(d_in2);                                                    \
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
   }
-CUDA_ATOMIC_KERNEL(Add, float);
-CUDA_ATOMIC_KERNEL(Add, double);
-CUDA_ATOMIC_KERNEL(Add, float16);
-
-ARITHMETIC_KERNEL_LAUNCH(Add, float);
-ARITHMETIC_KERNEL_LAUNCH(Add, double);
-ARITHMETIC_KERNEL_LAUNCH(Add, float16);
-
-namespace paddle {
-namespace platform {
-USE_CUDA_ATOMIC(Sub, int);
-};
-};
-CUDA_ATOMIC_KERNEL(Sub, int);
-ARITHMETIC_KERNEL_LAUNCH(Sub, int);
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
 
 // cuda primitives
 TEST(CudaAtomic, Add) {
-  TestfloatAdd(static_cast<size_t>(10));
-  TestfloatAdd(static_cast<size_t>(1024 * 1024));
-  TestdoubleAdd(static_cast<size_t>(10));
-  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
-}
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
 
-TEST(CudaAtomic, Sub) {
-  TestintSub(static_cast<size_t>(10));
-  TestintSub(static_cast<size_t>(1024 * 1024));
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
 }
 
 TEST(CudaAtomic, float16) {
-  using paddle::platform::float16;
-  Testfloat16Add(static_cast<size_t>(1));
-  Testfloat16Add(static_cast<size_t>(2));
-  Testfloat16Add(static_cast<size_t>(3));
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// float16 atomic add with host buffers offset by shift_bit bytes
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // offset the host buffers by shift_bit bytes to mimic an unaligned address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same as the float16 test case
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
 
-  Testfloat16Add(static_cast<size_t>(10));
-  Testfloat16Add(static_cast<size_t>(1024 * 1024));
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 94ce83975a..67ea64833d 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 
 // convert the value into float and do the add arithmetic.
 // then store the result into a uint32.
-inline __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
   float16 low_half;
   // the float16 in lower 16bits
-  low_half.x = static_cast<uint16_t>(val & 0xffffu);
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
   low_half = static_cast<float16>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
-inline __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
   float16 high_half;
   // the float16 in higher 16bits
   high_half.x = static_cast<uint16_t>(val >> 16);
   high_half = static_cast<float16>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Add, float16) {
   // concrete packed float16 value may exsits in lower or higher 16bits
   // of the 32bits address.
-  uint32_t *address_as_ui =
-      reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) -
-                                   (reinterpret_cast<size_t>(address) & 2));
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
   float val_f = static_cast<float>(val);
   uint32_t old = *address_as_ui;
   uint32_t sum;
   uint32_t newval;
   uint32_t assumed;
-  if (((size_t)address & 2) == 0) {
+  if (((uintptr_t)address & 0x02) == 0) {
     // the float16 value stay at lower 16 bits of the address.
     do {
       assumed = old;
       old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
     } while (old != assumed);
     float16 ret;
-    ret.x = old & 0xffffu;
+    ret.x = old & 0xFFFFu;
     return ret;
   } else {
     // the float16 value stay at higher 16 bits of the address.

From 31a2c8768899f218da188b8804cdeaface318d4d Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Tue, 31 Jul 2018 14:48:11 +0800
Subject: [PATCH 39/40] fea/lightly support lod (#12451)

---
 .../inference/api/api_anakin_engine_tester.cc  | 18 ++++++++++--------
 paddle/fluid/inference/api/api_impl.cc         | 11 +++++++++++
 .../api/api_tensorrt_subgraph_engine_tester.cc |  9 ++++-----
 .../api/demo_ci/simple_on_word2vec.cc          | 17 +++++++++--------
 paddle/fluid/inference/api/demo_ci/vis_demo.cc | 10 +++++-----
 .../fluid/inference/api/paddle_inference_api.h |  2 +-
 6 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
index d6d631bfba..7554fe4989 100644
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
@@ -37,19 +37,21 @@ TEST(inference, anakin) {
 
   float data[1 * 3 * 224 * 224] = {1.0f};
 
-  PaddleTensor tensor{.name = "input_0",
-                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = PaddleBuf(data, sizeof(data)),
-                      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;
 
   // For simplicity, we set all the slots with the same data.
   std::vector<PaddleTensor> paddle_tensor_feeds;
   paddle_tensor_feeds.emplace_back(std::move(tensor));
 
-  PaddleTensor tensor_out{.name = "prob_out",
-                          .shape = std::vector<int>({1000, 1}),
-                          .data = PaddleBuf(),
-                          .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor_out;
+  tensor_out.name = "prob_out";
+  tensor_out.shape = std::vector<int>({1000, 1});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
 
   std::vector<PaddleTensor> outputs;
   outputs.emplace_back(std::move(tensor_out));
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 58fd7c6f8b..08d7af6d3a 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                 inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+
     feeds->push_back(input);
   }
   return true;
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
       buffer.Resize(sizeof(float) * data.size());
     }
     std::memcpy(buffer.data(), data.data(), buffer.length());
+    // copy LoD
+    for (const auto &level : fetchs[i].lod()) {
+      outputs->at(i).lod.emplace_back(level);
+    }
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index 62d98a7967..fcbf9b89d6 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
     std::vector<int64_t> data(20);
     for (int i = 0; i < 20; i++) data[i] = i;
 
-    PaddleTensor tensor{
-        .name = "",
-        .shape = std::vector<int>({10, 1}),
-        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
-        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({10, 1});
+    tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 5f96fecf93..03ac79e9ed 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
 
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({4, 1});
+    tensor.data = PaddleBuf(data, sizeof(data));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+        PaddleTensor tensor;
+        tensor.shape = std::vector<int>({4, 1});
+        tensor.data = PaddleBuf(data, sizeof(data));
+        tensor.dtype = PaddleDType::INT64;
+
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
         // 3. Run
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 0a2a2b713a..ddfe05a502 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
   file.close();
 
   // Inference.
-  PaddleTensor input{
-      .name = "xx",
-      .shape = record.shape,
-      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
-      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
 
   VLOG(3) << "run executor";
   std::vector<PaddleTensor> output;
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 2f8b4f8596..3342ee3c25 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -67,9 +67,9 @@ struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
-  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::vector<std::vector<uint64_t>> lod;  // lod data
 };
 
 enum class PaddleEngineKind {

From dcfbc6a6615ed63c2de78686d52c8fb847ad7c46 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Tue, 31 Jul 2018 14:48:33 +0800
Subject: [PATCH 40/40] inference analyzer as bin (#12450)

---
 .../fluid/inference/analysis/CMakeLists.txt   |  3 +
 paddle/fluid/inference/analysis/analyzer.cc   |  7 +++
 paddle/fluid/inference/analysis/analyzer.h    | 30 ++++-----
 .../fluid/inference/analysis/analyzer_main.cc | 33 ++++++++++
 .../inference/analysis/analyzer_tester.cc     |  8 ++-
 paddle/fluid/inference/analysis/argument.h    | 13 ++++
 .../analysis/data_flow_graph_tester.cc        |  4 +-
 .../data_flow_graph_to_fluid_pass_tester.cc   | 10 +--
 .../analysis/dfg_graphviz_draw_pass_tester.cc | 12 +++-
 .../analysis/fluid_to_data_flow_graph_pass.cc | 17 +++++-
 .../fluid_to_data_flow_graph_pass_tester.cc   |  3 +-
 paddle/fluid/inference/analysis/helper.h      | 15 +++++
 .../inference/analysis/model_store_pass.cc    | 61 +++++++++++++++++++
 .../inference/analysis/model_store_pass.h     | 51 ++++++++++++++++
 .../analysis/model_store_pass_tester.cc       | 43 +++++++++++++
 paddle/fluid/inference/analysis/pass.h        |  1 +
 .../inference/analysis/pass_manager_tester.cc |  7 ++-
 .../analysis/subgraph_splitter_tester.cc      |  8 +--
 ...tensorrt_subgraph_node_mark_pass_tester.cc |  6 +-
 .../analysis/tensorrt_subgraph_pass_tester.cc |  7 +--
 paddle/fluid/inference/analysis/ut_helper.h   | 21 +------
 .../api/api_tensorrt_subgraph_engine.cc       | 12 ++++
 22 files changed, 309 insertions(+), 63 deletions(-)
 create mode 100644 paddle/fluid/inference/analysis/analyzer_main.cc
 create mode 100644 paddle/fluid/inference/analysis/model_store_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/model_store_pass.h
 create mode 100644 paddle/fluid/inference/analysis/model_store_pass_tester.cc

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 67d355d10d..27fe575cb6 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
   tensorrt_subgraph_node_mark_pass.cc
   analyzer.cc
   helper.cc
+        model_store_pass.cc
   DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
 
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
+inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index b3a1075e5a..98bdfcc00b 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
+DEFINE_string(inference_analysis_output_storage_path, "",
+              "optimized model output path");
+
 namespace inference {
 namespace analysis {
 
@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
   }
 
   std::string repr() const override { return "dfg-pass-manager"; }
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 0132bf5b9c..c82fdfff86 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -16,28 +16,23 @@ limitations under the License. */
 
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize
- * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
- * control whether
- * an process is applied on the program.
+ * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
+ * multiple flags to
+ * control whether a process is applied on the program.
  *
  * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first
- * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
- * a data flow
- * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
- * graph to a
- * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
- * which take a
- * node or data flow graph as input.
+ * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
+ * Fluid ProgramDesc to
+ * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
+ * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
+ * pipeline can be any Passes
+ * which take a node or data flow graph as input.
  *
  * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to
- * pre-process the inference model and can be controlled by passing difference
- * command flags;
+ * can be used to pre-process the inference model and can be controlled by
+ * passing different command flags;
  * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the
- * inference service.
+ * phase in the inference service.
  */
 
 #include <gflags/gflags.h>
@@ -50,6 +45,7 @@ namespace paddle {
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+DECLARE_string(inference_analysis_output_storage_path);
 
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc
new file mode 100644
index 0000000000..5e1fe3eb79
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_main.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file implements the analyzer -- an executable helper to analyze and
+ * optimize a trained model.
+ */
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+// Entry point: parses the command-line flags, then runs Analyzer once.
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  using paddle::inference::analysis::Analyzer;
+  using paddle::inference::analysis::Argument;
+  // NOTE(review): `argument` is default-constructed, no model path is set.
+  Argument argument;
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+  return 0;
+}
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 25a440e7e7..24bfb3993c 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -20,14 +20,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, analysis_without_tensorrt) {
+TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
-TEST_F(DFG_Tester, analysis_with_tensorrt) {
+TEST(Analyzer, analysis_with_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 6d316f20bf..9e1c2e4586 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -36,6 +36,16 @@ namespace analysis {
  * All the fields should be registered here for clearness.
  */
 struct Argument {
+  Argument() = default;
+  explicit Argument(const std::string& fluid_model_dir)
+      : fluid_model_dir(new std::string(fluid_model_dir)) {}
+  // The directory of the trained model.
+  std::unique_ptr<std::string> fluid_model_dir;
+  // The path of `__model__` and `param`, this is used when the file name of
+  // model and param is changed.
+  std::unique_ptr<std::string> fluid_model_program_path;
+  std::unique_ptr<std::string> fluid_model_param_path;
+
   // The graph that process by the Passes or PassManagers.
   std::unique_ptr<DataFlowGraph> main_dfg;
 
@@ -44,6 +54,9 @@ struct Argument {
 
   // The processed program desc.
   std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+
+  // The output storage path of ModelStorePass.
+  std::unique_ptr<std::string> model_output_store_path;
 };
 
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 7912f8d7f1..a881262665 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -20,7 +20,7 @@ namespace inference {
 namespace analysis {
 
 TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
 }
 
 TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
   GraphTraits<DataFlowGraph> trait(&dfg);
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index d8fc5e580a..4ef381db29 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -26,21 +26,21 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Test) {
-  DataFlowGraph graph;
+TEST(DataFlowGraph, Test) {
+  Argument argument(FLAGS_inference_model_dir);
 
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
   ASSERT_TRUE(pass0.Initialize(&argument));
   ASSERT_TRUE(pass1.Initialize(&argument));
 
-  pass0.Run(&graph);
-  pass1.Run(&graph);
+  pass0.Run(argument.main_dfg.get());
+  pass1.Run(argument.main_dfg.get());
 
   pass0.Finalize();
   pass1.Finalize();
 
-  LOG(INFO) << graph.nodes.size();
+  LOG(INFO) << argument.main_dfg->nodes.size();
 }
 
 };  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 65842b1e85..928be79170 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -23,12 +23,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
+  Argument argument(FLAGS_inference_model_dir);
+  FluidToDataFlowGraphPass pass0;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  pass0.Run(argument.main_dfg.get());
+
+  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+
   DFG_GraphvizDrawPass::Config config("./", "test");
   DFG_GraphvizDrawPass pass(config);
   pass.Initialize(&argument);
-  pass.Run(&dfg);
+  pass.Run(argument.main_dfg.get());
 
   // test content
   std::ifstream file("./0-graph_test.dot");
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 88fdf8c9cb..511631d3e0 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <glog/logging.h>
 #include <string>
 #include <vector>
 
@@ -25,8 +26,20 @@ namespace analysis {
 
 bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
-  PADDLE_ENFORCE(argument);
+  if (argument->origin_program_desc) {
+    LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                    "duplicate called";
+  }
+  if (!argument->fluid_model_program_path) {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+    argument->fluid_model_program_path.reset(
+        new std::string(*argument->fluid_model_dir + "/__model__"));
+  }
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+  argument->origin_program_desc.reset(
+      new framework::proto::ProgramDesc(program));
+
   if (!argument->main_dfg) {
     argument->main_dfg.reset(new DataFlowGraph);
   }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index dadb84059d..d218dcd050 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -21,8 +21,9 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Init) {
+TEST(FluidToDataFlowGraphPass, Test) {
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   pass.Initialize(&argument);
   pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index f1064cd20f..a0f912b251 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
   }
 }
 
+// Loads and parses the ProgramDesc serialized in the file `model_path`.
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string &model_path) {
+  std::ifstream fin(model_path, std::ios::binary | std::ios::ate);  // at end
+  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  const std::streamoff size = fin.tellg();  // file size; -1 on failure
+  PADDLE_ENFORCE(size >= 0, "Cannot get size of file %s", model_path);
+  std::string buffer(static_cast<size_t>(size), ' ');
+  fin.seekg(0, std::ios::beg).read(&buffer[0], buffer.size());
+  framework::proto::ProgramDesc program_desc;
+  PADDLE_ENFORCE(program_desc.ParseFromString(buffer), "Cannot parse program");
+  return program_desc;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
new file mode 100644
index 0000000000..db7be3c0cd
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// Copies the model files from fluid_model_dir into model_output_store_path,
+// then writes the transformed program desc to `__model__` there.
+void ModelStorePass::Run(DataFlowGraph *x) {
+  // The `cp` below always reads fluid_model_dir, so require it up front.
+  PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
+  if (!argument_->fluid_model_param_path) {
+    argument_->fluid_model_param_path.reset(
+        new std::string(*argument_->fluid_model_dir + "/param"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
+  // Copy the model files with shell commands; NOTE this only works on linux.
+  std::stringstream ss;
+  ss << "mkdir -p " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+  ss.str("");
+  ss << "cp " << *argument_->fluid_model_dir << "/*"
+     << " " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+  // Store program
+  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
+                          "program desc is not transformed, should call "
+                          "DataFlowGraphToFluidPass first.");
+  const std::string program_output_path =
+      *argument_->model_output_store_path + "/__model__";
+  std::ofstream file(program_output_path, std::ios::binary);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
+                 program_output_path);
+  const std::string serialized_message =
+      argument_->transformed_program_desc->SerializeAsString();
+  file.write(serialized_message.c_str(), serialized_message.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
new file mode 100644
index 0000000000..713e8783ea
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines ModelStorePass, which stores the runtime DFG as a Paddle
+ * model on disk, so that the model can be reloaded for prediction.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// Pass that writes the analyzed model to disk; Run() is defined in
+// model_store_pass.cc.
+class ModelStorePass : public DataFlowGraphPass {
+ public:
+  // Records `argument` for the later Run(); logs an error and returns false
+  // when it is null.
+  bool Initialize(Argument* argument) override {
+    if (!argument) {
+      LOG(ERROR) << "invalid argument";
+      return false;
+    }
+    argument_ = argument;
+    return true;
+  }
+
+  void Run(DataFlowGraph* x) override;
+
+  std::string repr() const override { return "DFG-store-pass"; }
+  std::string description() const override {
+    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
+    model in the disk, and that model can be reloaded for prediction again.)DD";
+  }
+
+ private:
+  // Raw pointer set by Initialize(); null until Initialize() succeeds.
+  Argument* argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
new file mode 100644
index 0000000000..5f3526dd50
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "Model path");
+
+TEST(DFG_StorePass, test) {
+  Analyzer analyzer;
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(
+      new std::string("./_dfg_store_pass_tmp"));
+  // Disable storage in the analyzer.
+  FLAGS_inference_analysis_output_storage_path = "";
+  analyzer.Run(&argument);
+  // Run the store pass explicitly on the graph held by `argument`.
+  ModelStorePass pass;
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 6b4dbb3bb5..6806f9ff7d 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -50,6 +50,7 @@ class Pass {
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
 
+  virtual void Run() { LOG(FATAL) << "not valid"; }
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index dac1c509d7..13423e4837 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
   std::string description() const override { return "some doc"; }
 };
 
-TEST_F(DFG_Tester, DFG_pass_manager) {
+TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
 
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
 
+  Argument argument(FLAGS_inference_model_dir);
+
   ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }
 
-TEST_F(DFG_Tester, Node_pass_manager) {
+TEST(PassManager, Node_pass_manager) {
+  Argument argument(FLAGS_inference_model_dir);
   // Pre-process: initialize the DFG with the ProgramDesc first.
   FluidToDataFlowGraphPass pass0;
   pass0.Initialize(&argument);
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 67dd4da54b..39cc433b40 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
   return false;
 };
 
-TEST_F(DFG_Tester, Split) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Split) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
 
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }
 
-TEST_F(DFG_Tester, Fuse) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Fuse) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
 
   size_t count0 = dfg.nodes.size();
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
index a6c15e848b..c1d932878e 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+TEST(TensorRTSubgraphNodeMarkPass, test) {
   // init
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(pass.Initialize(&argument));
-  argument.main_dfg.reset(new DataFlowGraph);
   pass.Run(argument.main_dfg.get());
 
   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
   for (auto& node : argument.main_dfg->nodes.nodes()) {
     counter += node->attr(ATTR_supported_by_tensorrt).Bool();
   }
-
+  ASSERT_EQ(counter, 2);
   LOG(INFO) << counter << " nodes marked";
 }
 
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
index 1d749d3fa3..67a5af83d8 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -25,7 +25,7 @@ namespace analysis {
 
 DEFINE_string(dot_dir, "./", "");
 
-TEST_F(DFG_Tester, tensorrt_single_pass) {
+TEST(TensorRTSubGraphPass, main) {
   std::unordered_set<std::string> teller_set(
       {"elementwise_add", "mul", "sigmoid"});
   SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
     return false;
   };
 
-  LOG(INFO) << "init";
+  Argument argument(FLAGS_inference_model_dir);
+
   DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
   DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
 
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
   FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));
 
-  LOG(INFO) << "Initialize";
   dfg_pass.Initialize(&argument);
   dfg_pass1.Initialize(&argument);
   pass0.Initialize(&argument);
   trt_pass.Initialize(&argument);
 
-  LOG(INFO) << "Run";
   argument.main_dfg.reset(new DataFlowGraph);
   pass0.Run(argument.main_dfg.get());
   dfg_pass.Run(argument.main_dfg.get());
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index ce1191a567..1073a6f686 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
 namespace inference {
@@ -32,27 +32,12 @@ namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string& model_dir = FLAGS_inference_model_dir) {
-  std::string msg;
-  std::string net_file = FLAGS_inference_model_dir + "/__model__";
-  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
-  fin.seekg(0, std::ios::end);
-  msg.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(msg.at(0)), msg.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(msg);
-  return program_desc;
-}
-
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc& desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
   Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   pass.Initialize(&argument);
   pass.Run(&graph);
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
 class DFG_Tester : public ::testing::Test {
  protected:
   void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
     argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   }
 
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index c0891e9c28..45b5a7638b 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   void OptimizeInferenceProgram() {
     // Analyze inference_program
     Argument argument;
+    if (!config_.model_dir.empty()) {
+      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
+    } else {
+      PADDLE_ENFORCE(
+          !config_.param_file.empty(),
+          "Either model_dir or (param_file, prog_file) should be set.");
+      PADDLE_ENFORCE(!config_.prog_file.empty());
+      argument.fluid_model_program_path.reset(
+          new std::string(config_.prog_file));
+      argument.fluid_model_param_path.reset(
+          new std::string(config_.param_file));
+    }
     argument.origin_program_desc.reset(
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);