From f3463ecb6ee2b791c7ccd3eb64f7d317f9c30519 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Thu, 14 Feb 2019 16:19:02 +0800
Subject: [PATCH 01/10] refine pg execution

---
 .../fluid/framework/details/build_strategy.cc |  10 +-
 .../details/multi_devices_graph_pass.cc       |  54 +++++----
 .../details/multi_devices_graph_pass.h        |  16 ++-
 .../framework/details/multi_devices_helper.h  |  11 +-
 .../fluid/framework/details/op_handle_base.h  |   3 +
 .../details/parallel_ssa_graph_executor.cc    |  65 ++++++++++-
 .../details/parallel_ssa_graph_executor.h     |  11 ++
 .../details/threaded_ssa_graph_executor.cc    |   4 +-
 paddle/fluid/framework/ir/graph.h             |  26 +++--
 paddle/fluid/framework/ir/graph_helper.h      |   4 +-
 paddle/fluid/framework/parallel_executor.cc   |  81 +++++++------
 .../unittests/parallel_executor_test_base.py  |   3 +-
 .../unittests/test_parallel_executor_pg.py    | 107 ++++++++++++++++++
 13 files changed, 309 insertions(+), 86 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index ce5731a1f4..10855eacff 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -35,8 +35,8 @@ static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
   // them in multiple threads or processes to avoid hang.
   return (!strategy.enable_sequential_execution_ &&
-          strategy.num_trainers_ > 1) ||
-         strategy.enable_parallel_graph_;
+          strategy.num_trainers_ > 1) &&
+         !strategy.enable_parallel_graph_;
 }
 
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
@@ -106,7 +106,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
 
     // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
+    auto multi_devices_pass = AppendPass("multi_devices_check_pass");
+    multi_devices_pass->Set<bool>(kEnablePG,
+                                  new bool(strategy.enable_parallel_graph_));
 
     if (SeqOnlyAllReduceOps(strategy)) {
       AppendPass("all_reduce_deps_pass");
@@ -180,6 +182,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
                                                     &local_scopes);
       pass->Erase(kNRanks);
       pass->Set<size_t>(kNRanks, new size_t(nranks));
+      pass->Erase(kEnablePG);
+      pass->Set<bool>(kEnablePG, new bool(true));
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 75f922d2cc..dcceaa93d9 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -36,11 +36,6 @@ namespace framework {
 namespace details {
 
 namespace {
-// TODO(panyx0718): Clean this up as well.
-// all operators. NOTE that even we use a vector here, the operators is
-// unordered.
-typedef std::vector<OpHandleBase *> GraphOps;
-const char kGraphOps[] = "ops";
 
 bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) {
   return boost::get<int>(
@@ -206,7 +201,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
             auto &g_name = backward_vars[i + 1];
             VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
 
-            InsertCollectiveOp(&result, p_name, g_name);
+            InsertCollectiveOp(&result, node, p_name, g_name);
           }
         } catch (boost::bad_get e) {
         }
@@ -226,7 +221,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-  result.Erase(kGraphOps);
+  // result.Erase(kGraphOps);
   return graph;
 }
 
@@ -391,20 +386,34 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
 }
 
 void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
-    ir::Graph *result, const std::string &og) const {
+    ir::Graph *result, ir::Node *node, const std::string &og) const {
+  OpHandleBase *op_handle = nullptr;
+
+  auto append_allreduce_op = [&](
+      std::vector<Scope *> &scopes,
+      std::vector<platform::Place> &places) -> OpHandleBase * {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places, nccl_ctxs_));
 #else
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
+    return result->Get<GraphOps>(kGraphOps).back();
+  };
+
+  if (!strategy_.enable_parallel_graph_)
+    op_handle = append_allreduce_op(local_scopes_, places_);
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
+    auto p = places_[i];
+    std::vector<Scope *> ss{local_scopes_[i]};
+    std::vector<platform::Place> ps{p};
+    if (strategy_.enable_parallel_graph_)
+      op_handle = append_allreduce_op(ss, ps);
+
     SetCommunicationContext(op_handle, p);
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
@@ -501,13 +510,13 @@ bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
 }
 
 void AllReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, const std::string &p_name,
+    ir::Graph *result, ir::Node *node, const std::string &p_name,
     const std::string &g_name) const {
   if (IsSparseGradient(g_name)) {
     CreateReduceOp(result, g_name, 0);
     CreateBroadcastOp(result, g_name, 0);
   } else {
-    CreateAllReduceOp(result, g_name);
+    CreateAllReduceOp(result, node, g_name);
   }
 }
 
@@ -580,7 +589,7 @@ void ReduceSSAGraphBuilder::ResetState() const {
 }
 
 void ReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, const std::string &p_name,
+    ir::Graph *result, ir::Node *node, const std::string &p_name,
     const std::string &g_name) const {
   size_t cur_device_id = GetAppropriateDeviceID({g_name});
   CreateReduceOp(result, g_name, cur_device_id);
@@ -900,7 +909,7 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   return op_dev_id;
 }
 
-void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
+void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, ir::Node *node,
                                              const std::string &p_name,
                                              const std::string &g_name) const {
   size_t cur_device_id = 0;
@@ -915,7 +924,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
         CreateReduceOp(result, g_name, 0);
         CreateBroadcastOp(result, g_name, 0);
       } else {
-        CreateAllReduceOp(result, g_name);
+        CreateAllReduceOp(result, node, g_name);
       }
       break;
     default:
@@ -966,7 +975,8 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
       .RequirePassAttr(paddle::framework::details::kPlaces)                    \
       .RequirePassAttr(paddle::framework::details::kLocalScopes)               \
       .RequirePassAttr(paddle::framework::details::kStrategy)                  \
-      .RequirePassAttr(paddle::framework::details::kNRanks)
+      .RequirePassAttr(paddle::framework::details::kNRanks)                    \
+      .RequirePassAttr(paddle::framework::details::kEnablePG)
 
 REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
                             paddle::framework::details::ReduceSSAGraphBuilder);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 6d4386538e..e3c1fe711c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -36,6 +36,7 @@ constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kStrategy[] = "strategy";
 constexpr char kNRanks[] = "nranks";
+constexpr char kEnablePG[] = "enable_pg";
 
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected:
@@ -46,7 +47,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
+                                  const std::string &p_name,
                                   const std::string &g_name) const = 0;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
@@ -75,7 +77,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   bool IsSparseGradient(const std::string &og) const;
 
-  void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
+  void CreateAllReduceOp(ir::Graph *result, ir::Node *node,
+                         const std::string &og) const;
 
   void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
@@ -106,7 +109,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
 class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
  protected:
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
+                                  const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
@@ -135,7 +139,8 @@ class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
  protected:
   virtual void Init() const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
+                                  const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
@@ -164,7 +169,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 
   virtual void InsertPostprocessOps(ir::Graph *result) const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
+                                  const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual void ResetState() const;
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 1a2b75fbc0..5331b750eb 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -36,13 +36,20 @@ namespace details {
 // map from variable name to variables. The variables, who have the same name,
 // will have a differsent version. The offset in the
 // `std::vector<VarHandle*>` is the version of varaibles.
-typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
+typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<VarHandleBase*> GraphDepVars;
+typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
+
+// TODO(panyx0718): Clean this up as well.
+// All operators. NOTE that even though we use a vector here, the operators
+// are unordered.
+typedef std::vector<OpHandleBase *> GraphOps;
+const char kGraphOps[] = "ops";
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index b1a82e8771..e0aa352e95 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,9 @@ class OpHandleBase {
     auto it = dev_ctxes_.find(place);
     return it != dev_ctxes_.end() ? it->second : nullptr;
   }
+  const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
+    return dev_ctxes_;  // read-only view of the whole place -> context map
+  }
 
   void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
     dev_ctxes_[place] = ctx_;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 128aaa33a2..41bfe99cab 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,11 +13,74 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> graph) {
+  std::vector<std::unique_ptr<ir::Graph>> graphs;
+  graphs.reserve(places.size());
+  for (size_t i = 0; i < places.size(); ++i) {
+    ProgramDesc empty;
+    graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
+    auto &g = graphs.back();
+    g->Set(kGraphVars, new GraphVars(1UL));
+    g->Set(kGraphDepVars, new GraphDepVars);
+    g->Set(kGraphOps, new GraphOps);
+  }
+
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+    auto &dev_ctx = op->DeviceContext();
+    auto &p = dev_ctx.begin()->first;
+#ifdef PADDLE_WITH_CUDA
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+    auto &dev_ops = graphs[dev_id]->Get<GraphOps>(kGraphOps);
+    auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
+    dev_ops.emplace_back(op);
+    graphs[dev_id]->AddNode(graph->ReleaseNode(op->Node()).release());
+
+    for (auto &var : op->Inputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->ReleaseNode(var->Node()).release());
+      }
+    }
+    for (auto &var : op->Outputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->ReleaseNode(var->Node()).release());
+      }
+    }
+#else
+    PADDLE_THROW("Parallel Graph Execution only support CUDAPlace.");
+#endif
+  }
+
+  for (size_t dev_id = 0; dev_id < places.size(); ++dev_id) {
+    auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
+    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
+    for (auto &name_pair : origin_vars) {
+      dev_vars.emplace(name_pair.first, name_pair.second);
+      for (auto &version_pair : name_pair.second) {
+        if (graph->Nodes().count(version_pair->Node())) {
+          graphs[dev_id]->AddNode(
+              graph->ReleaseNode(version_pair->Node()).release());
+        }
+      }
+    }
+  }
+
+  return graphs;
+}
+
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
@@ -37,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
           << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
+        strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
   }
 }
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index c00c5bc2d1..e3abd23753 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -14,16 +14,24 @@
 
 #pragma once
 
+#include <fstream>
+#include <sstream>
 #include <string>
 #include <vector>
 
 #include "ThreadPool.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> graph);
+
 class ParallelSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
@@ -31,11 +39,14 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
                            const std::vector<platform::Place> &places,
                            std::vector<std::unique_ptr<ir::Graph>> &&graphs);
   ~ParallelSSAGraphExecutor() final = default;
+
   const ir::Graph &Graph() const override { return *graphs_[0]; }
 
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
  private:
+  // std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph();
+
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 677a293794..c0edad6f74 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -56,10 +56,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
       }
     }
   }
+
   for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
     InsertPendingVar(&pending_vars, ready_vars.get(), var);
   }
-
   for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op);
@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(10) << op << " " << op->Name() << " Signal posted";
     } catch (...) {
       exception_holder_.Catch(std::current_exception());
     }
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 8bb3c27bdd..07cbfc74ff 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -167,6 +167,14 @@ class Graph {
     return ret;
   }
 
+  // Detaches `node` from this graph and hands its ownership to the caller.
+  std::unique_ptr<ir::Node> ReleaseNode(ir::Node *node) {
+    std::unique_ptr<ir::Node> detached(nodes_.at(node).release());
+    node_set_.erase(node);
+    nodes_.erase(node);
+    return detached;
+  }
+
   void RemoveNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
     node_set_.erase(node);
@@ -183,13 +191,6 @@ class Graph {
     return nullptr;
   }
 
-  void ResolveHazard(
-      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
-
- private:
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
@@ -198,6 +199,17 @@ class Graph {
     return node;
   }
 
+  bool ContainNode(ir::Node *node) {  // true iff `node` is in node_set_
+    return node_set_.count(node) != 0;
+  }
+
+  void ResolveHazard(
+      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+
+ private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
   // NOTE: program_ shouldn't be exposed to user.
   const ProgramDesc program_;
   std::map<std::string, boost::any> attrs_;
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index fba4936f2c..726cf8ec52 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -59,7 +59,9 @@ template <typename T>
 std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
   std::vector<T *> ret;
   for (ir::Node *n : graph.Nodes()) {
-    if (n->IsWrappedBy<T>()) ret.push_back(&n->Wrapper<T>());
+    if (n->IsWrappedBy<T>()) {
+      ret.push_back(&n->Wrapper<T>());
+    }
   }
   return ret;
 }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f61c9e3a91..abe241ed22 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -201,7 +202,6 @@ ParallelExecutor::ParallelExecutor(
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
-
   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
                    "If you set build_strategy.reduce with 'Reduce',"
@@ -229,9 +229,10 @@ ParallelExecutor::ParallelExecutor(
   // choice the execution strategy.
   build_strategy.enable_parallel_graph_ =
       EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
-  VLOG(1) << "Enable ParallelGraph Execution: "
-          << build_strategy.enable_parallel_graph_;
+  if (build_strategy.enable_parallel_graph_)
+    VLOG(0) << "The Executor would execute the graph by ParallelGraph "
+               "Execution which can get better performance,"
+            << "you can force it off by env FLAGS_enable_parallel_graph=0";
 
   if (member_->use_cuda_) {
 // Bcast Parameters to all GPUs
@@ -265,58 +266,42 @@ ParallelExecutor::ParallelExecutor(
 
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
-  std::vector<std::unique_ptr<ir::Graph>> graphs;
+  std::unique_ptr<ir::Graph> graph;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.enable_parallel_graph_) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-          main_program, {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
-          member_->nccl_ctxs_.get());
-      graphs.push_back(std::move(graph));
-    }
-  } else {
-    std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-        main_program, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
-    graphs.push_back(std::move(graph));
-  }
+  graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_, member_->nccl_ctxs_.get());
 #else
-  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, member_->local_scopes_,
-      member_->nranks_, member_->use_cuda_);
-  graphs.push_back(std::move(graph));
+  graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_);
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
   if (max_memory_size >= 0) {
-    for (size_t i = 0; i < graphs.size(); ++i) {
-      graphs[i] = member_->PrepareGCAndRefCnts(
-          std::move(graphs[i]), static_cast<size_t>(max_memory_size));
-    }
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
   //         skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
-  for (auto &graph : graphs) {
-    for (auto &node : graph->Nodes()) {
-      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-        var_infos.emplace_back();
-        var_infos.back().name_ = node->Var()->Name();
-        var_infos.back().type_ = node->Var()->GetType();
-        var_infos.back().persistable_ = node->Var()->Persistable();
-      }
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
     }
   }
 
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    size_t graph_num = ir::GraphNum(*graphs[0]);
+    size_t graph_num = ir::GraphNum(*graph);
     if (graph_num > 1) {
       LOG(WARNING)
           << "The number of graph should be only one, "
              "but the current graph has "
-          << ir::GraphNum(*graphs[0])
+          << ir::GraphNum(*graph)
           << " sub_graphs. If you want to see the nodes of the "
              "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
              "to specify the output dir. NOTES: if you not do training, "
@@ -325,18 +310,30 @@ ParallelExecutor::ParallelExecutor(
   }
 
   if (build_strategy.enable_parallel_graph_) {
+    auto parallel_graph =
+        details::SeparateMultiDevicesGraph(member_->places_, std::move(graph));
+    auto seq_allreduce_pass =
+        ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
+    seq_allreduce_pass->Erase(details::kAllOpDescs);
+    seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
+        details::kAllOpDescs,
+        new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
+    for (size_t i = 0; i < parallel_graph.size(); ++i) {
+      parallel_graph[i] =
+          seq_allreduce_pass->Apply(std::move(parallel_graph[i]));
+    }
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graphs)));
+        std::move(parallel_graph)));
   } else {
     if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
       member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     } else {
       member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     }
   }
 
@@ -487,8 +484,8 @@ bool ParallelExecutor::EnableParallelGraphExecution(
     }
   }
 
-  if (!member_->use_all_reduce_ || !member_->use_cuda_)
-    enable_parallel_graph = false;
+  // if (!member_->use_all_reduce_ || !member_->use_cuda_)
+  if (!member_->use_all_reduce_) enable_parallel_graph = false;
 
   if (build_strategy.enable_sequential_execution_ ||
       exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index fdacd241f9..f14094a7b3 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -72,6 +72,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.allow_op_delay = allow_op_delay
+            exec_strategy.num_threads = 1
             if use_fast_executor:
                 exec_strategy.use_experimental_executor = True
             build_strategy = fluid.BuildStrategy()
@@ -99,7 +100,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             first_loss, = run_executor(
                 exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
 
-            for i in range(iter):
+            for _ in range(iter):
                 run_executor(
                     exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
new file mode 100644
index 0000000000..041c56fce1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+import os
+# NOTE: set before paddle.fluid is imported; presumably the flag is read at
+# import time, so keep this ordering.
+os.environ['FLAGS_enable_parallel_graph'] = str(1)
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+def simple_fc_net(use_feed):
+    """Build four tanh fc layers (size 200) and a 10-way softmax fc layer.
+
+    `use_feed` is not used by this network. Returns the mean cross-entropy
+    loss variable.
+    """
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    """Runs the simple fc network with FLAGS_enable_parallel_graph set."""
+
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _init_data(self):
+        """Return a seeded random 32x784 float32 batch and all-ones labels."""
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    # simple_fc
+    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        """Run check_network_convergence on simple_fc_net.
+
+        Returns without checking when CUDA is requested but not compiled in.
+        """
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=use_reduce)
+
+    def test_simple_fc(self):
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
+        """Compare first/last losses with and without the parallel executor.
+
+        Both runs use seed=1 and the same feed; the losses must agree within
+        1e-6. Returns without checking when CUDA is requested but not compiled
+        in.
+        """
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=True)
+
+        # assertAlmostEquals is a deprecated alias (removed in Python 3.12).
+        self.assertAlmostEqual(
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
+        self.assertAlmostEqual(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(True)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 73005ee00dc54eff7218e1c853bdf2eb0c053723 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Thu, 14 Feb 2019 16:37:35 +0800
Subject: [PATCH 02/10] cleanup code test=develop

---
 .../fluid/framework/details/build_strategy.cc   |  4 ----
 .../details/multi_devices_graph_pass.cc         | 17 ++++++++---------
 .../details/multi_devices_graph_pass.h          | 16 +++++-----------
 .../details/parallel_ssa_graph_executor.h       |  2 --
 .../details/threaded_ssa_graph_executor.cc      |  2 +-
 paddle/fluid/framework/ir/graph.h               | 10 ----------
 paddle/fluid/framework/ir/graph_helper.h        |  4 +---
 paddle/fluid/framework/parallel_executor.cc     |  9 ++++-----
 8 files changed, 19 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index ae17b8df75..7d2a081e3b 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -119,8 +119,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 
     // Verify that the graph is correct for multi-device executor.
     auto multi_devices_pass = AppendPass("multi_devices_check_pass");
-    multi_devices_pass->Set<bool>(kEnablePG,
-                                  new bool(strategy.enable_parallel_graph_));
 
     if (SeqOnlyAllReduceOps(strategy)) {
       AppendPass("all_reduce_deps_pass");
@@ -194,8 +192,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
                                                     &local_scopes);
       pass->Erase(kNRanks);
       pass->Set<size_t>(kNRanks, new size_t(nranks));
-      pass->Erase(kEnablePG);
-      pass->Set<bool>(kEnablePG, new bool(true));
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index dcceaa93d9..4f856c6d9e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -201,7 +201,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
             auto &g_name = backward_vars[i + 1];
             VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
 
-            InsertCollectiveOp(&result, node, p_name, g_name);
+            InsertCollectiveOp(&result, p_name, g_name);
           }
         } catch (boost::bad_get e) {
         }
@@ -386,7 +386,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
 }
 
 void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
-    ir::Graph *result, ir::Node *node, const std::string &og) const {
+    ir::Graph *result, const std::string &og) const {
   OpHandleBase *op_handle = nullptr;
 
   auto append_allreduce_op = [&](
@@ -510,13 +510,13 @@ bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
 }
 
 void AllReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, ir::Node *node, const std::string &p_name,
+    ir::Graph *result, const std::string &p_name,
     const std::string &g_name) const {
   if (IsSparseGradient(g_name)) {
     CreateReduceOp(result, g_name, 0);
     CreateBroadcastOp(result, g_name, 0);
   } else {
-    CreateAllReduceOp(result, node, g_name);
+    CreateAllReduceOp(result, g_name);
   }
 }
 
@@ -589,7 +589,7 @@ void ReduceSSAGraphBuilder::ResetState() const {
 }
 
 void ReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, ir::Node *node, const std::string &p_name,
+    ir::Graph *result, const std::string &p_name,
     const std::string &g_name) const {
   size_t cur_device_id = GetAppropriateDeviceID({g_name});
   CreateReduceOp(result, g_name, cur_device_id);
@@ -909,7 +909,7 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   return op_dev_id;
 }
 
-void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, ir::Node *node,
+void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
                                              const std::string &p_name,
                                              const std::string &g_name) const {
   size_t cur_device_id = 0;
@@ -924,7 +924,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, ir::Node *node,
         CreateReduceOp(result, g_name, 0);
         CreateBroadcastOp(result, g_name, 0);
       } else {
-        CreateAllReduceOp(result, node, g_name);
+        CreateAllReduceOp(result, g_name);
       }
       break;
     default:
@@ -975,8 +975,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
       .RequirePassAttr(paddle::framework::details::kPlaces)                    \
       .RequirePassAttr(paddle::framework::details::kLocalScopes)               \
       .RequirePassAttr(paddle::framework::details::kStrategy)                  \
-      .RequirePassAttr(paddle::framework::details::kNRanks)                    \
-      .RequirePassAttr(paddle::framework::details::kEnablePG)
+      .RequirePassAttr(paddle::framework::details::kNRanks)
 
 REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
                             paddle::framework::details::ReduceSSAGraphBuilder);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index e3c1fe711c..6d4386538e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -36,7 +36,6 @@ constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kStrategy[] = "strategy";
 constexpr char kNRanks[] = "nranks";
-constexpr char kEnablePG[] = "enable_pg";
 
 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected:
@@ -47,8 +46,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
-                                  const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const = 0;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
@@ -77,8 +75,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
   bool IsSparseGradient(const std::string &og) const;
 
-  void CreateAllReduceOp(ir::Graph *result, ir::Node *node,
-                         const std::string &og) const;
+  void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
 
   void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
@@ -109,8 +106,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 
 class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
  protected:
-  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
-                                  const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
@@ -139,8 +135,7 @@ class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
  protected:
   virtual void Init() const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
-                                  const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
@@ -169,8 +164,7 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 
   virtual void InsertPostprocessOps(ir::Graph *result) const;
 
-  virtual void InsertCollectiveOp(ir::Graph *result, ir::Node *node,
-                                  const std::string &p_name,
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                   const std::string &g_name) const;
 
   virtual void ResetState() const;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index e3abd23753..c31bba17f6 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -45,8 +45,6 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
  private:
-  // std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph();
-
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c0edad6f74..5bf414324f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -56,10 +56,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
       }
     }
   }
-
   for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
     InsertPendingVar(&pending_vars, ready_vars.get(), var);
   }
+
   for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op);
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 0d66043a73..40baae2ffd 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -176,12 +176,6 @@ class Graph {
     return ret;
   }
 
-  void RemoveNode(ir::Node *node) {
-    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-    node_set_.erase(node);
-    nodes_.erase(node);
-  }
-
   // NOTE low performance, but simple and secure.
   Node *RetrieveNode(int id) {
     for (auto &node : nodes_) {
@@ -200,10 +194,6 @@ class Graph {
     return node;
   }
 
-  bool ContainNode(ir::Node *node) {
-    return node_set_.find(node) != node_set_.end();
-  }
-
   void ResolveHazard(
       const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
 
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 3b95aa7b86..214de9ec7d 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -64,9 +64,7 @@ template <typename T>
 std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
   std::vector<T *> ret;
   for (ir::Node *n : graph.Nodes()) {
-    if (n->IsWrappedBy<T>()) {
-      ret.push_back(&n->Wrapper<T>());
-    }
+    if (n->IsWrappedBy<T>()) ret.push_back(&n->Wrapper<T>());
   }
   return ret;
 }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 91d1a99886..dca1a4e530 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -478,12 +478,11 @@ bool ParallelExecutor::EnableParallelGraphExecution(
     }
   }
 
-  // if (!member_->use_all_reduce_ || !member_->use_cuda_)
-  if (!member_->use_all_reduce_) enable_parallel_graph = false;
-
+  if (!member_->use_all_reduce_ || !member_->use_cuda_)
+    enable_parallel_graph = false;
   if (build_strategy.enable_sequential_execution_ ||
       exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
     enable_parallel_graph = false;
   return enable_parallel_graph;
 }
 

From ecdd1166b80627b652b948d6b8b317307ce0afb0 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Thu, 14 Feb 2019 16:44:09 +0800
Subject: [PATCH 03/10] cleanup code test=develop

---
 .../framework/details/parallel_ssa_graph_executor.cc      | 8 ++++----
 paddle/fluid/framework/ir/graph.h                         | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index a7cb9adbbf..77a3318ff9 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -41,14 +41,14 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
     auto &dev_ops = graphs[dev_id]->Get<GraphOps>(kGraphOps);
     auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
     dev_ops.emplace_back(op);
-    graphs[dev_id]->AddNode(graph->ReleaseNode(op->Node()).release());
+    graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
 
     for (auto &var : op->Inputs()) {
       auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
       if (dummy_ptr) {
         dev_dummys.insert(var);
         if (graph->Nodes().count(var->Node()))
-          graphs[dev_id]->AddNode(graph->ReleaseNode(var->Node()).release());
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
       }
     }
     for (auto &var : op->Outputs()) {
@@ -56,7 +56,7 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
       if (dummy_ptr) {
         dev_dummys.insert(var);
         if (graph->Nodes().count(var->Node()))
-          graphs[dev_id]->AddNode(graph->ReleaseNode(var->Node()).release());
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
       }
     }
 #else
@@ -72,7 +72,7 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
       for (auto &version_pair : name_pair.second) {
         if (graph->Nodes().count(version_pair->Node())) {
           graphs[dev_id]->AddNode(
-              graph->ReleaseNode(version_pair->Node()).release());
+              graph->RemoveNode(version_pair->Node()).release());
         }
       }
     }
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 40baae2ffd..b55a774513 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -168,7 +168,8 @@ class Graph {
     return ret;
   }
 
-  std::unique_ptr<ir::Node> ReleaseNode(ir::Node *node) {
+  std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
     std::unique_ptr<ir::Node> ret;
     ret.reset(nodes_.at(node).release());
     nodes_.erase(node);

From bd0d44af2409c9900706fb5eb50c2c713a7fd083 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Thu, 14 Feb 2019 17:51:34 +0800
Subject: [PATCH 04/10] fix build failed test=develop

---
 paddle/fluid/framework/details/all_reduce_deps_pass.cc | 2 --
 paddle/fluid/framework/details/all_reduce_deps_pass.h  | 2 ++
 paddle/fluid/framework/parallel_executor.cc            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index b7d6edd389..2e20c436df 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -30,8 +30,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-static constexpr char kAllOpDescs[] = "all_op_descs";
-
 VarHandle* GetValidInput(const OpHandleBase* a) {
   for (auto p : a->Inputs()) {
     VarHandle* b = dynamic_cast<VarHandle*>(p);
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
index e8b9108981..1637c7a7a6 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h
@@ -21,6 +21,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kAllOpDescs[] = "all_op_descs";
+
 // TODO(gongwb): overlap allreduce with backward computation.
 class AllReduceDepsPass : public ir::Pass {
  protected:
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index dca1a4e530..21f2e1ee3e 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -21,12 +21,12 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
 
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 

From 7cd6de37f57d05c967d829844bc819dd69ce278b Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Thu, 14 Feb 2019 18:29:12 +0800
Subject: [PATCH 05/10] fix cpu test=develop

---
 .../fluid/framework/details/parallel_ssa_graph_executor.cc   | 4 ----
 paddle/fluid/framework/parallel_executor.cc                  | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 77a3318ff9..3433c3424e 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -36,7 +36,6 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
   for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     auto &dev_ctx = op->DeviceContext();
     auto &p = dev_ctx.begin()->first;
-#ifdef PADDLE_WITH_CUDA
     int dev_id = boost::get<platform::CUDAPlace>(p).device;
     auto &dev_ops = graphs[dev_id]->Get<GraphOps>(kGraphOps);
     auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
@@ -59,9 +58,6 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
           graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
       }
     }
-#else
-    PADDLE_THROW("Parallel Graph Execution only support CUDAPlace.");
-#endif
   }
 
   for (size_t dev_id = 0; dev_id < places.size(); ++dev_id) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 21f2e1ee3e..dbe1bf9b29 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -304,6 +304,7 @@ ParallelExecutor::ParallelExecutor(
   }
 
   if (build_strategy.enable_parallel_graph_) {
+#ifdef PADDLE_WITH_CUDA
     auto parallel_graph =
         details::SeparateMultiDevicesGraph(member_->places_, std::move(graph));
     auto seq_allreduce_pass =
@@ -319,6 +320,10 @@ ParallelExecutor::ParallelExecutor(
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, member_->places_,
         std::move(parallel_graph)));
+#else
+    PADDLE_THROW(
+        "Paddle should be compiled with CUDA for ParallelGraph Execution.");
+#endif
   } else {
     if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
       member_->executor_.reset(new details::ThreadedSSAGraphExecutor(

From 642fd68ce0e4c71e0a5e9fd4417769a9e98ee8b7 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Mon, 18 Feb 2019 14:44:25 +0800
Subject: [PATCH 06/10] update by comment test=develop

---
 .../framework/details/all_reduce_deps_pass.h  |  2 --
 .../details/memory_optimize_helper.h          |  2 --
 .../details/multi_devices_graph_pass.cc       |  1 -
 .../details/parallel_ssa_graph_executor.cc    | 28 +++++++++++++------
 .../details/parallel_ssa_graph_executor.h     | 11 ++++----
 paddle/fluid/framework/ir/graph.h             |  5 ++++
 paddle/fluid/framework/parallel_executor.cc   | 18 +++---------
 7 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
index 1637c7a7a6..e8b9108981 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h
@@ -21,8 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 // TODO(gongwb): overlap allreduce with backward computation.
 class AllReduceDepsPass : public ir::Pass {
  protected:
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 0bfaf827fe..2c9a16d445 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -29,8 +29,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
 
 // NOTE(dzh): A ordered set for node reuse in memory optimize.
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 4f856c6d9e..27bc771814 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -221,7 +221,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-  // result.Erase(kGraphOps);
   return graph;
 }
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 3433c3424e..2cafa1873a 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -19,12 +19,12 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> graph) {
+std::vector<std::unique_ptr<ir::Graph>>
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
+    std::unique_ptr<ir::Graph> &&graph) {
   std::vector<std::unique_ptr<ir::Graph>> graphs;
-  graphs.reserve(places.size());
-  for (size_t i = 0; i < places.size(); ++i) {
+  graphs.reserve(places_.size());
+  for (size_t i = 0; i < places_.size(); ++i) {
     ProgramDesc empty;
     graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
     auto &g = graphs.back();
@@ -60,7 +60,7 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
     }
   }
 
-  for (size_t dev_id = 0; dev_id < places.size(); ++dev_id) {
+  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
     auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
     auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
     for (auto &name_pair : origin_vars) {
@@ -80,14 +80,26 @@ std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::vector<std::unique_ptr<ir::Graph>> &&graphs)
+    const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
-      graphs_(std::move(graphs)) {
+      main_prog_(main_prog),
+      // TODO(Yancey1989): copy graphs is not safely since it deleted the attrs.
+      graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 
+  auto seq_allreduce_pass =
+      ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
+  seq_allreduce_pass->Erase(details::kAllOpDescs);
+  seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
+      details::kAllOpDescs,
+      new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
+  for (size_t i = 0; i < graphs_.size(); ++i) {
+    graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
+  }
+
   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
                                ? 1UL
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index c31bba17f6..f59305bf98 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -28,16 +28,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> graph);
-
 class ParallelSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::vector<std::unique_ptr<ir::Graph>> &&graphs);
+                           const framework::ProgramDesc &main_prog,
+                           std::unique_ptr<ir::Graph> &&graph);
   ~ParallelSSAGraphExecutor() final = default;
 
   const ir::Graph &Graph() const override { return *graphs_[0]; }
@@ -45,10 +42,14 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
  private:
+  std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+      std::unique_ptr<ir::Graph> &&graph);
+
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
   std::vector<platform::Place> places_;
+  framework::ProgramDesc main_prog_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;
 
   std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b55a774513..d5b3782f62 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,6 +26,11 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+namespace details {
+constexpr char kAllOpDescs[] = "all_op_descs";
+}  //  namespace details
+
 namespace ir {
 
 /*
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index dbe1bf9b29..56da566009 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -305,21 +305,11 @@ ParallelExecutor::ParallelExecutor(
 
   if (build_strategy.enable_parallel_graph_) {
 #ifdef PADDLE_WITH_CUDA
-    auto parallel_graph =
-        details::SeparateMultiDevicesGraph(member_->places_, std::move(graph));
-    auto seq_allreduce_pass =
-        ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
-    seq_allreduce_pass->Erase(details::kAllOpDescs);
-    seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
-        details::kAllOpDescs,
-        new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
-    for (size_t i = 0; i < parallel_graph.size(); ++i) {
-      parallel_graph[i] =
-          seq_allreduce_pass->Apply(std::move(parallel_graph[i]));
-    }
+    // TODO(Yancey1989): Remove passing in the main_program when
+    // allreduce_seq_pass doesn't need it as the attr.
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(parallel_graph)));
+        exec_strategy, member_->local_scopes_, member_->places_, main_program,
+        std::move(graph)));
 #else
     PADDLE_THROW(
         "Paddle should be compiled with CUDA for ParallelGraph Execution.");

From 5677c9d4eed6b7d591e214b980354d18bb1c4c87 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Mon, 18 Feb 2019 14:45:39 +0800
Subject: [PATCH 07/10] update comment test=develop

---
 paddle/fluid/framework/details/parallel_ssa_graph_executor.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 2cafa1873a..c36618016b 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -86,7 +86,8 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
       main_prog_(main_prog),
-      // TODO(Yancey1989): copy graphs is not safely since it deleted the attrs.
+      // TODO(Yancey1989): Copying graphs is not safe since it deletes the
+      // attrs.
       graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 

From 0f8bd73cc9d23ba1bf2fc9b15bae74450daee0d5 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Mon, 18 Feb 2019 14:51:47 +0800
Subject: [PATCH 08/10] cleanup code test=develop

---
 paddle/fluid/framework/details/build_strategy.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 7d2a081e3b..45c2c73415 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -34,6 +34,8 @@ namespace details {
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
   // them in multiple threads or processes to avoid hang.
+  // NOTE: ParallelExecutor would execute this pass on each graph, so
+  // don't need to append it here.
   return (!strategy.enable_sequential_execution_ &&
           strategy.num_trainers_ > 1) &&
          !strategy.enable_parallel_graph_;
@@ -118,7 +120,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
 
     // Verify that the graph is correct for multi-device executor.
-    auto multi_devices_pass = AppendPass("multi_devices_check_pass");
+    AppendPass("multi_devices_check_pass");
 
     if (SeqOnlyAllReduceOps(strategy)) {
       AppendPass("all_reduce_deps_pass");

From d5090c892d609bf1d394d3c755cc4bafb80ba6f7 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Tue, 19 Feb 2019 15:22:25 +0800
Subject: [PATCH 09/10] polish code test=develop

---
 paddle/fluid/framework/details/build_strategy.cc |  2 +-
 .../details/multi_devices_graph_pass.cc          | 16 +++++++---------
 .../details/parallel_ssa_graph_executor.cc       |  3 ++-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 45c2c73415..3a5e41ef3c 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -34,7 +34,7 @@ namespace details {
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
   // them in multiple threads or processes to avoid hang.
-  // NOTE: ParallelExecutor would execute this pass on each graph, so
+  // NOTE: ParallelGraph executes this pass on each graph, so we
   // don't need to append it here.
   return (!strategy.enable_sequential_execution_ &&
           strategy.num_trainers_ > 1) &&
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 27bc771814..3c0a8d7020 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -389,8 +389,8 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
   OpHandleBase *op_handle = nullptr;
 
   auto append_allreduce_op = [&](
-      std::vector<Scope *> &scopes,
-      std::vector<platform::Place> &places) -> OpHandleBase * {
+      const std::vector<Scope *> &scopes,
+      const std::vector<platform::Place> &places) -> OpHandleBase * {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -407,13 +407,11 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
     op_handle = append_allreduce_op(local_scopes_, places_);
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto p = places_[i];
-    std::vector<Scope *> ss{local_scopes_[i]};
-    std::vector<platform::Place> ps{p};
-    if (strategy_.enable_parallel_graph_)
-      op_handle = append_allreduce_op(ss, ps);
+    if (strategy_.enable_parallel_graph_) {
+      op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
+    }
 
-    SetCommunicationContext(op_handle, p);
+    SetCommunicationContext(op_handle, places_[i]);
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
@@ -421,7 +419,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
 
     auto var =
         new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
-                      vars.size(), i, og, p);
+                      vars.size(), i, og, places_[i]);
     vars.emplace_back(var);
     op_handle->AddOutput(var);
   }
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index c36618016b..3740b795fa 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -32,8 +32,9 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
     g->Set(kGraphDepVars, new GraphDepVars);
     g->Set(kGraphOps, new GraphOps);
   }
+  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
 
-  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+  for (auto &op : op_handles) {
     auto &dev_ctx = op->DeviceContext();
     auto &p = dev_ctx.begin()->first;
     int dev_id = boost::get<platform::CUDAPlace>(p).device;

From 4b193db14c4862569c345e4cf7970418dbf01073 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yanxu05@baidu.com>
Date: Tue, 19 Feb 2019 17:17:36 +0800
Subject: [PATCH 10/10] polish code test=develop

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc  | 6 ++++++
 paddle/fluid/framework/details/multi_devices_helper.h       | 6 ------
 .../fluid/framework/details/parallel_ssa_graph_executor.cc  | 3 ---
 .../fluid/framework/details/parallel_ssa_graph_executor.h   | 2 --
 paddle/fluid/framework/ir/graph.h                           | 3 +++
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 3c0a8d7020..7d1e63f368 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -36,6 +36,11 @@ namespace framework {
 namespace details {
 
 namespace {
+// TODO(panyx0718): Clean this up as well.
+// All operators. NOTE that even though we use a vector here, the operators are
+// unordered.
+typedef std::vector<OpHandleBase *> GraphOps;
+const char kGraphOps[] = "ops";
 
 bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) {
   return boost::get<int>(
@@ -221,6 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
+  result.Erase(kGraphOps);
   return graph;
 }
 
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 5331b750eb..9afbb91005 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -44,12 +44,6 @@ const char kGraphVars[] = "vars";
 typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
 
-// TODO(panyx0718): Clean this up as well.
-// all operators. NOTE that even we use a vector here, the operators is
-// unordered.
-typedef std::vector<OpHandleBase *> GraphOps;
-const char kGraphOps[] = "ops";
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 3740b795fa..4c8f69c68c 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -30,7 +30,6 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
     auto &g = graphs.back();
     g->Set(kGraphVars, new GraphVars(1UL));
     g->Set(kGraphDepVars, new GraphDepVars);
-    g->Set(kGraphOps, new GraphOps);
   }
   auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
 
@@ -38,9 +37,7 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
     auto &dev_ctx = op->DeviceContext();
     auto &p = dev_ctx.begin()->first;
     int dev_id = boost::get<platform::CUDAPlace>(p).device;
-    auto &dev_ops = graphs[dev_id]->Get<GraphOps>(kGraphOps);
     auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
-    dev_ops.emplace_back(op);
     graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
 
     for (auto &var : op->Inputs()) {
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index f59305bf98..1c35d45fdd 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -14,8 +14,6 @@
 
 #pragma once
 
-#include <fstream>
-#include <sstream>
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index d5b3782f62..296f3b8396 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -28,6 +28,9 @@ namespace paddle {
 namespace framework {
 
 namespace details {
+
+// This attr is not recommended, because the graph should not depend on
+// the program once it is built.
 constexpr char kAllOpDescs[] = "all_op_descs";
 }  //  namespace details