@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
@@ -194,7 +195,6 @@ ParallelExecutor::ParallelExecutor(
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
-
   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
                    "If you set build_strategy.reduce with 'Reduce',"
@@ -222,9 +222,10 @@ ParallelExecutor::ParallelExecutor(
   // choice the execution strategy.
   build_strategy.enable_parallel_graph_ =
       EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
-  VLOG(1) << "Enable ParallelGraph Execution: "
-          << build_strategy.enable_parallel_graph_;
+  if (build_strategy.enable_parallel_graph_)
+    VLOG(0) << "The Executor would execute the graph by ParallelGraph "
+               "Execution which can get better performance,"
+            << "you can force it off by env FLAGS_enable_parallel_graph=0";
 
   if (member_->use_cuda_) {
     // Bcast Parameters to all GPUs
@@ -258,22 +259,11 @@ ParallelExecutor::ParallelExecutor(
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
   std::vector<std::unique_ptr<ir::Graph>> graphs;
+  std::unique_ptr<ir::Graph> graph;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.enable_parallel_graph_) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-          main_program, {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
-          member_->nccl_ctxs_.get());
-      graphs.push_back(std::move(graph));
-    }
-  } else {
-    std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-        main_program, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
-    graphs.push_back(std::move(graph));
-  }
+  graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_, member_->nccl_ctxs_.get());
 #else
   if (build_strategy.async_mode_ && !build_strategy.is_distribution_) {
     for (size_t i = 0; i < member_->places_.size(); ++i) {
@@ -293,40 +283,37 @@ ParallelExecutor::ParallelExecutor(
   VLOG(10) << "Eager Deletion Threshold "
            << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
-    for (size_t i = 0; i < graphs.size(); ++i) {
-      graphs[i] = member_->PrepareGCAndRefCnts(
-          std::move(graphs[i]), static_cast<size_t>(max_memory_size));
-    }
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
   // skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
-  for (auto &graph : graphs) {
-    for (auto &node : graph->Nodes()) {
-      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-        var_infos.emplace_back();
-        var_infos.back().name_ = node->Var()->Name();
-        var_infos.back().type_ = node->Var()->GetType();
-        var_infos.back().persistable_ = node->Var()->Persistable();
-      }
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
+    }
     }
   }
 
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    size_t graph_num = ir::GraphNum(*graphs[0]);
+    size_t graph_num = ir::GraphNum(*graph);
     if (graph_num > 1) {
       LOG(WARNING)
           << "The number of graph should be only one, "
              "but the current graph has "
-          << ir::GraphNum(*graphs[0])
+          << ir::GraphNum(*graph)
           << " sub_graphs. If you want to see the nodes of the "
             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
             "to specify the output dir. NOTES: if you not do training, "
             "please don't pass loss_var_name.";
     }
   }
 
   if (build_strategy.async_mode_ && !build_strategy.is_distribution_) {
     VLOG(3) << "use AsyncSSAGraphExecutor";
     member_->executor_.reset(new details::AsyncSSAGraphExecutor(
@@ -334,20 +321,27 @@ ParallelExecutor::ParallelExecutor(
         std::move(graphs)));
   } else if (build_strategy.enable_parallel_graph_) {
     VLOG(3) << "use ParallelSSAGraphExecutor";
+#ifdef PADDLE_WITH_CUDA
+    // TODO(Yancey1989): Remove passing in the main_program when
+    // allreduce_seq_pass doesn't need it as the attr.
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graphs)));
+        exec_strategy, member_->local_scopes_, member_->places_, main_program,
+        std::move(graph)));
+#else
+    PADDLE_THROW(
+        "Paddle should be compiled with CUDA for ParallelGraph Execution.");
+#endif
   } else {
     if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
       VLOG(3) << "use ThreadedSSAGraphExecutor";
       member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     } else {
       VLOG(3) << "use FastThreadedSSAGraphExecutor";
       member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     }
   }
@@ -509,11 +503,10 @@ bool ParallelExecutor::EnableParallelGraphExecution(
   }
 
   if (!member_->use_all_reduce_ || !member_->use_cuda_)
-    enable_parallel_graph = false;
 
-  if (build_strategy.enable_sequential_execution_ ||
-      exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
-    enable_parallel_graph = false;
+    if (build_strategy.enable_sequential_execution_ ||
+        exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
+      enable_parallel_graph = false;
 
   return enable_parallel_graph;
 }