From eb2e4eeadeaced1716da76aea6b4014f80bad301 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 16:37:15 +0800
Subject: [PATCH 1/6] Debug

---
 paddle/fluid/framework/parallel_executor.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c1486b527d..186ddbe369 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -64,12 +64,12 @@ ParallelExecutor::ParallelExecutor(
   // Create local scopes
   if (local_scopes.empty()) {
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.push_back(&scope->NewScope());
+      member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.push_back(local_scopes[i]);
+      member_->local_scopes_.emplace_back(local_scopes[i]);
     }
   }
 

From 89728f8e66d078ecc6cae24a93b8931de8250b16 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 16:42:40 +0800
Subject: [PATCH 2/6] update

---
 paddle/fluid/framework/parallel_executor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 186ddbe369..c63162d2ff 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -177,7 +177,7 @@ void ParallelExecutor::Run(
         InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
                            std::get<1>(name_type_pair));
       } else {
-        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
                            std::get<1>(name_type_pair));
       }
     }

From d7527681c619296634ceb061e0b013bb2bf22fff Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 16:47:35 +0800
Subject: [PATCH 3/6] Fix scale grad

---
 .../fluid/framework/details/scale_loss_grad_op_handle.cc   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 7fb9f99a8a..7a65ee62c9 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 
 void ScaleLossGradOpHandle::RunImpl() {
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-  float *tmp =
-      scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
-          make_ddim({1}), place_);
+  float *tmp = local_scope.FindVar(var_name)
+                   ->GetMutable<LoDTensor>()
+                   ->mutable_data<float>(make_ddim({1}), place_);
 
   if (platform::is_cpu_place(place_)) {
     *tmp = coeff_;

From 2e8459be71c796b73a603c8adf87b0d811ddecd3 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 16:52:58 +0800
Subject: [PATCH 4/6] DebugCode

---
 .../details/nccl_all_reduce_op_handle.cc          | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 1e48f75958..e587210b35 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
 
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
+      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-      auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
       lod_tensors.emplace_back(lod_tensor);
     }
 
@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
         }
       });
     } else {  // Special handle CPU only Operator's gradient. Like CRF
-      auto &trg =
-          *this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
+      auto &trg = *this->local_scopes_[0]
+                       ->FindVar(kLocalExecScopeName)
+                       ->Get<Scope *>()
+                       ->Var()
+                       ->GetMutable<framework::LoDTensor>();
 
       // Reduce All Tensor to trg in CPU
       ReduceLoDTensor func(lod_tensors, &trg);
       VisitDataType(ToDataType(lod_tensors[0].type()), func);
 
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
-        auto &scope = local_scopes_[i];
+        auto &scope =
+            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope->FindVar(var_name);
+        auto *var = scope.FindVar(var_name);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {

From 71a2e6b73ccc71fa76e8b12f494cc13ead990669 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 16:57:02 +0800
Subject: [PATCH 5/6] Reverse create var

---
 paddle/fluid/framework/parallel_executor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c63162d2ff..aceb2a9def 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -163,7 +163,9 @@ void ParallelExecutor::Run(
   SplitTensorToPlaces(feed_tensors);
 
   // Create local scopes.
-  for (auto &scope : member_->local_scopes_) {
+  for (auto it = member_->local_scopes_.rbegin();
+       it != member_->local_scopes_.rend(); ++it) {
+    auto &scope = *it;
     Scope &local_scope = scope->NewScope();
     *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
         &local_scope;

From 06fb055a2f3572d8fe5233c0f150f55ca74de66f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Apr 2018 19:53:51 +0800
Subject: [PATCH 6/6] New group

---
 paddle/fluid/framework/parallel_executor.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index aceb2a9def..53306f1219 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -63,7 +63,8 @@ ParallelExecutor::ParallelExecutor(
   // Step 1. Bcast the params to devs.
   // Create local scopes
   if (local_scopes.empty()) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
+    member_->local_scopes_.emplace_back(member_->global_scope_);
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
       member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {