From 92a6c7a04906e7d26196ac795eccace84156d42d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 16 Jan 2019 10:08:14 +0800 Subject: [PATCH 01/98] init async ssa executor --- .../details/async_ssa_graph_executor.cc | 99 +++++++++++++++++++ .../details/async_ssa_graph_executor.h | 51 ++++++++++ 2 files changed, 150 insertions(+) create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.h diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc new file mode 100644 index 0000000000..9b26fdd545 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(std::move(call())); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h new file mode 100644 index 0000000000..4091c56d74 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class AsyncSSAGraphExecutor : public SSAGraphExecutor { + public: + AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs); + ~AsyncSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle From afda84012643353fbf9849fb5f26bbcd0c45bcea Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 16 Jan 2019 10:32:56 +0800 Subject: [PATCH 02/98] init communicator --- paddle/fluid/framework/communicator.h | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h new file mode 100644 index 0000000000..e459729f5c --- /dev/null +++ b/paddle/fluid/framework/communicator.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class Communicator { + public: + Communicator() {} + ~Communicator() {} + + private: +}; + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" From ea66979684c53743b9eb749106e0400542ec83da Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 17 Jan 2019 13:28:15 +0800 Subject: [PATCH 03/98] can run --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../details/async_ssa_graph_executor.cc | 1 + .../fluid/framework/details/build_strategy.cc | 5 +- .../fluid/framework/details/build_strategy.h | 1 + .../details/multi_devices_graph_pass.cc | 2 + .../details/multi_devices_graph_pass.h | 16 ++++++- paddle/fluid/framework/parallel_executor.cc | 46 +++++++++++++++---- paddle/fluid/pybind/pybind.cc | 3 ++ 9 files changed, 65 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160..e22c7f8a40 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -184,7 +184,7 @@ endif() target_link_libraries(executor garbage_collector) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c1ba6606f1..01c24b0d82 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -79,6 +79,8 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 9b26fdd545..d3e4573e22 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -27,6 +27,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), graphs_(std::move(graphs)) { + VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df0ff772c9..f8911cd9ad 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -116,7 +116,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.is_distribution_) { + + if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 15c2e01b61..1632483965 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -86,6 +86,7 @@ struct BuildStrategy { // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. bool is_distribution_{false}; + bool async_mode_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 75f922d2cc..d7a4b5692b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -975,3 +975,5 @@ REGISTER_MULTI_DEVICES_PASS( paddle::framework::details::AllReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::details::DistSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass, + paddle::framework::details::AsyncSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 6d4386538e..e91397816c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -55,7 +55,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool UseGPU() const; - bool NeedCollectiveOps() const; + virtual bool NeedCollectiveOps() const; bool IsScaleLossOp(ir::Node *node) const; @@ -116,6 +116,20 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { virtual void InsertPostprocessOps(ir::Graph *result) const {} }; +class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const {} + + bool NeedCollectiveOps() const override { return false; } + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: int GetVarDeviceID(const std::string &varname) const; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 
f61c9e3a91..4173b39e10 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -282,10 +283,19 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } #else - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + if (build_strategy.async_mode_) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); + } #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { @@ -323,23 +333,31 @@ ParallelExecutor::ParallelExecutor( "please don't pass loss_var_name."; } } - - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs))); + } else if (build_strategy.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs))); } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs[0]))); } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs[0]))); } } + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); @@ -401,14 +419,22 @@ void ParallelExecutor::BCastParamsToDevices( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->use_all_reduce_ || member_->use_cuda_ || - var == "@LR_DECAY_COUNTER@") { + auto share_memory = [&] { t->Resize(dims); t->mutable_data(cpu, main_tensor.type()); paddle::framework::TensorCopy(main_tensor, cpu, t); + }; + + auto copy_memory = [&] { t->ShareDataWith(main_tensor); }; + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
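      // Note: in this revision the lambda that performs the TensorCopy is
      // named share_memory and the lambda that calls ShareDataWith is named
      // copy_memory; PATCH 09/98 later in this series swaps the two
      // definitions so that the names match the behaviour.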
+ if (member_->build_strategy_.async_mode_) { + share_memory(); + } else if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + copy_memory(); } else { - t->ShareDataWith(main_tensor); + share_memory(); } } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f3f4854a9e..88d12c69b7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1030,6 +1030,9 @@ All parameter, weight, gradient are variables in Paddle. "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) + .def_property("async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, From 88d71fa2f9655c206d398088effe3cb1a43dafc4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 17 Jan 2019 17:30:27 +0800 Subject: [PATCH 04/98] support num_iteration_per_run --- .../framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/framework/details/execution_strategy.h | 2 ++ paddle/fluid/pybind/pybind.cc | 11 +++++++++++ 3 files changed, 16 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index d3e4573e22..ba2e90d052 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -56,6 +56,9 @@ FeedFetchList AsyncSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { try { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + executors_[i]->Run(fetch_tensors); + } return executors_[i]->Run(fetch_tensors); } catch (...) { exception_holder_.Catch(std::current_exception()); diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736..dec4589cad 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -28,6 +28,8 @@ struct ExecutionStrategy { size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; + size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor + // and pyreader with data queue }; } // namespace details diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88d12c69b7..b52f99f324 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -892,6 +892,17 @@ All parameter, weight, gradient are variables in Paddle. 2. In some NLP model, it may cause the GPU memory is insufficient, in this case, you should reduce `num_iteration_per_drop_scope`. 
)DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call pe.run() in python + )DOC") .def_property("_dry_run", [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { From 69484f71e0c842633df77470c80dc26222f6fd3b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 12:25:30 +0800 Subject: [PATCH 05/98] remote communicator --- paddle/fluid/framework/communicator.h | 45 --------------------------- 1 file changed, 45 deletions(-) delete mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h deleted file mode 100644 index e459729f5c..0000000000 --- a/paddle/fluid/framework/communicator.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { - -namespace framework { - -class Communicator { - public: - Communicator() {} - ~Communicator() {} - - private: -}; - -} // namespace framework -} // namespace paddle - -#include "paddle/fluid/framework/tensor_impl.h" From 7021979bc2a3c03ae8fa601b967539a4416ab325 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 12:52:19 +0800 Subject: [PATCH 06/98] init communicator --- paddle/fluid/framework/communicator.h | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 paddle/fluid/framework/communicator.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h new file mode 100644 index 0000000000..ba8fb3e173 --- /dev/null +++ b/paddle/fluid/framework/communicator.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class Communicator { + public: + Communicator() {} + ~Communicator() {} + + // send grad + void send() {} + + void receive() {} + + void wait() {} + + private: + std::unique_ptr communicate_thread_; +}; + +} // namespace framework +} // namespace paddle From 9958775b312e7a4802f574dfd4ea6162a773ed28 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 14:52:15 +0800 Subject: [PATCH 07/98] add NewTmpScope to scope --- paddle/fluid/framework/scope.cc | 2 ++ paddle/fluid/framework/scope.h | 2 ++ .../operators/distributed/grpc/grpc_server.cc | 3 +++ .../operators/distributed/parameter_prefetch.cc | 16 ++++++++-------- .../operators/distributed/request_handler.h | 6 +++++- .../distributed/request_handler_impl.cc | 10 +++------- .../operators/distributed/variable_response.h | 12 ++++++++---- 7 files changed, 31 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9536185609..c774eaf4c8 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -81,6 +81,8 @@ Scope& Scope::NewScope() const { return *child; } +Scope* Scope::NewTmpScope() const { return new Scope(this); } + Variable* Scope::Var(const std::string& name) { SCOPE_VARS_WRITER_LOCK return VarInternal(name); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..0e9b8edeb3 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -55,6 +55,8 @@ class Scope { /// Mark it to const because that new kid scope cannot change parent scope. Scope& NewScope() const; + Scope* NewTmpScope() const; + /// Create a variable with given name if it doesn't exist. /// Caller doesn't own the returned Variable. 
Variable* Var(const std::string& name); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279..8bc8d5772f 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -107,6 +107,9 @@ class RequestSend final : public RequestBase { int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; + if (!request_handler_->sync_mode()) { + request_->ReleaseOwnershipOfLocalScope(); + } request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index c63d653488..9dfbc80870 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -180,7 +180,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + framework::Scope* local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -224,22 +224,22 @@ void prefetch(const std::string& id_name, const std::string& out_name, #endif } - auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope); + auto splited_ids = SplitIds(ids_vector, height_sections, local_scope); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, - &local_scope); + local_scope); // create output var in local scope for (auto& name : out_var_names) { - local_scope.Var(name)->GetMutable(); + local_scope->Var(name)->GetMutable(); } std::vector rets; for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(local_scope, in_var_names[i])) { + if (NeedSend(*local_scope, in_var_names[i])) { VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] << " to get " << out_var_names[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i], + epmap[i], cpu_ctx, *local_scope, in_var_names[i], out_var_names[i], table_names[i])); } else { VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; @@ -252,8 +252,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context, local_scope, &actual_ctx); + delete local_scope; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b..f58c2bc380 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -58,13 +58,15 @@ class VarHandle { VarHandle(const std::string ep, const std::string& method, const std::string& name, const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) + const framework::Scope* p_scope = nullptr, + bool delete_local_scope = false) : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; name_ = name; method_ = method; + delete_local_scope_ = delete_local_scope; } virtual ~VarHandle() {} @@ 
-86,6 +88,7 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } + if (delete_local_scope_ && scope_) delete scope_; VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } @@ -112,6 +115,7 @@ class VarHandle { std::string name_; // RPC method name. std::string method_; + bool delete_local_scope_; protected: std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e..1625e55d5a 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -53,13 +53,9 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + delete scope; return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 294cae5f44..3ecb696069 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,14 +60,12 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = &scope->NewScope(); + local_scope_ = scope->NewTmpScope(); } } virtual ~VariableResponse() { - if (create_scope_) { - scope_->DeleteScope(local_scope_); - } + if (local_scope_) delete local_scope_; } int Parse(Source* source, const sendrecv::VariableMessage& meta) { @@ -86,6 +84,12 @@ class VariableResponse { inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } inline std::string TableName() const { return meta_.table_name(); } + inline void ReleaseOwnershipOfLocalScope() { + PADDLE_ENFORCE(create_scope_, + "only when create_scope_ is true can you release the " + "ownership of local scope"); + local_scope_ = nullptr; + } // should call parse first. 
framework::Variable* GetVar() { From b5aefc8b6d4c2aa2d28fbb1546d64ac52a754a26 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 15:07:55 +0800 Subject: [PATCH 08/98] fix compile problem --- paddle/fluid/operators/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1249ef9a9b..ed819ac9f0 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -50,7 +50,7 @@ endif() cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc From f3210b60ba3a5f23cfed95148c44e5d5db298f35 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 18 Jan 2019 15:49:32 +0800 Subject: [PATCH 09/98] fix copy_memory and share_memory --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4173b39e10..3997294f17 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -419,13 +419,13 @@ void ParallelExecutor::BCastParamsToDevices( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - auto share_memory = [&] { + auto copy_memory = [&] { t->Resize(dims); t->mutable_data(cpu, main_tensor.type()); paddle::framework::TensorCopy(main_tensor, cpu, t); }; - auto copy_memory = [&] { t->ShareDataWith(main_tensor); }; + auto share_memory = [&] { t->ShareDataWith(main_tensor); }; // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
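      // Note: with this swap the names match the behaviour again:
      // copy_memory materialises an independent per-scope copy via TensorCopy,
      // while share_memory aliases the main tensor through ShareDataWith.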
if (member_->build_strategy_.async_mode_) { From ca5d96bb3d376be0ade29db4f58700ba2c81b88a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 16:36:48 +0800 Subject: [PATCH 10/98] complete send lod tensor --- paddle/fluid/framework/communicator.h | 2 + .../operators/distributed/CMakeLists.txt | 3 +- .../operators/distributed/parameter_send.cc | 189 ++++++++++++++++++ .../operators/distributed/parameter_send.h | 35 ++++ .../operators/distributed_ops/send_op.cc | 15 ++ 5 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/distributed/parameter_send.cc create mode 100644 paddle/fluid/operators/distributed/parameter_send.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h index ba8fb3e173..0e90ba02e6 100644 --- a/paddle/fluid/framework/communicator.h +++ b/paddle/fluid/framework/communicator.h @@ -41,6 +41,8 @@ class Communicator { void receive() {} + void prefetch() {} + void wait() {} private: diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb361e95e8..fa8abf4cec 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc @@ -50,6 +50,7 @@ cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc new file mode 100644 index 0000000000..01e7341f15 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
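A minimal, self-contained sketch (not part of the patch) of the section arithmetic that the SplitIds helper in this new file relies on: per-pserver heights are first accumulated into absolute row offsets, and each row id is then mapped to a section index plus a local offset. The helper names mirror the ones defined below; the int64_t types and the main() driver are illustrative assumptions.

#include <cstdint>
#include <iostream>
#include <vector>

// Turn per-section heights, e.g. {3, 5}, into absolute row offsets {0, 3}.
std::vector<int64_t> ToAbsoluteSection(const std::vector<int64_t> &heights) {
  std::vector<int64_t> abs_sections(heights.size(), 0);
  for (size_t i = 1; i < heights.size(); ++i) {
    abs_sections[i] = abs_sections[i - 1] + heights[i - 1];
  }
  return abs_sections;
}

// Map a global row id to the section (i.e. the pserver shard) that owns it.
size_t GetSectionIndex(int64_t id, const std::vector<int64_t> &abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (id < abs_sections[i]) return i - 1;
  }
  return abs_sections.size() - 1;
}

int main() {
  const std::vector<int64_t> heights = {3, 5};
  const auto abs_sections = ToAbsoluteSection(heights);
  for (int64_t id : std::vector<int64_t>{0, 2, 3, 4, 7}) {
    const size_t section = GetSectionIndex(id, abs_sections);
    // e.g. id 4 -> section 1, local row 1
    std::cout << "id " << id << " -> section " << section << ", local row "
              << id - abs_sections[section] << std::endl;
  }
  return 0;
}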
+ +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_send.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +static std::vector> SplitIds( + const std::vector& ids_vector, + const std::vector& height_section, framework::Scope* scope) { + std::set all_ids; + for (auto id : ids_vector) { + all_ids.insert(id); + } + + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +static void SplitIdsIntoMultipleVarsBySection( + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +void send(const std::string& var_name, + const std::vector& send_varnames, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, bool sync) { + framework::Scope* local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& cpu_ctx = *pool.Get(platform::CPUPlace()); + auto& actual_ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + auto* send_var = scope.FindVar(var_name); + size_t out_num = send_varnames.size(); + if (send_var->IsType()) { + auto& send_tensor = send_var->Get(); + auto& send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + int num = context.Attr("num"); + if (num > 0) { + int64_t in_axis_dim = send_tensor_dims[0]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not 
result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (height_sections.size() > 0) { + PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = height_sections[i]; + outs_dims.push_back(dim); + } + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + auto* out = + local_scope->Var(send_varnames[i])->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } else if (send_var->IsType()) { + // create output var in local scope + for (auto& name : send_varnames) { + local_scope->Var(name)->GetMutable(); + } + } else { + PADDLE_THROW("unsupported var type"); + } + + std::vector rets; + for (size_t i = 0; i < send_varnames.size(); i++) { + auto& send_var_name = send_varnames[i]; + auto& endpoint = epmap[i]; + if (NeedSend(*local_scope, send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, + send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + } + } + + if (sync) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + + delete local_scope; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h new file mode 100644 index 0000000000..ee4da997b7 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
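Before the header body, a hedged sketch (not part of the patch) of how a send-style operator kernel could forward its attributes to the send() helper declared below; the attribute names follow the send_op additions in this patch, while the function name and the exact attribute types are assumptions.

#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/parameter_send.h"

namespace paddle {
namespace operators {

// Illustrative only: slice the input variable by `sections`, ship each slice
// to the endpoint with the same index, and block until every RPC finishes.
static void SendParamSections(const framework::ExecutionContext &ctx) {
  const std::string var_name = ctx.Inputs("X").front();
  const auto send_varnames =
      ctx.Attr<std::vector<std::string>>("send_varnames");
  const auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
  const auto sections = ctx.Attr<std::vector<int>>("sections");
  distributed::send(var_name, send_varnames, epmap, sections, ctx, ctx.scope(),
                    /*sync=*/true);
}

}  // namespace operators
}  // namespace paddle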
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void send(const std::string& var_name, + const std::vector& send_varnames, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, bool sync); + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index e2c2147ab5..02397bb6b3 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -88,6 +88,21 @@ This operator will send variables to listen_and_serve op at the parameter server "Server endpoints in the order of input " "variables for mapping") .SetDefault({"127.0.0.1:6164"}); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr>( + "send_varnames", + "(vector) " + "the splited output varnames to send to pserver") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); } }; From 1866d2dbefbaa630eac57da6838b8423d1074dd8 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 17:16:32 +0800 Subject: [PATCH 11/98] parameter send support selected_rows --- .../operators/distributed/parameter_send.cc | 84 +++++++++++++++++-- .../operators/distributed/parameter_send.h | 1 + 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 01e7341f15..d79ea8cdb9 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -47,6 +47,15 @@ static size_t GetSectionIndex(int64_t id, return abs_sections.size() - 1; } +static int FindOutIdx(int row, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + static std::vector ToAbsoluteSection( const std::vector& height_sections) { std::vector abs_sections; @@ -97,21 +106,22 @@ static void SplitIdsIntoMultipleVarsBySection( } } +template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context, - const framework::Scope& scope, bool sync) { + const framework::ExecutionContext& ctx, const framework::Scope& scope, + bool sync) { framework::Scope* local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); - auto& actual_ctx = *pool.Get(context.GetPlace()); + auto& actual_ctx = *pool.Get(ctx.GetPlace()); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); + ctx.Attr("trainer_id")); auto* send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); @@ -122,7 +132,7 @@ void send(const std::string& var_name, outs_dims.reserve(out_num); // infer output shape - int num = context.Attr("num"); + int num = ctx.Attr("num"); if (num > 0) { int64_t in_axis_dim = send_tensor_dims[0]; PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, @@ -153,13 +163,71 @@ 
void send(const std::string& var_name, *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } - } else if (send_var->IsType()) { + } else if (send_var->IsType()) { + auto& send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(height_sections); + + auto send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; + auto src = send_slr.value().data(); + // create output var in local scope + std::vector outs; for (auto& name : send_varnames) { - local_scope->Var(name)->GetMutable(); + auto* out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + int out_idx = FindOutIdx(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + outs[i]->mutable_rows()->clear(); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + } else { - PADDLE_THROW("unsupported var type"); + PADDLE_THROW("unsupported var type to send!"); } std::vector rets; diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index ee4da997b7..e337649cf2 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -23,6 +23,7 @@ namespace paddle { namespace operators { namespace distributed { +template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, From 74040cb4aad1c8390fcc080c32f0c12bee46a05b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 18:38:52 +0800 Subject: [PATCH 12/98] code clean --- .../distributed/parameter_prefetch.cc | 29 ++------ .../distributed/parameter_prefetch.h | 4 +- .../operators/distributed/parameter_send.cc | 71 +------------------ .../operators/distributed/parameter_send.h | 2 +- .../operators/distributed_ops/send_op.cc | 10 +-- .../distributed_ops/send_recv_util.h | 36 ++++++++++ .../operators/hierarchical_sigmoid_op.cc | 6 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- paddle/fluid/operators/lookup_table_op.cc | 6 +- paddle/fluid/operators/lookup_table_op.h | 3 +- 
paddle/fluid/operators/nce_op.cc | 6 +- paddle/fluid/operators/nce_op.h | 3 +- .../fluid/operators/split_selected_rows_op.h | 21 +----- 13 files changed, 64 insertions(+), 135 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 9dfbc80870..7434265929 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -37,30 +37,9 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -static size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - static std::vector> SplitIds( const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { + const std::vector& height_section, framework::Scope* scope) { std::set all_ids; for (auto id : ids_vector) { all_ids.insert(id); @@ -78,7 +57,7 @@ static std::vector> SplitIds( static void SplitIdsIntoMultipleVarsBySection( const std::vector& in_var_names, - const std::vector& height_section, + const std::vector& height_section, const std::vector>& splited_ids, framework::Scope* scope) { PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); @@ -100,7 +79,7 @@ static void SplitIdsIntoMultipleVarsBySection( static void MergeMultipleVarsIntoOneBySection( const std::string& id_name, const std::vector& ids_vector, const std::string& out_name, const std::vector& out_var_names, - const std::vector& height_section, + const std::vector& height_section, const std::vector>& splited_ids, const framework::ExecutionContext& context, framework::Scope* scope, platform::DeviceContext* actual_ctx) { @@ -177,7 +156,7 @@ static void MergeMultipleVarsIntoOneBySection( void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { framework::Scope* local_scope = scope.NewTmpScope(); diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 2f850a0332..0429ec4415 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -26,7 +26,7 @@ namespace distributed { void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope); @@ -35,7 +35,7 @@ void prefetch_with_reconstruct(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope, 
framework::LoDTensor* original) { diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index d79ea8cdb9..09fce06b5a 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -37,80 +37,11 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -static size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - -static std::vector> SplitIds( - const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { - std::set all_ids; - for (auto id : ids_vector) { - all_ids.insert(id); - } - - auto abs_sections = ToAbsoluteSection(height_section); - std::vector> splited_ids; - splited_ids.resize(height_section.size() + 1); - for (auto& id : all_ids) { - auto section_index = GetSectionIndex(id, abs_sections); - splited_ids[section_index].push_back(id - abs_sections[section_index]); - } - return splited_ids; -} - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector& in_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - framework::Scope* scope) { - PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); - - auto place = platform::CPUPlace(); - - for (size_t i = 0; i < in_var_names.size(); ++i) { - auto* id_tensor = - scope->Var(in_var_names[i])->GetMutable(); - auto& ids = splited_ids[i]; - if (!ids.empty()) { - auto* id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& ctx, const framework::Scope& scope, bool sync) { framework::Scope* local_scope = scope.NewTmpScope(); diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index e337649cf2..6272cc5d25 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -27,7 +27,7 @@ template void send(const std::string& var_name, const std::vector& send_varnames, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope, bool sync); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 02397bb6b3..f8b9a1d15a 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ 
-88,11 +88,11 @@ This operator will send variables to listen_and_serve op at the parameter server "Server endpoints in the order of input " "variables for mapping") .SetDefault({"127.0.0.1:6164"}); - AddAttr>("sections", - "(vector) " - "the length of each output along the " - "specified axis.") - .SetDefault(std::vector{}); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); AddAttr>( "send_varnames", "(vector) " diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index dc26c53c64..1e91f0dd51 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -13,8 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include +#include + #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" namespace paddle { namespace operators { @@ -42,5 +48,35 @@ inline bool NeedSend(const framework::Scope& scope, return false; } +inline int FindOutIdx(int row, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +inline std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +inline size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 6ca6f0bc04..13820e54aa 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -134,9 +134,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 1a7ca96301..2247131137 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -70,7 +70,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server - auto height_sections = ctx.Attr>("height_sections"); + auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); diff --git 
a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 0029932bc0..9f6fbe05fa 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -91,9 +91,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a7d0fd4856..f95f29356f 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -50,7 +50,8 @@ class LookupTableKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); if (!epmap.empty()) { diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 256da34912..8160f45e74 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -156,9 +156,9 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096..fab46a5971 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -167,7 +167,8 @@ class NCEKernel : public framework::OpKernel { framework::Scope &local_scope = context.scope().NewScope(); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); auto *ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 1fef2b3d37..c29065649e 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -16,31 +16,12 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { -static int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - template class SplitSelectedRowsOpKernel : public framework::OpKernel { public: From 1edc0423d2f2a96a342acdd8750e3608aa7b8ce9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 24 Jan 2019 19:26:07 +0800 Subject: [PATCH 13/98] update send_op --- .../operators/distributed_ops/send_op.cc | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index f8b9a1d15a..2136670103 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -37,30 +38,46 @@ class SendOp : public framework::OperatorBase { const platform::Place& place) const override { auto ins = Inputs("X"); - std::vector epmap = Attr>("epmap"); + auto epmap = Attr>("epmap"); int sync_send = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); - - std::vector rets; - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; + auto send_varnames = Attr>("send_varnames"); + auto height_sections = Attr>("height_sections"); + + if (send_varnames.size() > 0) { + PADDLE_ENFORCE_EQ(ins.size(), 1, ""); + framework::RuntimeContext ctx(Inputs(), Outputs(), scope); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + distributed::send(ins[0], send_varnames, epmap, height_sections, + exe_ctx, scope, static_cast(sync_send)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + Attr("trainer_id")); + + std::vector rets; + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(scope, ins[i])) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rets.push_back( + rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); + } else { 
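// The "height_sections" read just above are what the helpers now shared in
// send_recv_util.h operate on: ToAbsoluteSection turns per-shard heights into
// absolute row offsets, and GetSectionIndex maps an id to its shard. A minimal
// sketch of that mapping, assuming int64_t section elements (the sample values
// below are illustrative only, not taken from this patch):
//
//   std::vector<int64_t> height_sections = {10, 20, 30};
//   auto abs_sections = ToAbsoluteSection(height_sections);  // {0, 10, 30}
//   size_t shard = GetSectionIndex(25, abs_sections);        // id 25 -> shard 1
//   int64_t local_row = 25 - abs_sections[shard];            // row 15 in shard 1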
+ VLOG(3) << "don't send no-initialied variable: " << ins[i]; + } } - } - if (sync_send) { - for (size_t i = 0; i < rets.size(); i++) { - VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + if (sync_send) { + for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + } } } } From fab8457e6b117be26e23171b649a1bfda14531b2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 26 Jan 2019 23:12:23 +0800 Subject: [PATCH 14/98] code optimize --- .../details/async_ssa_graph_executor.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index ba2e90d052..7dc269242f 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -68,20 +68,18 @@ FeedFetchList AsyncSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } } else { fetch_data.emplace_back(std::move(call())); } } - if (pool_) { - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } - } if (exception_holder_.IsCaught()) { exception_holder_.ReThrow(); } From 62549e071402530e862285ab1613eb8e8e5e5150 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 17:10:45 +0800 Subject: [PATCH 15/98] add GenParentScopeTreeDebugInfo --- paddle/fluid/framework/parallel_executor.cc | 1 + paddle/fluid/framework/scope.cc | 29 +++++++++++++++++++++ paddle/fluid/framework/scope.h | 1 + 3 files changed, 31 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3997294f17..f0bc3acccc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,6 +365,7 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { + VLOG(3) << "BCastParamsToDevices"; // the initializing bcast, all vars would be bcast from device(0). 
for (auto &var : vars) { framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9536185609..884ad3b34b 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -259,5 +259,34 @@ std::string GenScopeTreeDebugInfo(Scope* root) { return os.str(); } +std::string GenParentScopeTreeDebugInfo(Scope* leaf) { + std::stringstream os; + + if (!leaf) return ""; + + // level traversal + std::vector scopes; + const Scope* current_scope = leaf; + + while (current_scope != nullptr) { + scopes.push_back(current_scope); + current_scope = current_scope->parent(); + // end of a level + os << "\n------------------------------------------\n"; + } + + os << "\nDetails:\n\n"; + + for (auto* q : scopes) { + os << "====\n"; + os << q << ":\n"; + for (auto& var : q->LocalVarNames()) { + os << " - " << var << "\n"; + } + } + + return os.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..eb5c12def6 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -144,6 +144,7 @@ class Scope { // Generate some debug string about the inherience structure of scope, quite // naive. std::string GenScopeTreeDebugInfo(Scope*); +std::string GenParentScopeTreeDebugInfo(Scope*); } // namespace framework } // namespace paddle From be738a646e2f760a53c36a658c7d07c4f75cd814 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 21:56:25 +0800 Subject: [PATCH 16/98] add some debug infor --- .../details/async_ssa_graph_executor.cc | 17 ++++++++++------- .../details/multi_devices_graph_pass.cc | 2 ++ paddle/fluid/framework/scope.cc | 12 +++++------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 7dc269242f..c259ff4f74 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -68,18 +68,21 @@ FeedFetchList AsyncSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } } else { fetch_data.emplace_back(std::move(call())); } } + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index d7a4b5692b..f1347e2b0d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -249,6 +249,8 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( break; } + VLOG(3) << "loss_scale: " << loss_scale; + if (loss_scale) { // TODO(paddle-dev): Why is there no input for this op_handle? 
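// The parent-scope debug helper declared in scope.h above is intended for
// spots like these VLOG additions: it walks from a leaf scope up through
// parent() and lists each level's local variables. A minimal usage sketch
// (the wrapper function and variable name are illustrative, not part of this
// patch):
//
//   #include "paddle/fluid/framework/scope.h"
//
//   void TraceScopeChain(paddle::framework::Scope *leaf) {
//     VLOG(3) << paddle::framework::GenParentScopeTreeDebugInfo(leaf);
//   }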
auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 884ad3b34b..2c76ab22f6 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -271,16 +271,14 @@ std::string GenParentScopeTreeDebugInfo(Scope* leaf) { while (current_scope != nullptr) { scopes.push_back(current_scope); current_scope = current_scope->parent(); - // end of a level - os << "\n------------------------------------------\n"; } - os << "\nDetails:\n\n"; + os << "\n--------------GenParentScopeTreeDebugInfo--------------\n"; - for (auto* q : scopes) { - os << "====\n"; - os << q << ":\n"; - for (auto& var : q->LocalVarNames()) { + for (int i = scopes.size() - 1; i >= 0; --i) { + os << "=======level [" << i << "]=======\n"; + os << scopes[i] << ":\n"; + for (auto& var : scopes[i]->LocalVarNames()) { os << " - " << var << "\n"; } } From 9da96aba956abe13aec945c1e71e338df56a13b5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 27 Jan 2019 23:04:50 +0800 Subject: [PATCH 17/98] clean code of test_async_ssa_graph_executor_mnist --- .../test_async_ssa_graph_executor_mnist.py | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py new file mode 100644 index 0000000000..e2b3b2b0f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -0,0 +1,214 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +from PIL import Image +import numpy +import paddle +import paddle.fluid as fluid + +BATCH_SIZE = 64 +PASS_NUM = 5 + + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc + + +def convolutional_neural_network(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + +def train(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + prediction, avg_loss, acc = convolutional_neural_network(img, label) + + test_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + def train_test(train_test_program, train_test_feed, train_test_reader): + acc_set = [] + avg_loss_set = [] + for test_data in train_test_reader(): + acc_np, avg_loss_np = exe.run(program=train_test_program, + feed=train_test_feed.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val_mean = numpy.array(acc_set).mean() + avg_loss_val_mean = numpy.array(avg_loss_set).mean() + return avg_loss_val_mean, acc_val_mean + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + exe = fluid.Executor(place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + exe.run(fluid.default_startup_program()) + main_program = fluid.default_main_program() + + exec_strategy = fluid.ExecutionStrategy() + build_strategy = fluid.BuildStrategy() + + cpu_num = int(os.environ.get('CPU_NUM')) + thread_num = int(os.getenv("NUM_THREADS")) + + print("cpu_num:" + str(cpu_num)) + print("thread_num:" + str(thread_num)) + + build_strategy.async_mode = True + + exec_strategy.num_threads = thread_num + exec_strategy.num_iteration_per_drop_scope = 1 + exec_strategy.num_iteration_per_run = 10 + + pe = fluid.ParallelExecutor( + use_cuda=False, + loss_name=avg_loss.name, + main_program=main_program, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + lists = [] + step = 0 + for epoch_id in range(PASS_NUM): + for step_id, data in enumerate(train_reader()): + loss_val, acc_val = pe.run(feed=feeder.feed(data), + fetch_list=[avg_loss.name, acc.name]) + loss_val = numpy.mean(loss_val) + acc_val = numpy.mean(acc_val) + if step % 100 == 0: + print("Pass %d, Batch %d, Cost %f" % (epoch_id, step, loss_val)) + step += 1 + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=test_program, + train_test_reader=test_reader, + train_test_feed=feeder) + 
+ print("Test with Epoch %d, avg_cost: %s, acc: %s" % + (epoch_id, avg_loss_val, acc_val)) + lists.append((epoch_id, avg_loss_val, acc_val)) + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + model_filename=model_filename, + params_filename=params_filename) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1])) + print('The classification accuracy is %.2f%%' % (float(best[2]) * 100)) + + +def infer(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = numpy.array(im).reshape(1, 1, 28, 28).astype(numpy.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + tensor_img = load_image(cur_dir + '/image/infer_3.png') + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + save_dirname, exe, model_filename, params_filename) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + lab = numpy.argsort(results) + print("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) + + +def main(use_cuda): + model_filename = None + params_filename = None + save_dirname = "recognize_digits" + ".inference.model" + + # call train() with is_local argument to run distributed train + train( + use_cuda=use_cuda, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + infer( + use_cuda=use_cuda, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + + +if __name__ == '__main__': + use_cuda = False + main(use_cuda=use_cuda) From 7e145b7c0e8a877ce78135dc74d3d65090e9c704 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 10:13:09 +0800 Subject: [PATCH 18/98] optimize test_async_ssa_graph_executor_mnist --- .../test_async_ssa_graph_executor_mnist.py | 138 ++++-------------- 1 file changed, 31 insertions(+), 107 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index e2b3b2b0f2..03d7df8852 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -15,13 +15,13 @@ from __future__ import print_function import os -from PIL import Image +import unittest + import numpy import paddle import paddle.fluid as fluid BATCH_SIZE = 64 -PASS_NUM = 5 def loss_net(hidden, label): @@ -51,11 +51,9 @@ def convolutional_neural_network(img, label): return loss_net(conv_pool_2, label) -def train(use_cuda, - save_dirname=None, - 
model_filename=None, - params_filename=None): +def train(use_cuda, thread_num, cpu_num): if use_cuda and not fluid.core.is_compiled_with_cuda(): + print("paddle is not compiled with cuda, exit!") return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') @@ -84,8 +82,6 @@ def train(use_cuda, place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), @@ -94,24 +90,22 @@ def train(use_cuda, paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - main_program = fluid.default_main_program() - exec_strategy = fluid.ExecutionStrategy() - build_strategy = fluid.BuildStrategy() - - cpu_num = int(os.environ.get('CPU_NUM')) - thread_num = int(os.getenv("NUM_THREADS")) + os.environ['CPU_NUM'] = str(cpu_num) print("cpu_num:" + str(cpu_num)) print("thread_num:" + str(thread_num)) - build_strategy.async_mode = True + build_strategy = fluid.BuildStrategy() + build_strategy.async_mode = True # enable async mode + exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_drop_scope = 1 - exec_strategy.num_iteration_per_run = 10 + exec_strategy.num_iteration_per_run = 2 + main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( use_cuda=False, loss_name=avg_loss.name, @@ -119,96 +113,26 @@ def train(use_cuda, build_strategy=build_strategy, exec_strategy=exec_strategy) - lists = [] step = 0 - for epoch_id in range(PASS_NUM): - for step_id, data in enumerate(train_reader()): - loss_val, acc_val = pe.run(feed=feeder.feed(data), - fetch_list=[avg_loss.name, acc.name]) - loss_val = numpy.mean(loss_val) - acc_val = numpy.mean(acc_val) - if step % 100 == 0: - print("Pass %d, Batch %d, Cost %f" % (epoch_id, step, loss_val)) - step += 1 - # test for epoch - avg_loss_val, acc_val = train_test( - train_test_program=test_program, - train_test_reader=test_reader, - train_test_feed=feeder) - - print("Test with Epoch %d, avg_cost: %s, acc: %s" % - (epoch_id, avg_loss_val, acc_val)) - lists.append((epoch_id, avg_loss_val, acc_val)) - if save_dirname is not None: - fluid.io.save_inference_model( - save_dirname, ["img"], [prediction], - exe, - model_filename=model_filename, - params_filename=params_filename) - - # find the best pass - best = sorted(lists, key=lambda list: float(list[1]))[0] - print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1])) - print('The classification accuracy is %.2f%%' % (float(best[2]) * 100)) - - -def infer(use_cuda, - save_dirname=None, - model_filename=None, - params_filename=None): - if save_dirname is None: - return + for step_id, data in enumerate(train_reader()): + loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 100 == 0: + print("Batch %d, Cost %f" % (step, loss_val)) + step += 1 + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=test_program, + train_test_reader=test_reader, + train_test_feed=feeder) + + print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + + +class TestAsyncSSAGraphExecutor(unittest.TestCase): + def test_check_async_ssa_exe_train(self): + train(use_cuda=False, thread_num=2, cpu_num=2) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - def load_image(file): - im = 
Image.open(file).convert('L') - im = im.resize((28, 28), Image.ANTIALIAS) - im = numpy.array(im).reshape(1, 1, 28, 28).astype(numpy.float32) - im = im / 255.0 * 2.0 - 1.0 - return im - - cur_dir = os.path.dirname(os.path.realpath(__file__)) - tensor_img = load_image(cur_dir + '/image/infer_3.png') - - inference_scope = fluid.core.Scope() - with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - save_dirname, exe, model_filename, params_filename) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - lab = numpy.argsort(results) - print("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) - - -def main(use_cuda): - model_filename = None - params_filename = None - save_dirname = "recognize_digits" + ".inference.model" - - # call train() with is_local argument to run distributed train - train( - use_cuda=use_cuda, - save_dirname=save_dirname, - model_filename=model_filename, - params_filename=params_filename) - infer( - use_cuda=use_cuda, - save_dirname=save_dirname, - model_filename=model_filename, - params_filename=params_filename) - - -if __name__ == '__main__': - use_cuda = False - main(use_cuda=use_cuda) +if __name__ == "__main__": + unittest.main() From 02dab46ab8101873663a63614f88931ead7846d9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 16:23:06 +0800 Subject: [PATCH 19/98] add some debug info --- .../details/async_ssa_graph_executor.cc | 2 ++ .../framework/details/exception_holder.h | 17 ++++++++++++ .../fluid/operators/reader/blocking_queue.h | 1 + .../test_async_ssa_graph_executor_mnist.py | 27 ++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index c259ff4f74..e21d5fb96d 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -84,6 +84,8 @@ FeedFetchList AsyncSSAGraphExecutor::Run( } if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 1b1afce04e..77ca03b86e 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -64,6 +66,21 @@ class ExceptionHolder { ClearImpl(); } + std::string Type() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + private: void ClearImpl() { exception_.reset(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5..45c3ad802f 100644 --- 
a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -79,6 +79,7 @@ class BlockingQueue { return true; } else { PADDLE_ENFORCE(closed_); + VLOG(3) << "queue is closed! return nothing."; return false; } } diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 03d7df8852..6a2f829654 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -59,6 +59,13 @@ def train(use_cuda, thread_num, cpu_num): img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + prediction, avg_loss, acc = convolutional_neural_network(img, label) test_program = fluid.default_main_program().clone(for_test=True) @@ -103,7 +110,7 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 2 + exec_strategy.num_iteration_per_run = 1 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -113,6 +120,22 @@ def train(use_cuda, thread_num, cpu_num): build_strategy=build_strategy, exec_strategy=exec_strategy) + py_reader.decorate_paddle_reader(train_reader) + py_reader.start() + + step = 0 + try: + while True: + print("step %d in" % step) + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 1 == 0: + print("Batch %d, Cost %f, queue size %d" % + (step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + """ step = 0 for step_id, data in enumerate(train_reader()): loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) @@ -120,6 +143,8 @@ def train(use_cuda, thread_num, cpu_num): if step % 100 == 0: print("Batch %d, Cost %f" % (step, loss_val)) step += 1 + """ + # test for epoch avg_loss_val, acc_val = train_test( train_test_program=test_program, From 4a172611f989eaae04638784cf96c3a2be3c6b8c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 17:11:48 +0800 Subject: [PATCH 20/98] complete test_async_ssa_graph_executor_mnist test=develop --- .../test_async_ssa_graph_executor_mnist.py | 162 ++++++++++-------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 6a2f829654..1104604970 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -18,60 +18,61 @@ import os import unittest import numpy +import time import paddle import paddle.fluid as fluid BATCH_SIZE = 64 -def loss_net(hidden, label): - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - acc = fluid.layers.accuracy(input=prediction, label=label) - return prediction, avg_loss, acc - - -def convolutional_neural_network(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - 
input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - return loss_net(conv_pool_2, label) - - -def train(use_cuda, thread_num, cpu_num): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - print("paddle is not compiled with cuda, exit!") - return - - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - py_reader = fluid.layers.create_py_reader_by_data( - capacity=64, - feed_list=[img, label], - name='py_reader', - use_double_buffer=True) - img, label = fluid.layers.read_file(py_reader) - - prediction, avg_loss, acc = convolutional_neural_network(img, label) +def convolutional_neural_network(use_py_reader): + with fluid.unique_name.guard(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + py_reader = None + if use_py_reader: + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return img, label, prediction, avg_loss, acc, py_reader + + +def test(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) - test_program = fluid.default_main_program().clone(for_test=True) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_loss) + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=False) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) def train_test(train_test_program, train_test_feed, train_test_reader): acc_set = [] @@ -87,16 +88,33 @@ def train(use_cuda, thread_num, cpu_num): avg_loss_val_mean = numpy.array(avg_loss_set).mean() return avg_loss_val_mean, acc_val_mean - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=fluid.default_main_program(), + train_test_reader=test_reader, + train_test_feed=feeder) + + print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + assert acc_val > 0.96 + + +def train(use_cuda, thread_num, cpu_num): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + print("paddle is not compiled with cuda, exit!") + return + + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) - test_reader = 
paddle.batch( - paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -106,11 +124,11 @@ def train(use_cuda, thread_num, cpu_num): print("thread_num:" + str(thread_num)) build_strategy = fluid.BuildStrategy() - build_strategy.async_mode = True # enable async mode + build_strategy.async_mode = True exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 1 + exec_strategy.num_iteration_per_run = 10 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -126,37 +144,39 @@ def train(use_cuda, thread_num, cpu_num): step = 0 try: while True: - print("step %d in" % step) loss_val = pe.run(fetch_list=[avg_loss.name]) loss_val = numpy.mean(loss_val) - if step % 1 == 0: + if step % 100 == 0: print("Batch %d, Cost %f, queue size %d" % (step, loss_val, py_reader.queue.size())) step += 1 except fluid.core.EOFException: + print("train end") py_reader.reset() - """ - step = 0 - for step_id, data in enumerate(train_reader()): - loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) - loss_val = numpy.mean(loss_val) - if step % 100 == 0: - print("Batch %d, Cost %f" % (step, loss_val)) - step += 1 - """ - - # test for epoch - avg_loss_val, acc_val = train_test( - train_test_program=test_program, - train_test_reader=test_reader, - train_test_feed=feeder) - print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + return step class TestAsyncSSAGraphExecutor(unittest.TestCase): def test_check_async_ssa_exe_train(self): - train(use_cuda=False, thread_num=2, cpu_num=2) + step_list = [] + for cpu_num in [1, 2, 4]: + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard( + fluid.Program(), startup_program=fluid.Program()): + start_time = time.time() + step = train( + use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num) + end_time = time.time() + step_list.append(step) + print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) + + " time -> " + str(end_time - start_time)) + with fluid.program_guard( + fluid.Program(), startup_program=fluid.Program()): + test() + assert step_list[0] / 2 == step_list[1] + assert step_list[1] / 2 == step_list[2] if __name__ == "__main__": From 657a4f9430913da999b025a55c213c5c9e603a73 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 21:40:51 +0800 Subject: [PATCH 21/98] code can compile --- .../operators/distributed/parameter_send.cc | 48 ++++++++++--------- .../operators/distributed/parameter_send.h | 14 +++--- .../operators/distributed_ops/CMakeLists.txt | 4 +- .../operators/distributed_ops/send_op.cc | 5 +- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 09fce06b5a..38b64c3fcd 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -38,27 +38,27 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void send(const std::string& var_name, - const std::vector& send_varnames, - const std::vector& epmap, - const std::vector& height_sections, - const framework::ExecutionContext& ctx, const framework::Scope& scope, - bool sync) { - framework::Scope* local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); - auto& cpu_ctx = *pool.Get(platform::CPUPlace()); - auto& actual_ctx = *pool.Get(ctx.GetPlace()); - - distributed::RPCClient* rpc_client = +void ParameterSend::operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &ctx, + const framework::Scope &scope, bool sync) { + framework::Scope *local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + auto &actual_ctx = *pool.Get(ctx.GetPlace()); + + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto* send_var = scope.FindVar(var_name); + auto *send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); if (send_var->IsType()) { - auto& send_tensor = send_var->Get(); - auto& send_tensor_dims = send_tensor.dims(); + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); std::vector outs_dims; outs_dims.reserve(out_num); @@ -89,13 +89,13 @@ void send(const std::string& var_name, // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - auto* out = + auto *out = local_scope->Var(send_varnames[i])->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } } else if (send_var->IsType()) { - auto& send_slr = send_var->Get(); + auto &send_slr = send_var->Get(); auto abs_sections = ToAbsoluteSection(height_sections); auto send_rows = send_slr.rows(); @@ -109,9 +109,9 @@ void send(const std::string& var_name, auto src = send_slr.value().data(); // create output var in local scope - std::vector outs; - for (auto& name : send_varnames) { - auto* out = local_scope->Var(name)->GetMutable(); + std::vector outs; + for (auto &name : send_varnames) { + auto *out = local_scope->Var(name)->GetMutable(); outs.push_back(out); } @@ -163,8 +163,8 @@ void send(const std::string& var_name, std::vector rets; for (size_t i = 0; i < send_varnames.size(); i++) { - auto& send_var_name = send_varnames[i]; - auto& endpoint = epmap[i]; + auto &send_var_name = send_varnames[i]; + auto &endpoint = epmap[i]; if (NeedSend(*local_scope, send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, @@ -183,6 +183,8 @@ void send(const std::string& var_name, delete local_scope; } +template struct ParameterSend; + }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 6272cc5d25..1746377228 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -24,12 +24,14 @@ namespace operators { namespace distributed { template -void send(const std::string& var_name, - const std::vector& send_varnames, - const std::vector& epmap, - const std::vector& height_sections, - const framework::ExecutionContext& context, - const framework::Scope& scope, bool sync); +struct ParameterSend { + void operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &context, + const framework::Scope &scope, bool sync); +}; }; // namespace 
distributed }; // namespace operators diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index a8bb597cbd..0eb30ce695 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 2136670103..e7ccaa83de 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -51,8 +51,9 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); - distributed::send(ins[0], send_varnames, epmap, height_sections, - exe_ctx, scope, static_cast(sync_send)); + auto send_functor = distributed::ParameterSend(); + send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, + scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 249f48e5397359696f1c2844473f4dcf55ce0ebe Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 29 Jan 2019 07:10:00 +0800 Subject: [PATCH 22/98] update test test=develop --- .../tests/unittests/test_async_ssa_graph_executor_mnist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 1104604970..41fa39e06b 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -175,8 +175,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): with fluid.program_guard( fluid.Program(), startup_program=fluid.Program()): test() - assert step_list[0] / 2 == step_list[1] - assert step_list[1] / 2 == step_list[2] + assert int(step_list[0] / 2) == int(step_list[1]) + assert int(step_list[1] / 2) == int(step_list[2]) if __name__ == "__main__": From b1fe8d45709e0d7d0dcde4e969b5fc4e833320c6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Feb 2019 09:48:00 +0800 Subject: [PATCH 23/98] add a check for async_ssa_graph_exe test=develop --- .../framework/details/async_ssa_graph_executor.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e21d5fb96d..79b390dde4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -30,6 +30,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), 
local_scopes_.size()); + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graphs_[0]->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } + // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL From 741b7cfda9e6b921fba69b7a6ed904a3b5406f02 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Feb 2019 23:02:47 +0800 Subject: [PATCH 24/98] fix compile test=develop --- paddle/fluid/operators/distributed/parameter_send.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 38b64c3fcd..efe094fd1f 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -48,7 +48,6 @@ void ParameterSend::operator()(const std::string &var_name, platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - auto &actual_ctx = *pool.Get(ctx.GetPlace()); distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( From 4356f186b4a3015ea1a2877e60f1d8a05fe5312d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Feb 2019 11:08:12 +0800 Subject: [PATCH 25/98] complete parameter_send --- .../operators/distributed/parameter_send.cc | 42 ++++++----------- .../operators/distributed_ops/send_op.cc | 2 +- .../fluid/tests/unittests/test_dist_base.py | 5 ++ .../fluid/transpiler/distribute_transpiler.py | 47 +++++++++++++------ 4 files changed, 54 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index efe094fd1f..47ca42c790 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -56,25 +56,13 @@ void ParameterSend::operator()(const std::string &var_name, auto *send_var = scope.FindVar(var_name); size_t out_num = send_varnames.size(); if (send_var->IsType()) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - int num = ctx.Attr("num"); - if (num > 0) { - int64_t in_axis_dim = send_tensor_dims[0]; - PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, - "tensor split does not result" - " in an equal division"); - size_t out_axis_dim = in_axis_dim / num; - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = out_axis_dim; - outs_dims.push_back(dim); - } - } else if (height_sections.size() > 0) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape PADDLE_ENFORCE_EQ(height_sections.size(), out_num, "tensor split sections size" "should be equal to output size."); @@ -83,15 +71,15 @@ void ParameterSend::operator()(const std::string &var_name, dim[0] = height_sections[i]; outs_dims.push_back(dim); } - } - // create output var in local scope - size_t row_offset = 0; - for (auto i = 0; i < out_num; ++i) { - auto *out = - local_scope->Var(send_varnames[i])->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset 
+= outs_dims[i][0]; + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + auto *out = + local_scope->Var(send_varnames[i])->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } } } else if (send_var->IsType()) { auto &send_slr = send_var->Get(); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index e7ccaa83de..0f0ad6b8f9 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -42,7 +42,7 @@ class SendOp : public framework::OperatorBase { int sync_send = Attr("sync_mode"); auto send_varnames = Attr>("send_varnames"); - auto height_sections = Attr>("height_sections"); + auto height_sections = Attr>("sections"); if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0968ace62b..758c510dc7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -48,6 +48,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd + config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, @@ -87,6 +88,9 @@ class TestDistRunnerBase(object): args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() + with open("/tmp/trainer." + str(args.trainer_id) + ".proto", + "w") as f: + f.write(str(trainer_prog)) elif args.update_method == "nccl2": # transpile for nccl2 config = fluid.DistributeTranspilerConfig() @@ -115,6 +119,7 @@ class TestDistRunnerBase(object): strategy.allow_op_delay = False build_stra = fluid.BuildStrategy() + build_stra.debug_graphviz_path = "/tmp/graph-" + str(args.trainer_id) if args.use_reduce: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index a3293afbbd..1b1b416593 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -156,6 +156,8 @@ class DistributeTranspilerConfig(object): mode = "pserver" print_log = False wait_port = True + # split the send recv var in runtime + runtime_split_send_recv = False class DistributeTranspiler(object): @@ -398,8 +400,10 @@ class DistributeTranspiler(object): orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg( program.global_block(), splited_grad_varname, reverse=True) - self._insert_split_op(program, orig_var, index, splited_vars) - index += 1 + if not self.config.runtime_split_send_recv: + self._insert_split_op(program, orig_var, index, + splited_vars) + index += 1 else: AssertionError("Can not insert the send op by original " "variable name :", splited_grad_varname) @@ -408,6 +412,17 @@ class DistributeTranspiler(object): name=framework.generate_control_dev_var_name()) self.grad_name_to_send_dummy_out[grad_varname] = dummy_output + if self.config.runtime_split_send_recv: + send_input_vars = [ + program.global_block().vars[splited_grad_varname] + ] + sections = 
self._get_splited_var_sections(splited_vars) + send_varnames = [var.name for var in splited_vars] + else: + send_input_vars = splited_vars + sections = [] + send_varnames = [] + # get send op_role_var, if not splited, the grad should have .trainer suffix # if splited, grad should be the original grad var name (split_by_ref and send # will be on the same place). ParallelExecutor @@ -415,10 +430,12 @@ class DistributeTranspiler(object): program.global_block()._insert_op( index=index + 1, type="send", - inputs={"X": splited_vars}, + inputs={"X": send_input_vars}, outputs={"Out": dummy_output}, attrs={ "epmap": eplist, + "sections": sections, + "send_varnames": send_varnames, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ self.grad_name_to_param_name[grad_varname], @@ -1372,9 +1389,8 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops - if 'Param' in op.input_names and op.input("Param")[0] == - self.table_name + op for op in self.optimize_ops if 'Param' in op.input_names and + op.input("Param")[0] == self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ @@ -1548,11 +1564,17 @@ class DistributeTranspiler(object): lod_level=var.lod_level, persistable=persistable) + @staticmethod + def _get_splited_var_sections(splited_vars): + height_sections = [] + for v in splited_vars: + height_sections.append(v.shape[0]) + return height_sections + def _insert_split_op(self, program, orig_var, index, splited_vars): + height_sections = self._get_splited_var_sections(splited_vars) + if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS: - height_sections = [] - for v in splited_vars: - height_sections.append(v.shape[0]) sparse_param_name = self.grad_name_to_param_name[orig_var.name] if self._is_input_of_remote_sparse_update_op(sparse_param_name): self.sparse_param_to_height_sections[ @@ -1567,16 +1589,13 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: - sections = [] - for v in splited_vars: - sections.append(v.shape[0]) program.global_block()._insert_op( index=index + 1, type="split_byref", inputs={"X": orig_var}, outputs={"Out": splited_vars}, attrs={ - "sections": sections, + "sections": height_sections, RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) else: @@ -2048,7 +2067,7 @@ class DistributeTranspiler(object): Get optimizer operators, parameters and gradients from origin_program Returns: opt_ops (list): optimize operators. - params_grads (dict): paramter->gradient. + params_grads (dict): parameter->gradient. 
""" block = self.origin_program.global_block() opt_ops = [] From 5c36eb8b6962446e95840f775f87308d0df32ff6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Feb 2019 20:36:31 +0800 Subject: [PATCH 26/98] fix build --- paddle/fluid/operators/distributed/parameter_send.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 47ca42c790..fd97926623 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -75,8 +75,8 @@ void ParameterSend::operator()(const std::string &var_name, // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - auto *out = - local_scope->Var(send_varnames[i])->GetMutable(); + framework::Tensor *out = local_scope->Var(send_varnames[i]) + ->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; } @@ -161,7 +161,8 @@ void ParameterSend::operator()(const std::string &var_name, } } - if (sync) { + // note!! only support sync send now + if (true || sync) { for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } From 5cf0092825a9625018e8856931cbdb8ff15b71a5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Feb 2019 14:19:21 +0800 Subject: [PATCH 27/98] add more log and fix test_dist_base in multi_batch_merge_pass --- paddle/fluid/framework/details/build_strategy.cc | 2 ++ paddle/fluid/framework/ir/pass.cc | 1 + python/paddle/fluid/tests/unittests/test_dist_base.py | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 51ce973272..ca9843057d 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -177,11 +177,13 @@ std::unique_ptr BuildStrategy::Apply( #else const bool use_cuda) const { #endif + VLOG(3) << "apply all passes"; // Create a default one if not finalized by user. 
CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(3) << "apply " << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 33ccee6aa0..823697495e 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -19,6 +19,7 @@ namespace paddle { namespace framework { namespace ir { std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + VLOG(3) << "apply pass -> " << Type(); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 758c510dc7..98e6923c11 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -128,8 +128,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() - mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") + mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": From a0585d08ed42aa9caeefe1973549b6dd69d46823 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Feb 2019 20:44:18 +0800 Subject: [PATCH 28/98] init parameter recv --- .../operators/distributed/CMakeLists.txt | 3 +- .../operators/distributed/parameter_recv.cc | 178 ++++++++++++++++++ .../operators/distributed/parameter_recv.h | 38 ++++ .../operators/distributed_ops/CMakeLists.txt | 4 +- .../operators/distributed_ops/recv_op.cc | 5 + 5 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/distributed/parameter_recv.cc create mode 100644 paddle/fluid/operators/distributed/parameter_recv.h diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 03f47b594d..231f4b3bc4 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) @@ -53,6 +53,7 @@ cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) 
if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc new file mode 100644 index 0000000000..e5b486d121 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_recv.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterRecv::operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &ctx, + const framework::Scope &scope, bool sync) { + framework::Scope *local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance( + ctx.Attr("trainer_id")); + + auto *send_var = scope.FindVar(var_name); + size_t out_num = send_varnames.size(); + if (send_var->IsType()) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = height_sections[i]; + outs_dims.push_back(dim); + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + framework::Tensor *out = local_scope->Var(send_varnames[i]) + ->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } + } else if (send_var->IsType()) { + auto &send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(height_sections); + + auto send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / 
send_slr.value().dims()[0]; + auto src = send_slr.value().data(); + + // create output var in local scope + std::vector outs; + for (auto &name : send_varnames) { + auto *out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + int out_idx = FindOutIdx(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); + } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + outs[i]->mutable_rows()->clear(); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + + } else { + PADDLE_THROW("unsupported var type to send!"); + } + + std::vector rets; + for (size_t i = 0; i < send_varnames.size(); i++) { + auto &send_var_name = send_varnames[i]; + auto &endpoint = epmap[i]; + if (NeedSend(*local_scope, send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, + send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + } + } + + // note!! only support sync send now + if (true || sync) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + + delete local_scope; +} + +template struct ParameterRecv; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h new file mode 100644 index 0000000000..817115e2d1 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterRecv { + void operator()(const std::string &var_name, + const std::vector &send_varnames, + const std::vector &epmap, + const std::vector &height_sections, + const framework::ExecutionContext &context, + const framework::Scope &scope, bool sync); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 0eb30ce695..3bcfc532e8 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 120c65f296..5e004a7a3c 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -110,6 +110,11 @@ This operator can get variables from server side. "for example: we need var named 'moment_1@127.0.0.1:1001', " "and it real name on parameter server is 'moment_1'. 
") .SetDefault({}); + AddAttr>( + "recv_varnames", + "(vector) " + "the splited parameter varnames to be recved from pserver") + .SetDefault(std::vector{}); } }; From a804a2ae2ada43244774cebc349b08b6bd65ecfd Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 11:14:58 +0800 Subject: [PATCH 29/98] complete parameter recv --- .../operators/distributed/parameter_recv.cc | 141 ++++-------------- .../operators/distributed/parameter_recv.h | 5 +- 2 files changed, 34 insertions(+), 112 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index e5b486d121..2664a89ed6 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" +#include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { namespace operators { @@ -39,11 +40,10 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const std::string &var_name, - const std::vector &send_varnames, + const std::vector &recv_varnames, const std::vector &epmap, - const std::vector &height_sections, const framework::ExecutionContext &ctx, - const framework::Scope &scope, bool sync) { + const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -53,118 +53,41 @@ void ParameterRecv::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *send_var = scope.FindVar(var_name); - size_t out_num = send_varnames.size(); - if (send_var->IsType()) { - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ(height_sections.size(), out_num, - "tensor split sections size" - "should be equal to output size."); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (auto i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(send_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } + auto *recv_var = scope.FindVar(var_name); + + std::vector recved_tensors; + + // recv all vars to local scope + if (recv_var->IsType()) { + std::vector rets; + for (size_t i = 0; i < recv_varnames.size(); i++) { + auto &recv_var_name = recv_varnames[i]; + framework::Tensor *t = + local_scope->Var(recv_var_name)->GetMutable(); + recved_tensors.push_back(t); + VLOG(3) << "recv " << recv_var_name << " from " << epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(epmap[i], cpu_ctx, *local_scope, + recv_var_name, recv_var_name)); } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - auto abs_sections = ToAbsoluteSection(height_sections); - - auto send_rows = send_slr.rows(); - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - outs_rows_idx.resize(out_num); - outs_dense_idx.resize(out_num); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto src = send_slr.value().data(); - - // create output 
var in local scope - std::vector outs; - for (auto &name : send_varnames) { - auto *out = local_scope->Var(name)->GetMutable(); - outs.push_back(out); - } - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = FindOutIdx(send_rows[i], abs_sections); - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - auto place = ctx.GetPlace(); - - for (size_t i = 0; i < outs_rows_idx.size(); ++i) { - auto rows_idx = outs_rows_idx[i]; - outs[i]->set_height(height_sections[i]); - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); - outs[i]->mutable_rows()->clear(); - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); - } - auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy( - platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), - src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); - } else { -#ifdef PADDLE_WITH_CUDA - auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CUDAPlace(), dst + j * row_numel, - platform::CUDAPlace(), - src + outs_dense_idx[i][j] * row_numel, - sizeof(T) * row_numel, stream); -#else - PADDLE_THROW("Paddle is not compiled with GPU"); -#endif - } - } - } - PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), - "rows should has the same size with tensor dim 0"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } - } else { PADDLE_THROW("unsupported var type to send!"); } - std::vector rets; - for (size_t i = 0; i < send_varnames.size(); i++) { - auto &send_var_name = send_varnames[i]; - auto &endpoint = epmap[i]; - if (NeedSend(*local_scope, send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, - send_var_name)); - } else { - VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; - } - } - - // note!! 
only support sync send now - if (true || sync) { - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + // concat recved tensor into one var + { + size_t output_offset = 0; + framework::Tensor *recv_tensor = + recv_var->GetMutable(); + for (auto *in : recved_tensors) { + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + ctx.device_context(), 0, recv_tensor->data() + output_offset, + out_stride, in->data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; } } diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index 817115e2d1..bc6f5f5adf 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -26,11 +26,10 @@ namespace distributed { template struct ParameterRecv { void operator()(const std::string &var_name, - const std::vector &send_varnames, + const std::vector &recv_varnames, const std::vector &epmap, - const std::vector &height_sections, const framework::ExecutionContext &context, - const framework::Scope &scope, bool sync); + const framework::Scope &scope); }; }; // namespace distributed From fbd186bd5d6dced8255607f9b6266cd438c564dc Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 14:18:14 +0800 Subject: [PATCH 30/98] complete recv op --- .../operators/distributed_ops/recv_op.cc | 58 ++++++++++++------- .../fluid/transpiler/distribute_transpiler.py | 25 +++++--- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 5e004a7a3c..a0185d66f0 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -48,32 +49,45 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient::GetInstance( Attr("trainer_id")); - if (with_barrier) { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVar"; - rets.push_back( - rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); - } - if (sync_mode) { + std::vector recv_varnames = + Attr>("recv_varnames"); + + if (recv_varnames.size() > 0) { + framework::RuntimeContext ctx(Inputs(), Outputs(), scope); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(place); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto recv_functor = distributed::ParameterRecv(); + recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); + } else { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? 
outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } } - } else { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVarNoBarrier"; - rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, - varname, outs[i])); - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } } } }; diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1b1b416593..ae7deda897 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -519,12 +519,20 @@ class DistributeTranspiler(object): param_varname, height_sections, eps, table_names) else: all_recv_outputs.extend(splited_var) + + recv_varnames = [] + if self.config.runtime_split_send_recv: + orig_param = program.global_block().vars[param_varname] + recv_varnames = [var.name for var in splited_vars] + splited_var = [orig_param] + program.global_block().append_op( type="recv", inputs={"X": [recv_dep_in]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, + "recv_varnames": recv_varnames, "trainer_id": self.trainer_id, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: @@ -549,14 +557,15 @@ class DistributeTranspiler(object): continue orig_param = program.global_block().vars[param_varname] if param_varname not in self.sparse_param_to_height_sections: - program.global_block().append_op( - type="concat", - inputs={"X": splited_var}, - outputs={"Out": [orig_param]}, - attrs={ - "axis": 0, - RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE - }) + if not self.config.runtime_split_send_recv: + program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={ + "axis": 0, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) From 8bda4ab213c52871435fc6d74ef51d16b9f3235e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Feb 2019 18:22:50 +0800 Subject: [PATCH 31/98] parameter recv can run --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index ae7deda897..b9b0cd24eb 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -518,13 +518,12 @@ class DistributeTranspiler(object): 
self._update_remote_sparse_update_op( param_varname, height_sections, eps, table_names) else: - all_recv_outputs.extend(splited_var) - recv_varnames = [] if self.config.runtime_split_send_recv: orig_param = program.global_block().vars[param_varname] - recv_varnames = [var.name for var in splited_vars] + recv_varnames = [var.name for var in splited_var] splited_var = [orig_param] + all_recv_outputs.extend(splited_var) program.global_block().append_op( type="recv", From e72637ddd22765dd915119b96bc1821734cd28ef Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 9 Feb 2019 17:11:46 +0800 Subject: [PATCH 32/98] ThreadedSSAGraphExecutor support num_iteration_per_run test=develop --- .../details/async_ssa_graph_executor.cc | 16 ------------ .../details/threaded_ssa_graph_executor.cc | 25 +++++++++++++++++-- .../details/threaded_ssa_graph_executor.h | 1 + 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 79b390dde4..5ce92ad826 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -30,19 +30,6 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - if (strategy_.num_iteration_per_run_ > 1) { - int read_op_num = 0; - for (auto *node : graphs_[0]->Nodes()) { - if (node->IsOp() && node->Name() == "read") { - read_op_num++; - } - } - if (read_op_num == 0) { - LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " - "should use pyreader to feed data!"; - } - } - // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL @@ -69,9 +56,6 @@ FeedFetchList AsyncSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { try { - for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { - executors_[i]->Run(fetch_tensors); - } return executors_[i]->Run(fetch_tensors); } catch (...) 
{ exception_holder_.Catch(std::current_exception()); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a293794..16fa2a6db6 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -32,9 +32,22 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( places_(places), fetch_ctxs_(places), running_ops_(0), - strategy_(strategy) {} + strategy_(strategy) { + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graph_->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } +} -FeedFetchList ThreadedSSAGraphExecutor::Run( +inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( const std::vector &fetch_tensors) { std::unique_ptr event( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); @@ -140,6 +153,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( return fetch_data; } +FeedFetchList ThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + RunImpl({}); + } + return RunImpl(fetch_tensors); +} + void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector *fetch_ops, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09e..3809b6e9ae 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() final = default; private: + inline FeedFetchList RunImpl(const std::vector &fetch_tensors); void RunOp(const std::shared_ptr> &ready_var_q, details::OpHandleBase *op); From 84367cf8bc4195d82dc1851d116980746f7c68b6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Feb 2019 19:58:50 +0800 Subject: [PATCH 33/98] support async mode in dist mode parallel executor --- .../details/multi_devices_graph_pass.cc | 35 ++++++++++++++++--- .../details/multi_devices_graph_pass.h | 12 +++---- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index f1347e2b0d..a2bbfc91b7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -167,6 +167,10 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( bool is_forwarding = true; bool insert_collection_ops = NeedCollectiveOps(); + if (strategy_.async_mode_) { + // async mode did not need to merge gradient + insert_collection_ops = false; + } for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -192,8 +196,22 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); + // optimize op is already processed in DealWithSpecialOp, + // here we only consider backward op if (!is_bk_op) continue; + /* + * the op that will generate the gradient of on parameter will have + one attr op_role_var + * to record the parameter 
and gradient, like: + attrs { + name: "op_role_var" + type: STRINGS + strings: "fc_1.b_0" + strings: "fc_1.b_0@GRAD" + } + */ + // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. auto backward_vars = @@ -204,7 +222,7 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + VLOG(3) << "Bcast " << g_name << " for parameter " << p_name; InsertCollectiveOp(&result, p_name, g_name); } @@ -385,7 +403,7 @@ void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const { + size_t dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); @@ -454,9 +472,8 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( } } -VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( + ir::Graph *result, const std::string &og, size_t dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -720,6 +737,10 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { + // in async_mode, each graph will send it's own gradient. + if (strategy_.async_mode_ && node->Op()->Type() == "send") { + return false; + } int op_dev_id = CreateRPCOp(result, node); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -737,6 +758,8 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { + // the input(block of parameter) of concat is on different device, + // the output(parameter) will on one device. auto origin_param_name = node->Op()->OutputArgumentNames()[0]; bcast_var_name_set_[op_dev_id].emplace(origin_param_name); } @@ -744,6 +767,7 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else { int op_dev_id = GetOpDeviceID(node); if (op_dev_id != -1) { // This op only runs on one specific device. + // optimize op will be processed here. 
CreateComputationalOp(result, node, op_dev_id); for (ir::Node *n : node->outputs) { sharded_var_device_.emplace(n->Name(), op_dev_id); @@ -905,6 +929,7 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const { + // collective gradient to each device size_t cur_device_id = 0; switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index e91397816c..377ba50fcc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -68,10 +68,10 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, - int dst_dev_id) const; + size_t dst_dev_id) const; void CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const; + size_t dev_id) const; bool IsSparseGradient(const std::string &og) const; @@ -118,16 +118,16 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: - virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, - const std::string &g_name) const {} + void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const override {} bool NeedCollectiveOps() const override { return false; } - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { return false; } - virtual void InsertPostprocessOps(ir::Graph *result) const {} + void InsertPostprocessOps(ir::Graph *result) const override {} }; class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { From c4ded17e8cbcbf33e68145c1a4ffe777582bf3ab Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Feb 2019 09:19:48 +0800 Subject: [PATCH 34/98] async mode support dist train --- paddle/fluid/framework/details/build_strategy.cc | 6 +++--- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 7 ++++++- paddle/fluid/framework/parallel_executor.cc | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index a286cb30a2..e917395259 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -133,10 +133,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.async_mode_) { - multi_devices_pass = AppendPass("async_multi_devices_pass").get(); - } else if (strategy_.is_distribution_) { + if (strategy_.is_distribution_) { multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { multi_devices_pass = diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a2bbfc91b7..572d374b50 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -756,6 +756,11 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, insert_op = true; need_broadcast_var_ = true; } else if (OpHaveRole(*node, OpRole::kDist)) { + // in async_mode, each graph will send it's own gradient, do not need to + // merge gradient. + if (strategy_.async_mode_ && node->Op()->Type() != "concat") { + return false; + } int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { // the input(block of parameter) of concat is on different device, @@ -827,7 +832,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - if (recv_param_grad.size() == 2U) { + if (recv_param_grad.size() == 2U && !strategy_.async_mode_) { op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f0bc3acccc..c85fe4f200 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -283,7 +283,7 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } #else - if (build_strategy.async_mode_) { + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, From 2171aa77f100b53c59b8dfd615f2a7ebcf447b77 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Feb 2019 09:29:36 +0800 Subject: [PATCH 35/98] async ssa exe only support local mode --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c85fe4f200..e8531cd8d8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -333,7 +333,7 @@ ParallelExecutor::ParallelExecutor( "please don't pass loss_var_name."; } } - if (build_strategy.async_mode_) { + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, From 9465c3d0c393f7e7c5665f561433ca65e193396c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 16:28:38 +0800 Subject: [PATCH 36/98] fix compile problem --- paddle/fluid/framework/parallel_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfadfb57db..67ccf04d05 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/all_reduce_deps_pass.h" +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -260,6 +260,7 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp std::unique_ptr graph; + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, @@ -273,10 +274,9 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); } #endif auto max_memory_size = GetEagerDeletionThreshold(); From 7f3be09045e349ef9028337083604c1d3a126169 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 17:08:56 +0800 Subject: [PATCH 37/98] fix multi graph test=develop --- .../fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 46 +++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1b0ec02910..e5c108f890 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -249,6 +249,7 @@ std::unique_ptr BuildStrategy::Apply( graph = pass->Apply(std::move(graph)); VLOG(3) << "Finish Apply Pass " << pass->Type(); } + VLOG(3) << "All Passes Applied"; return graph; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 67ccf04d05..ecae729124 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -259,14 +259,15 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::unique_ptr graph; std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + VLOG(3) << "use local async mode"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, @@ -274,39 +275,44 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); } } else { - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_); + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); } #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -326,7 +332,7 @@ ParallelExecutor::ParallelExecutor( // allreduce_seq_pass doesn't need it as the attr. 
member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, main_program, - std::move(graph))); + std::move(graphs[0]))); #else PADDLE_THROW( "Paddle should be compiled with CUDA for ParallelGraph Execution."); @@ -336,12 +342,12 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs[0]))); } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs[0]))); } } From 12f6b8c3d623d166e77b77eb11837783ffc5fe42 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 21 Feb 2019 18:23:31 +0800 Subject: [PATCH 38/98] change the include of ThreadPool.h test=develop --- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3809b6e9ae..ae9cb1ebca 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -21,8 +21,8 @@ #include #include +#include // ThreadPool in thrird party #include -#include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" From f4f4816b0c1ffdf7689523f732cd728c196e5aff Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 22 Feb 2019 16:26:50 +0800 Subject: [PATCH 39/98] fix gpu error test=develop --- .../details/async_ssa_graph_executor.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 5ce92ad826..0780fb040a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -29,6 +29,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(graphs_.size, local_scopes_.size()); // set the correct size of thread pool to each device. 
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ecae729124..cfd6609a4b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -261,10 +261,21 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); + if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + VLOG(3) << "use local async mode"; + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; From ecedd531c1ba9b68a1f24bce9b7b98ced67cc128 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 22 Feb 2019 16:37:40 +0800 Subject: [PATCH 40/98] fix code bug test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 0780fb040a..a584b3a708 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -29,7 +29,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - PADDLE_ENFORCE_EQ(graphs_.size, local_scopes_.size()); + PADDLE_ENFORCE_EQ(graphs_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. 
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() From b5b8e6cc9c0b219d9fea2c43944798509f035d04 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 23 Feb 2019 09:28:56 +0800 Subject: [PATCH 41/98] revert the change of scope test=develop --- paddle/fluid/framework/scope.cc | 27 --------------------------- paddle/fluid/framework/scope.h | 1 - 2 files changed, 28 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 4fe843dde9..87f0f307d3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -255,32 +255,5 @@ std::string GenScopeTreeDebugInfo(Scope* root) { return os.str(); } -std::string GenParentScopeTreeDebugInfo(Scope* leaf) { - std::stringstream os; - - if (!leaf) return ""; - - // level traversal - std::vector scopes; - const Scope* current_scope = leaf; - - while (current_scope != nullptr) { - scopes.push_back(current_scope); - current_scope = current_scope->parent(); - } - - os << "\n--------------GenParentScopeTreeDebugInfo--------------\n"; - - for (int i = scopes.size() - 1; i >= 0; --i) { - os << "=======level [" << i << "]=======\n"; - os << scopes[i] << ":\n"; - for (auto& var : scopes[i]->LocalVarNames()) { - os << " - " << var << "\n"; - } - } - - return os.str(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index eb5c12def6..f0915d2eee 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -144,7 +144,6 @@ class Scope { // Generate some debug string about the inherience structure of scope, quite // naive. std::string GenScopeTreeDebugInfo(Scope*); -std::string GenParentScopeTreeDebugInfo(Scope*); } // namespace framework } // namespace paddle From 10393dd0d16e57203b8cb039174cff97b6efbc89 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 10:09:25 +0800 Subject: [PATCH 42/98] add some check test=develop --- paddle/fluid/framework/parallel_executor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cfd6609a4b..8236773672 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -217,6 +217,11 @@ ParallelExecutor::ParallelExecutor( } } + if (build_strategy.async_mode_) { + PADDLE_ENFORCE(!member_->use_cuda_, + "gpu mode does not support async_mode_ now!"); + } + // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. 
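[Editor's illustration] The executor work in the surrounding patches settles on a simple fan-out pattern: a single SSA graph shared by all workers (see the "use one graph" change below), one local scope and one CPU place per worker, each worker running the graph independently, and any exception being recorded during the run and re-thrown only after every worker has finished. The following minimal C++ sketch shows that pattern using the standard library only; Graph, Scope, Place and RunOneGraph are illustrative stand-ins, not Paddle types or APIs, and details such as fetch-result merging, the thread pool and ExceptionHolder are deliberately simplified.

// Minimal sketch of the per-place asynchronous fan-out pattern.
// NOT Paddle code: all names below are stand-ins.
#include <exception>
#include <future>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

struct Graph { std::string name; };  // shared, read-only graph
struct Scope { int id; };            // one local scope per worker
struct Place { int device_id; };     // one CPU place per worker

// Stand-in for running the shared graph on a single place.
std::string RunOneGraph(const Graph& g, Scope* scope, const Place& place) {
  return g.name + " ran on place " + std::to_string(place.device_id) +
         " with scope " + std::to_string(scope->id);
}

int main() {
  Graph graph{"ssa_graph"};
  std::vector<Scope> scopes{{0}, {1}, {2}};
  std::vector<Place> places{{0}, {1}, {2}};

  std::vector<std::future<std::string>> futures;
  std::exception_ptr first_exception;  // plays the role of ExceptionHolder
  std::mutex ex_mu;

  // Fan out: one asynchronous run per place, all sharing the same graph.
  for (size_t i = 0; i < places.size(); ++i) {
    futures.emplace_back(
        std::async(std::launch::async, [&, i]() -> std::string {
          try {
            return RunOneGraph(graph, &scopes[i], places[i]);
          } catch (...) {
            std::lock_guard<std::mutex> lock(ex_mu);
            if (!first_exception) first_exception = std::current_exception();
            return std::string();
          }
        }));
  }

  // Join every worker first, then surface the first captured exception.
  std::vector<std::string> results;
  for (auto& f : futures) results.emplace_back(f.get());
  if (first_exception) std::rethrow_exception(first_exception);

  for (const auto& r : results) std::cout << r << "\n";
  return 0;
}

Recording exceptions inside the worker and re-throwing them from the caller, rather than letting a worker throw across threads, mirrors the Catch/ReThrow split visible in the executor diffs above and keeps sibling workers from being torn down mid-iteration.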
From 43c82376cba493bf622d452741c395da275f0a1b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 22:39:34 +0800 Subject: [PATCH 43/98] use one graph --- .../details/async_ssa_graph_executor.cc | 7 +- .../details/async_ssa_graph_executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 66 ++++++++----------- 3 files changed, 33 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index a584b3a708..b6d1ee5073 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -21,15 +21,14 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs) + std::unique_ptr &&graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)) { + graph_(std::move(graph)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - PADDLE_ENFORCE_EQ(graphs_.size(), local_scopes_.size()); // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() @@ -39,7 +38,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + strategy_, {local_scopes_[i]}, {places_[i]}, graph_.get())); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 4091c56d74..50f207361f 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,9 +29,9 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs); + std::unique_ptr &&graph); ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graphs_[0]; } + const ir::Graph &Graph() const override { return *graph_; } FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - std::vector> graphs_; + std::unique_ptr graph_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8236773672..129d3a7f0d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -264,71 +264,59 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::vector> graphs; + std::unique_ptr graph; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } + graph = + build_strategy.Apply(main_program, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); - } + graph = build_strategy.Apply(main_program, {member_->places_[0]}, + loss_var_name, {member_->local_scopes_[0]}, + member_->nranks_, member_->use_cuda_); } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); } #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - for (size_t i = 0; i < graphs.size(); ++i) { - graphs[i] = member_->PrepareGCAndRefCnts( - std::move(graphs[i]), static_cast(max_memory_size)); - } + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &graph : graphs) { - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graphs[0]); + size_t graph_num = ir::GraphNum(*graph); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graphs[0]) + << ir::GraphNum(*graph) << " sub_graphs. 
If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -340,7 +328,7 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs))); + std::move(graph))); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA @@ -358,12 +346,12 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } } From dab7f36909a61af51beacd145228bb2a4acc4db5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 22:49:03 +0800 Subject: [PATCH 44/98] optimize code test=develop --- .../details/async_ssa_graph_executor.cc | 6 ++-- .../details/async_ssa_graph_executor.h | 4 +-- paddle/fluid/framework/parallel_executor.cc | 30 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index b6d1ee5073..8757842996 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -21,12 +21,12 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph) + ir::Graph* graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graph_(std::move(graph)) { + graph_(graph) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); @@ -38,7 +38,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, graph_.get())); + strategy_, {local_scopes_[i]}, {places_[i]}, graph_)); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 50f207361f..8536852a00 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,7 +29,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); ~AsyncSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graph_; } @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - std::unique_ptr graph_; + ir::Graph *graph_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a498ec5b0b..081d06b6aa 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -269,25 +269,26 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = - build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); } else { - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, - loss_var_name, {member_->local_scopes_[0]}, - member_->nranks_, member_->use_cuda_); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_); } else { - temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, 
member_->nranks_, member_->use_cuda_); } #endif @@ -333,8 +334,7 @@ ParallelExecutor::ParallelExecutor( if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - graph)); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA From ff01d705835c5e1ccac4d9f1e109725bf6efeb53 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Feb 2019 23:31:56 +0800 Subject: [PATCH 45/98] fix style test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 8757842996..21741667a3 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -20,8 +20,7 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - ir::Graph* graph) + const std::vector &places, ir::Graph *graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), From f768fbf7157e4b500de3aa456beddaa138f00cd5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 26 Feb 2019 15:01:59 +0800 Subject: [PATCH 46/98] support multi graph test=develop --- .../details/async_ssa_graph_executor.cc | 6 +-- .../details/async_ssa_graph_executor.h | 6 +-- paddle/fluid/framework/parallel_executor.cc | 40 ++++++++++++++----- paddle/fluid/framework/parallel_executor.h | 2 +- .../fluid/operators/reader/blocking_queue.h | 1 + .../operators/reader/create_py_reader_op.cc | 5 ++- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/parallel_executor.py | 9 ++++- 8 files changed, 50 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 21741667a3..dfb9d73dcb 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -20,12 +20,12 @@ namespace details { AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, ir::Graph *graph) + const std::vector &places, std::vector graphs) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graph_(graph) { + graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); @@ -37,7 +37,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, graph_)); + strategy_, {local_scopes_[i]}, {places_[i]}, graphs_[i])); } } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 8536852a00..ff85ba2c6c 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -29,9 +29,9 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - ir::Graph *graph); + std::vector graphs); ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graph_; } + const ir::Graph &Graph() const override { return *graphs_[0]; } FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -40,7 +40,7 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - ir::Graph *graph_; + std::vector graphs_; std::vector> executors_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 081d06b6aa..b1f4091148 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -188,7 +188,7 @@ ParallelExecutor::ParallelExecutor( const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - ir::Graph *graph) + std::vector graphs) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -222,6 +222,8 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); } + + ir::Graph *graph = graphs[0]; std::unique_ptr temp_owned_graph(graph); // FIXME(Yancey1989): parallel graph mode get better performance @@ -262,17 +264,26 @@ ParallelExecutor::ParallelExecutor( if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert + // ncclOp + std::vector async_graphs(places.size()); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); + temp_owned_graph = + build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, + loss_var_name, {member_->local_scopes_[0]}, 1, + member_->use_cuda_, member_->nccl_ctxs_.get()); + for (int i = 1; i < member_->places_.size(); ++i) { + std::unique_ptr temp_graph(graphs[i]); + temp_graph = + build_strategy.Apply(std::move(temp_graph), {member_->places_[i]}, + loss_var_name, {member_->local_scopes_[i]}, 1, + member_->use_cuda_, member_->nccl_ctxs_.get()); + async_graphs[i] = temp_graph.release(); + } } else { temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), member_->places_, loss_var_name, @@ -284,7 +295,14 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_); + {member_->local_scopes_[0]}, 1, member_->use_cuda_); + for (int i = 1; i < member_->places_.size(); ++i) { + std::unique_ptr temp_graph(graphs[i]); + temp_graph = build_strategy.Apply( + std::move(temp_graph), {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_); + async_graphs[i] = temp_graph.release(); + } } else { temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), member_->places_, loss_var_name, @@ -304,6 +322,8 @@ ParallelExecutor::ParallelExecutor( graph = temp_owned_graph.release(); } + async_graphs[0] = graph; + // Step 3. Create vars in each scope. Passes may also create new vars. 
// skip control vars and empty vars std::vector var_infos; @@ -334,7 +354,7 @@ ParallelExecutor::ParallelExecutor( if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, graph)); + exec_strategy, member_->local_scopes_, member_->places_, async_graphs)); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ddf60b3946..0e05b2a460 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -50,7 +50,7 @@ class ParallelExecutor { const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - ir::Graph *graph); + std::vector graphs); ~ParallelExecutor(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 45c3ad802f..c99b2bc593 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -95,6 +95,7 @@ class BlockingQueue { void Close() { std::lock_guard lock(mutex_); + VLOG(3) << "close queue"; closed_ = true; send_cv_.notify_all(); receive_cv_.notify_all(); diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b..b2469ad0eb 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -35,7 +35,10 @@ class PyReader : public framework::FileReader { ~PyReader() { queue_->Close(); } - void Shutdown() override { queue_->Close(); } + void Shutdown() override { + VLOG(3) << "PyReader shutdown!"; + queue_->Close(); + } void Start() override { queue_->ReOpen(); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f9e7366779..fdee5a6d66 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1230,7 +1230,7 @@ All parameter, weight, gradient are variables in Paddle. pe.def(py::init &, const std::unordered_set &, const std::string &, Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, ir::Graph *>()) + const BuildStrategy &, std::vector>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 889156ff74..9c578ef662 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -177,12 +177,17 @@ class ParallelExecutor(object): # step7: init ParallelExecutor # ParallelExecutor API will be deprecated, don't support parallel graph. 
- self._graph = core.Graph(main.desc) + self._graphs = [] + if build_strategy.async_mode: + for _ in range(cpu_num): + self._graphs.append(core.Graph(main.desc)) + else: + self._graphs.append(core.Graph(main.desc)) self.executor = core.ParallelExecutor( places, persistable_vars, cpt.to_text(loss_name) if loss_name else six.u(''), scope, - local_scopes, exec_strategy, build_strategy, self._graph) + local_scopes, exec_strategy, build_strategy, self._graphs) self.scope = scope From 02425b2f648f5dbb5773b0eab8901a42bf955f33 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Feb 2019 09:31:27 +0800 Subject: [PATCH 47/98] fix compile --- paddle/fluid/operators/distributed_ops/recv_op.cc | 2 +- paddle/fluid/operators/distributed_ops/send_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index a0185d66f0..bcb16ff2e5 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -57,7 +57,7 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); } else { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 0f0ad6b8f9..801909e2c0 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -50,7 +50,7 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx); + auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, scope, static_cast(sync_send)); From 847e4f4e854b3f73625816d152f65ca5f5c7a27e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 1 Mar 2019 11:24:14 +0800 Subject: [PATCH 48/98] pure async mode train --- .../details/async_ssa_graph_executor.cc | 114 ++++++++++++------ .../details/async_ssa_graph_executor.h | 12 ++ .../details/threaded_ssa_graph_executor.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 8 +- paddle/fluid/framework/reader.cc | 5 +- paddle/fluid/framework/reader.h | 10 +- .../fluid/operators/reader/blocking_queue.h | 3 +- .../fluid/operators/reader/buffered_reader.cc | 3 + .../operators/reader/create_py_reader_op.cc | 7 +- .../reader/lod_tensor_blocking_queue.h | 5 +- paddle/fluid/pybind/pybind.cc | 1 + .../test_async_ssa_graph_executor_mnist.py | 41 ++++--- 12 files changed, 148 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index dfb9d73dcb..69f770afee 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,10 +14,31 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" +#include "paddle/fluid/framework/variable_helper.h" + namespace paddle { 
namespace framework { namespace details { +inline void NewTempScopeAndInitVars(const std::vector &var_infos, + Scope *scope) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } +} + AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::vector graphs) @@ -39,58 +60,81 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, graphs_[i])); } -} -FeedFetchList AsyncSSAGraphExecutor::Run( - const std::vector &fetch_tensors) { - std::vector> run_futures; - - std::vector fetch_data; - FeedFetchList ret; - - fetch_data.reserve(places_.size()); - ret.reserve(fetch_tensors.size()); - exception_holder_.Clear(); + for (auto &node : graphs_[0]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos_.emplace_back(); + var_infos_.back().name_ = node->Var()->Name(); + var_infos_.back().type_ = node->Var()->GetType(); + var_infos_.back().persistable_ = node->Var()->Persistable(); + } + } + for (auto *scope : local_scopes_) { + NewTempScopeAndInitVars(var_infos_, scope); + } +} - for (size_t i = 0; i < places_.size(); ++i) { - auto call = [this, i, &fetch_tensors]() -> FeedFetchList { +void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { + VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); + for (size_t i = 1; i < places_.size(); ++i) { + auto call = [this, i]() -> void { + VLOG(3) << "start off python thread " << i; try { - return executors_[i]->Run(fetch_tensors); + while (true) { + executors_[i]->Run({}); + } } catch (...) { exception_holder_.Catch(std::current_exception()); + VLOG(3) << "get exception type = " << exception_holder_.Type(); } - return FeedFetchList(); + VLOG(3) << "thread " << i << " exited!"; }; - - if (pool_) { - run_futures.emplace_back(pool_->enqueue(std::move(call))); - } else { - fetch_data.emplace_back(std::move(call())); - } - } - - if (pool_) { - for (auto &f : run_futures) { - if (exception_holder_.IsCaught()) { - f.wait(); - } else { - fetch_data.emplace_back(std::move(f.get())); - } - } + run_futures_.emplace_back(pool_->enqueue(std::move(call))); } +} +void AsyncSSAGraphExecutor::HandleException() { if (exception_holder_.IsCaught()) { + for (auto &f : run_futures_) { + VLOG(3) << "wait future"; + f.wait(); + } VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; + run_futures_.clear(); exception_holder_.ReThrow(); } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + // init once + if (run_futures_.size() == 0 && places_.size() > 1) { + exception_holder_.Clear(); + StartOffPythonTrainLoop(); + } + + if (places_.size() == 1) { + exception_holder_.Clear(); + } else { + HandleException(); + } + + FeedFetchList fetch_data; + fetch_data.reserve(fetch_tensors.size()); + + try { + fetch_data = executors_[0]->Run(fetch_tensors); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + + HandleException(); + FeedFetchList ret; for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { std::vector lodtensor_ptrs; - lodtensor_ptrs.reserve(local_scopes_.size()); - for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { - lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); - } + lodtensor_ptrs.push_back(&fetch_data.at(fetch_idx)); ret.emplace_back(); ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index ff85ba2c6c..7d7296772d 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -24,6 +24,12 @@ namespace paddle { namespace framework { namespace details { +struct VarInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + class AsyncSSAGraphExecutor : public SSAGraphExecutor { public: AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, @@ -35,6 +41,10 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector &fetch_tensors) override; + private: + void StartOffPythonTrainLoop(); + void HandleException(); + private: ExecutionStrategy strategy_; std::vector local_scopes_; @@ -44,6 +54,8 @@ class AsyncSSAGraphExecutor : public SSAGraphExecutor { std::vector> executors_; ExceptionHolder exception_holder_; + std::vector> run_futures_; + std::vector var_infos_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 8436626362..fa0c90e1f4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -119,6 +119,8 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( if (timeout) { if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b1f4091148..c133772e6e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -379,9 +379,11 @@ ParallelExecutor::ParallelExecutor( } VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), - member_->places_, std::move(member_->executor_))); + if (!build_strategy.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, std::move(var_infos), + member_->places_, std::move(member_->executor_))); + } } void ParallelExecutor::BCastParamsToDevices( diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 40eafda9bf..d3513fb7db 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -69,6 +69,9 @@ void ReaderBase::Start() { ReaderBase::~ReaderBase() {} -DecoratedReader::~DecoratedReader() { reader_->Shutdown(); } +DecoratedReader::~DecoratedReader() { + VLOG(1) << "~DecoratedReader"; + reader_->Shutdown(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h 
b/paddle/fluid/framework/reader.h index 82562bf883..6cf0ec2937 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -77,7 +77,10 @@ class DecoratedReader : public ReaderBase, ~DecoratedReader(); protected: - void ShutdownImpl() override { reader_->Shutdown(); } + void ShutdownImpl() override { + VLOG(1) << "ShutdownImpl"; + reader_->Shutdown(); + } void StartImpl() override { reader_->Start(); } @@ -98,6 +101,8 @@ class ReaderHolder { reader_ = reader_base; } + ~ReaderHolder() { VLOG(1) << "~ReaderHolder"; } + const std::shared_ptr& Get() const { return reader_; } void ReadNext(std::vector* out) { @@ -106,6 +111,7 @@ class ReaderHolder { } void ResetAll() { + VLOG(1) << "ResetAll"; auto end_readers = reader_->GetEndPoints(); for (auto* reader : end_readers) { reader->Shutdown(); @@ -116,11 +122,13 @@ class ReaderHolder { } void Shutdown() { + VLOG(1) << "Shutdown"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Shutdown(); } void Start() { + VLOG(1) << "start"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Start(); } diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index c99b2bc593..fe3f2f4031 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -86,6 +86,7 @@ class BlockingQueue { void ReOpen() { std::lock_guard lock(mutex_); + VLOG(1) << "reopen queue"; closed_ = false; std::deque new_deque; queue_.swap(new_deque); @@ -95,7 +96,7 @@ class BlockingQueue { void Close() { std::lock_guard lock(mutex_); - VLOG(3) << "close queue"; + VLOG(1) << "close queue"; closed_ = true; send_cv_.notify_all(); receive_cv_.notify_all(); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index defc29b91f..db80fda695 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace reader { BufferedReader::~BufferedReader() { + VLOG(1) << "~BufferedReader"; reader_->Shutdown(); while (!position_.empty()) { position_.front().wait(); @@ -41,6 +42,7 @@ BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { + VLOG(1) << "BufferedReader"; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); @@ -121,6 +123,7 @@ void BufferedReader::ReadAsync(size_t i) { } void BufferedReader::ShutdownImpl() { + VLOG(1) << "ShutdownImpl"; reader_->Shutdown(); while (!position_.empty()) { position_.pop(); diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index b2469ad0eb..2916be618c 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -33,10 +33,13 @@ class PyReader : public framework::FileReader { if (!success) out->clear(); } - ~PyReader() { queue_->Close(); } + ~PyReader() { + VLOG(1) << "~PyReader"; + queue_->Close(); + } void Shutdown() override { - VLOG(3) << "PyReader shutdown!"; + VLOG(1) << "PyReader shutdown!"; queue_->Close(); } diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 5b53edff5d..eeba330d66 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -57,7 +57,10 @@ class 
LoDTensorBlockingQueue { inline void ReOpen() { queue_.ReOpen(); } - inline void Close() { queue_.Close(); } + inline void Close() { + VLOG(1) << "LoDTensorBlockingQueue close"; + queue_.Close(); + } inline bool IsClosed() const { return queue_.IsClosed(); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fdee5a6d66..af049127aa 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -557,6 +557,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity) -> std::shared_ptr { + VLOG(1) << "init_lod_tensor_blocking_queue"; auto *holder = var.GetMutable(); holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 41fa39e06b..4fbda407f1 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -36,7 +36,7 @@ def convolutional_neural_network(use_py_reader): capacity=64, feed_list=[img, label], name='py_reader', - use_double_buffer=True) + use_double_buffer=False) img, label = fluid.layers.read_file(py_reader) conv_pool_1 = fluid.nets.simple_img_conv_pool( @@ -139,20 +139,21 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy=exec_strategy) py_reader.decorate_paddle_reader(train_reader) - py_reader.start() - - step = 0 - try: - while True: - loss_val = pe.run(fetch_list=[avg_loss.name]) - loss_val = numpy.mean(loss_val) - if step % 100 == 0: - print("Batch %d, Cost %f, queue size %d" % - (step, loss_val, py_reader.queue.size())) - step += 1 - except fluid.core.EOFException: - print("train end") - py_reader.reset() + + for pass_id in range(2): + step = 0 + py_reader.start() + try: + while True: + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 10 == 0: + print("Pass %d, Batch %d, Cost %f, queue size %d" % + (pass_id, step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + print("train end pass = " + str(pass_id)) + py_reader.reset() return step @@ -161,10 +162,11 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): def test_check_async_ssa_exe_train(self): step_list = [] for cpu_num in [1, 2, 4]: - scope = fluid.core.Scope() - with fluid.scope_guard(scope): + print("run cpu_num -> " + str(cpu_num)) + with fluid.scope_guard(fluid.core.Scope()): with fluid.program_guard( - fluid.Program(), startup_program=fluid.Program()): + main_program=fluid.Program(), + startup_program=fluid.Program()): start_time = time.time() step = train( use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num) @@ -173,7 +175,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) + " time -> " + str(end_time - start_time)) with fluid.program_guard( - fluid.Program(), startup_program=fluid.Program()): + main_program=fluid.Program(), + startup_program=fluid.Program()): test() assert int(step_list[0] / 2) == int(step_list[1]) assert int(step_list[1] / 2) == int(step_list[2]) From 3691a46fa36750bb5a3c828d2eaf55305aa88f69 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 10:29:42 +0800 Subject: [PATCH 49/98] improve communicator --- paddle/fluid/framework/communicator.h | 53 ------- paddle/fluid/framework/variable_helper.cc | 26 
+++- paddle/fluid/framework/variable_helper.h | 3 +- .../operators/distributed/CMakeLists.txt | 1 + .../operators/distributed/communicator.cc | 113 +++++++++++++++ .../operators/distributed/communicator.h | 129 ++++++++++++++++++ .../distributed/parameter_prefetch.cc | 4 +- .../operators/distributed/parameter_recv.cc | 2 +- .../fluid/operators/distributed/rpc_common.h | 33 +++++ .../operators/math/selected_rows_functor.h | 2 +- 10 files changed, 306 insertions(+), 60 deletions(-) delete mode 100644 paddle/fluid/framework/communicator.h create mode 100644 paddle/fluid/operators/distributed/communicator.cc create mode 100644 paddle/fluid/operators/distributed/communicator.h create mode 100644 paddle/fluid/operators/distributed/rpc_common.h diff --git a/paddle/fluid/framework/communicator.h b/paddle/fluid/framework/communicator.h deleted file mode 100644 index 0e90ba02e6..0000000000 --- a/paddle/fluid/framework/communicator.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { - -namespace framework { - -class Communicator { - public: - Communicator() {} - ~Communicator() {} - - // send grad - void send() {} - - void receive() {} - - void prefetch() {} - - void wait() {} - - private: - std::unique_ptr communicate_thread_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index fc4525549c..d59f3ea7dc 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { +void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { @@ -37,7 +37,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -56,5 +56,27 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var_type); } } + +void CopyVariable(const Variable &src_var, Variable *dst_var) { + // only support cpu now + auto cpu_place = platform::CPUPlace(); + + if (src_var.IsType()) { + auto *tmp_grad_tensor = dst_var->GetMutable(); + auto &src_tensor = src_var.Get(); + tmp_grad_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); + tmp_grad_slr->set_rows(src_slr.rows()); + tmp_grad_slr->set_height(src_slr.height()); + auto &src_t = src_slr.value(); + auto *dst_t = tmp_grad_slr->mutable_value(); + framework::TensorCopy(src_t, cpu_place, dst_t); + } else { + PADDLE_THROW("unknown var type to copy"); + } +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 0e0c72c362..f8e90d5396 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { -void InitializeVariable(Variable *var, proto::VarType::Type var_type); +void InitializeVariable(Variable* var, proto::VarType::Type var_type); +void CopyVariable(const Variable& src_var, Variable* dst_var); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 231f4b3bc4..22f44c4217 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,6 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc new file mode 100644 index 0000000000..fb9ecfa808 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed/communicator.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { +namespace distributed { + +static void MergeVars(const std::string &var_name, + const std::vector> &vars, + Scope *scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto &var0 = vars[0]; + auto *out_var = scope->Var(var_name); + if (var0->IsType()) { + auto *out_t = out_var->GetMutable(); + auto *out_ptr = out_t->mutable_data( + var0->Get().dims(), cpu_place); + auto numel = out_t->numel(); + for (auto i = 0; i < numel; ++i) { + out_ptr[i] = 0; + for (auto &var : vars) { + auto &var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); + out_ptr[i] += var_t.data()[i]; + } + } + } else if (var0->IsType()) { + auto *out_slr = out_var->GetMutable(); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto &var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + +void Communicator::SendThread() { + for (auto &iter : send_varname_to_queue_) { + auto &var_name = iter.first; + VLOG(3) << "merge var " << var_name << " and send"; + auto &var_queue = iter.second; + std::vector> vars; + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, + // send_scope_, true); + } +} + +void Communicator::RecvThread() { + // parallel run recv graph + for (auto &iter : recv_varname_to_ctx_) { + auto &var_name = iter.first; + VLOG(3) << "recv var " << iter.first; + auto recv_functor = distributed::ParameterRecv(); + // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); + } +} + +void Communicator::Send(const std::string &var_name, + const framework::Scope &scope) { + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + send_varname_to_queue_[var_name]->Push(tmp_grad_var); +} + +void Communicator::Start() { + // start send and recv thread + send_thread_.reset( + new std::thread(std::bind(&Communicator::SendThread, this))); + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); +} + +} // namespace distributed +} // namespace 
operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h new file mode 100644 index 0000000000..614d6ade81 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Scope = framework::Scope; +using Variable = framework::Variable; + +template +class BlockingQueue { + public: + explicit BlockingQueue(size_t capacity) : capacity_(capacity) { + PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0."); + } + + bool Push(const T& elem) { + std::unique_lock lock(mutex_); + send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + recv_cv_.notify_one(); + return true; + } + + bool Push(T&& elem) { + std::unique_lock lock(mutex_); + send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + recv_cv_.notify_one(); + return true; + } + + T Pop() { + std::unique_lock lock(mutex_); + recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + T rc(std::move(queue_.front())); + queue_.pop_front(); + return rc; + } + + size_t Cap() const { + std::lock_guard lock(mutex_); + return capacity_; + } + + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + private: + const size_t capacity_; + std::deque queue_; + + mutable std::mutex mutex_; + std::condition_variable recv_cv_; + std::condition_variable send_cv_; +}; + +class Communicator { + public: + Communicator( + const std::unordered_map& send_varname_to_ctx, + const std::unordered_map& recv_varname_to_ctx, + Scope* recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + send_scope_.reset(new Scope()); + for (auto& iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>(10); + } + } + + ~Communicator() {} + + void Start(); + + // send grad + void Send(const std::string& var_name, const framework::Scope& scope); + + private: + void SendThread(); + void RecvThread(); + + std::unordered_map>>> + send_varname_to_queue_; + std::unordered_map send_varname_to_ctx_; + std::unordered_map recv_varname_to_ctx_; + std::unique_ptr send_thread_; + std::unique_ptr recv_thread_; + Scope* recv_scope_; // should be global scope + std::unique_ptr send_scope_; // 
an independent scope +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 7434265929..539a038099 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -39,7 +39,7 @@ using DDim = framework::DDim; static std::vector> SplitIds( const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { + const std::vector& height_section) { std::set all_ids; for (auto id : ids_vector) { all_ids.insert(id); @@ -203,7 +203,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, #endif } - auto splited_ids = SplitIds(ids_vector, height_sections, local_scope); + auto splited_ids = SplitIds(ids_vector, height_sections); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, local_scope); diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 2664a89ed6..b8d3b77ae4 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -73,7 +73,7 @@ void ParameterRecv::operator()(const std::string &var_name, PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } } else { - PADDLE_THROW("unsupported var type to send!"); + PADDLE_THROW("unsupported var type to recv!"); } // concat recved tensor into one var diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h new file mode 100644 index 0000000000..dc50414b9a --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +struct RpcContext { + std::string var_name; + std::vector splited_var_names; + std::vector epmap; + std::vector height_sections; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 222d761ef9..db0ee9bc16 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -95,7 +95,7 @@ struct MergeAdd { enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; -// out = seleted_rows_in / tensor +// out = selected_rows_in / tensor template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, From 9573d610ef7e364c91ea3346aa2d0903041c2f72 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 11:10:19 +0800 Subject: [PATCH 50/98] use rpc common in parameter send and recv --- .../operators/distributed/parameter_recv.cc | 17 +++++------ .../operators/distributed/parameter_recv.h | 5 ++-- .../operators/distributed/parameter_send.cc | 30 +++++++++---------- .../operators/distributed/parameter_send.h | 6 ++-- .../fluid/operators/distributed/rpc_common.h | 7 +++++ .../operators/distributed_ops/recv_op.cc | 7 +++-- .../operators/distributed_ops/send_op.cc | 9 ++++-- 7 files changed, 44 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index b8d3b77ae4..00956d8e6d 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -39,9 +39,7 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void ParameterRecv::operator()(const std::string &var_name, - const std::vector &recv_varnames, - const std::vector &epmap, +void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &ctx, const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -53,21 +51,22 @@ void ParameterRecv::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *recv_var = scope.FindVar(var_name); + auto *recv_var = scope.FindVar(rpc_ctx.var_name); std::vector recved_tensors; // recv all vars to local scope if (recv_var->IsType()) { std::vector rets; - for (size_t i = 0; i < recv_varnames.size(); i++) { - auto &recv_var_name = recv_varnames[i]; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &recv_var_name = rpc_ctx.splited_var_names[i]; framework::Tensor *t = local_scope->Var(recv_var_name)->GetMutable(); recved_tensors.push_back(t); - VLOG(3) << "recv " << recv_var_name << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], cpu_ctx, *local_scope, - recv_var_name, recv_var_name)); + VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, + *local_scope, recv_var_name, + recv_var_name)); } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index bc6f5f5adf..e25594024a 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ 
b/paddle/fluid/operators/distributed/parameter_recv.h @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" namespace paddle { namespace operators { @@ -25,9 +26,7 @@ namespace distributed { template struct ParameterRecv { - void operator()(const std::string &var_name, - const std::vector &recv_varnames, - const std::vector &epmap, + void operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &context, const framework::Scope &scope); }; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index fd97926623..eaa1c3ae8e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -38,10 +38,7 @@ using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void ParameterSend::operator()(const std::string &var_name, - const std::vector &send_varnames, - const std::vector &epmap, - const std::vector &height_sections, +void ParameterSend::operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &ctx, const framework::Scope &scope, bool sync) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -53,8 +50,8 @@ void ParameterSend::operator()(const std::string &var_name, distributed::RPCClient::GetInstance( ctx.Attr("trainer_id")); - auto *send_var = scope.FindVar(var_name); - size_t out_num = send_varnames.size(); + auto *send_var = scope.FindVar(rpc_ctx.var_name); + size_t out_num = rpc_ctx.splited_var_names.size(); if (send_var->IsType()) { if (out_num > 1) { auto &send_tensor = send_var->Get(); @@ -63,19 +60,19 @@ void ParameterSend::operator()(const std::string &var_name, outs_dims.reserve(out_num); // infer output shape - PADDLE_ENFORCE_EQ(height_sections.size(), out_num, + PADDLE_ENFORCE_EQ(rpc_ctx.height_sections.size(), out_num, "tensor split sections size" "should be equal to output size."); for (size_t i = 0; i < out_num; ++i) { auto dim = send_tensor_dims; - dim[0] = height_sections[i]; + dim[0] = rpc_ctx.height_sections[i]; outs_dims.push_back(dim); } // create output var in local scope size_t row_offset = 0; for (auto i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(send_varnames[i]) + framework::Tensor *out = local_scope->Var(rpc_ctx.splited_var_names[i]) ->GetMutable(); *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); row_offset += outs_dims[i][0]; @@ -83,7 +80,7 @@ void ParameterSend::operator()(const std::string &var_name, } } else if (send_var->IsType()) { auto &send_slr = send_var->Get(); - auto abs_sections = ToAbsoluteSection(height_sections); + auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); auto send_rows = send_slr.rows(); std::vector> outs_rows_idx; @@ -97,7 +94,7 @@ void ParameterSend::operator()(const std::string &var_name, // create output var in local scope std::vector outs; - for (auto &name : send_varnames) { + for (auto &name : rpc_ctx.splited_var_names) { auto *out = local_scope->Var(name)->GetMutable(); outs.push_back(out); } @@ -112,7 +109,7 @@ void ParameterSend::operator()(const std::string &var_name, for (size_t i = 0; i < outs_rows_idx.size(); ++i) { auto rows_idx = outs_rows_idx[i]; - outs[i]->set_height(height_sections[i]); + outs[i]->set_height(rpc_ctx.height_sections[i]); auto dims = send_slr.GetCompleteDims(); dims[0] = rows_idx.size(); outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); @@ -149,15 +146,16 @@ void 
ParameterSend::operator()(const std::string &var_name, } std::vector rets; - for (size_t i = 0; i < send_varnames.size(); i++) { - auto &send_var_name = send_varnames[i]; - auto &endpoint = epmap[i]; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &send_var_name = rpc_ctx.splited_var_names[i]; + auto &endpoint = rpc_ctx.epmap[i]; if (NeedSend(*local_scope, send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, send_var_name)); } else { - VLOG(3) << "don't send non-initialized variable: " << send_varnames[i]; + VLOG(3) << "don't send non-initialized variable: " + << rpc_ctx.splited_var_names[i]; } } diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 1746377228..4500497163 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" namespace paddle { namespace operators { @@ -25,10 +26,7 @@ namespace distributed { template struct ParameterSend { - void operator()(const std::string &var_name, - const std::vector &send_varnames, - const std::vector &epmap, - const std::vector &height_sections, + void operator()(const RpcContext &rpc_ctx, const framework::ExecutionContext &context, const framework::Scope &scope, bool sync); }; diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index dc50414b9a..7dede07b5a 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -22,6 +22,13 @@ namespace operators { namespace distributed { struct RpcContext { + RpcContext(const std::string& name, const std::vector& names, + const std::vector& emap, + const std::vector& sections) + : var_name(name), + splited_var_names(names), + epmap(emap), + height_sections(sections) {} std::string var_name; std::vector splited_var_names; std::vector epmap; diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index bcb16ff2e5..a4a5ab89a7 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -57,9 +58,11 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); + auto exe_ctx = + framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); - recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope); + auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); + recv_functor(rpc_ctx, exe_ctx, scope); } else { if (with_barrier) { std::vector rets; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 801909e2c0..1823d89897 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -50,10 +51,12 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); + auto exe_ctx = + framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); - send_functor(ins[0], send_varnames, epmap, height_sections, exe_ctx, - scope, static_cast(sync_send)); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, + height_sections); + send_functor(rpc_ctx, exe_ctx, scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 3c6b733d14c0db61eb70208aa79c3999f29efc1d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 12:11:21 +0800 Subject: [PATCH 51/98] remove exe context --- .../operators/distributed/parameter_recv.cc | 9 +++--- .../operators/distributed/parameter_recv.h | 4 +-- .../operators/distributed/parameter_send.cc | 29 ++++++++++--------- .../operators/distributed/parameter_send.h | 5 ++-- .../operators/distributed_ops/recv_op.cc | 2 +- .../operators/distributed_ops/send_op.cc | 2 +- 6 files changed, 24 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 00956d8e6d..fecc76955d 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -40,7 +40,6 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &ctx, const framework::Scope &scope) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -48,8 +47,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance( - ctx.Attr("trainer_id")); + distributed::RPCClient::GetInstance(0); auto *recv_var = scope.FindVar(rpc_ctx.var_name); @@ -80,12 +78,13 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, size_t output_offset = 0; framework::Tensor *recv_tensor = recv_var->GetMutable(); + auto dev_ctx = paddle::platform::CPUDeviceContext(); for (auto *in : recved_tensors) { auto in_stride = framework::stride_numel(in->dims()); auto out_stride = framework::stride_numel(recv_tensor->dims()); StridedNumelCopyWithAxis( - ctx.device_context(), 0, recv_tensor->data() + output_offset, - out_stride, in->data(), in_stride, in_stride[0]); + dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[0]); output_offset += in_stride[0]; } } diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h index e25594024a..e955fca725 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -26,9 +26,7 @@ namespace distributed { template struct ParameterRecv { - void operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &context, - const framework::Scope &scope); + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope); }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index eaa1c3ae8e..3fe3be193a 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -39,7 +39,6 @@ using DDim = framework::DDim; template void ParameterSend::operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &ctx, const framework::Scope &scope, bool sync) { framework::Scope *local_scope = scope.NewTmpScope(); @@ -47,8 +46,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - ctx.Attr("trainer_id")); + distributed::RPCClient::GetInstance(0); auto *send_var = scope.FindVar(rpc_ctx.var_name); size_t out_num = rpc_ctx.splited_var_names.size(); @@ -105,7 +103,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } - auto place = ctx.GetPlace(); + auto place = platform::CPUPlace(); for (size_t i = 0; i < outs_rows_idx.size(); ++i) { auto rows_idx = outs_rows_idx[i]; @@ -118,22 +116,25 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, for (auto idx : rows_idx) { outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); } - auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + auto dst = outs[i]->mutable_value()->mutable_data(place); for (size_t j = 0; j < rows_idx.size(); j++) { if (platform::is_cpu_place(place)) { memory::Copy( platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); } else { -#ifdef PADDLE_WITH_CUDA - auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CUDAPlace(), dst + j * row_numel, - platform::CUDAPlace(), - src + outs_dense_idx[i][j] * row_numel, - sizeof(T) * row_numel, stream); -#else - PADDLE_THROW("Paddle is not compiled with GPU"); -#endif + PADDLE_THROW("do not support GPU now"); + /* + #ifdef PADDLE_WITH_CUDA + auto stream = 
ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); + #else + PADDLE_THROW("Paddle is not compiled with GPU"); + #endif + */ } } } diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h index 4500497163..9077f4a4fb 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -26,9 +26,8 @@ namespace distributed { template struct ParameterSend { - void operator()(const RpcContext &rpc_ctx, - const framework::ExecutionContext &context, - const framework::Scope &scope, bool sync); + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, + bool sync); }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index a4a5ab89a7..41701d3a3e 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -62,7 +62,7 @@ class RecvOp : public framework::OperatorBase { framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); - recv_functor(rpc_ctx, exe_ctx, scope); + recv_functor(rpc_ctx, scope); } else { if (with_barrier) { std::vector rets; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 1823d89897..5585ad21ce 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -56,7 +56,7 @@ class SendOp : public framework::OperatorBase { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); - send_functor(rpc_ctx, exe_ctx, scope, static_cast(sync_send)); + send_functor(rpc_ctx, scope, static_cast(sync_send)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From c2cce6bafaabe8b2b32c42fc885c7e6a09586c8f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 13:20:34 +0800 Subject: [PATCH 52/98] simplify parameter send and recv --- paddle/fluid/operators/distributed/communicator.cc | 10 +++++----- paddle/fluid/operators/distributed_ops/recv_op.cc | 6 ------ paddle/fluid/operators/distributed_ops/send_op.cc | 6 ------ 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index fb9ecfa808..bc0a57f344 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -74,9 +74,9 @@ void Communicator::SendThread() { merged_var_num++; } MergeVars(var_name, vars, send_scope_.get()); - auto send_functor = distributed::ParameterSend(); - // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, - // send_scope_, true); + // auto send_functor = distributed::ParameterSend(); + // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, + // send_scope_, true); } } @@ -85,8 +85,8 @@ void Communicator::RecvThread() { for (auto &iter : recv_varname_to_ctx_) { auto &var_name = iter.first; VLOG(3) << "recv var " << iter.first; - auto recv_functor = distributed::ParameterRecv(); - // recv_functor(var_name, iter.second, 
exe_ctx, recv_scope_); + // auto recv_functor = distributed::ParameterRecv(); + // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); } } diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 41701d3a3e..680b484d41 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -54,12 +54,6 @@ class RecvOp : public framework::OperatorBase { Attr>("recv_varnames"); if (recv_varnames.size() > 0) { - framework::RuntimeContext ctx(Inputs(), Outputs(), scope); - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto *dev_ctx = pool.Get(place); - auto exe_ctx = - framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto recv_functor = distributed::ParameterRecv(); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); recv_functor(rpc_ctx, scope); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 5585ad21ce..8b09cf86d7 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -47,12 +47,6 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - framework::RuntimeContext ctx(Inputs(), Outputs(), scope); - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - auto exe_ctx = - framework::ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr); auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); From 50601501e52ce6bd0b34864dc2410e1a6083a3cd Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 15:01:22 +0800 Subject: [PATCH 53/98] improve communicator --- .../operators/distributed/CMakeLists.txt | 2 +- .../operators/distributed/communicator.cc | 69 ++++++++++++------- .../operators/distributed/communicator.h | 16 ++++- .../fluid/operators/distributed/rpc_common.h | 8 +++ 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 22f44c4217..1301467fa7 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,7 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index bc0a57f344..403fcf4b16 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -25,9 +25,9 @@ namespace paddle { namespace operators { namespace distributed { -static void MergeVars(const std::string &var_name, - 
const std::vector> &vars, - Scope *scope) { +static inline void MergeVars(const std::string &var_name, + const std::vector> &vars, + Scope *scope) { PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; @@ -62,31 +62,53 @@ static void MergeVars(const std::string &var_name, } void Communicator::SendThread() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - VLOG(3) << "merge var " << var_name << " and send"; - auto &var_queue = iter.second; - std::vector> vars; - const size_t max_merge_var_num = 20; - size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { - vars.push_back(var_queue->Pop()); - merged_var_num++; + while (running_) { + std::vector> task_futures; + task_futures.reserve(send_varname_to_ctx_.size()); + for (auto &iter : send_varname_to_queue_) { + auto send_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "merge var " << var_name << " and send"; + auto &var_queue = iter.second; + std::vector> vars; + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + send_functor(ctx, *send_scope_, true); + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } + for (auto &task_f : task_futures) { + task_f.wait(); } - MergeVars(var_name, vars, send_scope_.get()); - // auto send_functor = distributed::ParameterSend(); - // send_functor(var_name, send_varname_to_ctx_[var_name], exe_ctx, - // send_scope_, true); } } void Communicator::RecvThread() { - // parallel run recv graph - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - VLOG(3) << "recv var " << iter.first; - // auto recv_functor = distributed::ParameterRecv(); - // recv_functor(var_name, iter.second, exe_ctx, recv_scope_); + while (running_) { + // parallel run recv graph + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + recv_functor(iter.second, *recv_scope_); + }; + task_futures.emplace_back( + recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } } } @@ -101,6 +123,7 @@ void Communicator::Send(const std::string &var_name, } void Communicator::Start() { + running_ = true; // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 614d6ade81..ffdfa38b12 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include #include +#include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/distributed/rpc_common.h" @@ -100,9 +102,18 @@ class Communicator { send_varname_to_queue_[iter.first] = std::make_shared>>(10); } + // TODO(qiao): default 5, need to config + send_threadpool_.reset(new ::ThreadPool(5)); + recv_threadpool_.reset(new ::ThreadPool(5)); } - ~Communicator() {} + ~Communicator() { + VLOG(3) << "~Communicator"; + running_ = false; + send_thread_->join(); + recv_thread_->join(); + VLOG(3) << "~Communicator done"; + } void Start(); @@ -113,6 +124,7 @@ class Communicator { void SendThread(); void RecvThread(); + bool running_ = false; std::unordered_map>>> send_varname_to_queue_; @@ -122,6 +134,8 @@ class Communicator { std::unique_ptr recv_thread_; Scope* recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope + std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; + std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 7dede07b5a..39eb2d078c 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -29,6 +29,14 @@ struct RpcContext { splited_var_names(names), epmap(emap), height_sections(sections) {} + + RpcContext(const RpcContext& ctx) { + var_name = ctx.var_name; + splited_var_names = ctx.splited_var_names; + epmap = ctx.epmap; + height_sections = ctx.height_sections; + } + std::string var_name; std::vector splited_var_names; std::vector epmap; From 13e8b5bf8962eea9aafe0e6c32f761e386767cea Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 15:31:56 +0800 Subject: [PATCH 54/98] clear gradient before merge --- paddle/fluid/operators/distributed/communicator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 403fcf4b16..a88b764474 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -47,6 +47,8 @@ static inline void MergeVars(const std::string &var_name, } } else if (var0->IsType()) { auto *out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { @@ -71,6 +73,7 @@ void Communicator::SendThread() { VLOG(3) << "merge var " << var_name << " and send"; auto &var_queue = iter.second; std::vector> vars; + // TODO(qiao): need to be configurable const size_t max_merge_var_num = 20; size_t merged_var_num = 0; while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { From 8744f9a083719626c56190672b66eb7ac24d32be Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 4 Mar 2019 22:54:26 +0800 Subject: [PATCH 55/98] fix parallel executor async mode --- paddle/fluid/framework/parallel_executor.cc | 10 ++++++++-- paddle/fluid/framework/parallel_executor.h | 3 ++- paddle/fluid/pybind/pybind.cc | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c133772e6e..ae7cd800ad 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -188,7 +188,7 @@ 
ParallelExecutor::ParallelExecutor( const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - std::vector graphs) + ir::Graph *graph) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -218,12 +218,18 @@ ParallelExecutor::ParallelExecutor( } } + std::vector graphs; if (build_strategy.async_mode_) { PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); + graphs.push_back(graph); + for (int i = 1; i < places.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } } - ir::Graph *graph = graphs[0]; std::unique_ptr temp_owned_graph(graph); // FIXME(Yancey1989): parallel graph mode get better performance diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 0e05b2a460..987f715066 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -50,7 +50,7 @@ class ParallelExecutor { const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - std::vector graphs); + ir::Graph *graph); ~ParallelExecutor(); @@ -76,6 +76,7 @@ class ParallelExecutor { const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; + std::vector> async_graphs_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr local_nccl_id_; #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6d1fc0be23..69cfe280c6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1271,7 +1271,7 @@ All parameter, weight, gradient are variables in Paddle. pe.def(py::init &, const std::unordered_set &, const std::string &, Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, std::vector>()) + const BuildStrategy &, ir::Graph *>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* From 8c38aca95401324a44a0aab8e017cae26a179b65 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 16:49:52 +0800 Subject: [PATCH 56/98] tmp commit --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/async_ssa_graph_executor.cc | 38 +++++++++++++++++++ .../operators/distributed/communicator.h | 36 +++++++++++++++--- 3 files changed, 69 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index b39673e229..88e7dd3f88 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -82,7 +82,7 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) -cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor communicator) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 69f770afee..43391804c5 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/communicator.h" namespace paddle { namespace framework { @@ -39,6 +40,43 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, } } +// get RpcContext and remote send and recv op +void ProcessGraph(std::vector graphs, Scope *scope) { + using RpcCtxMap = operators::distributed::RpcCtxMap; + RpcCtxMap send_varname_to_ctx; + RpcCtxMap recv_varname_to_ctx; + for (auto i = 0; i < graphs.size(); ++i) { + for (auto &node : graphs[i]->Nodes()) { + if (node->IsOp()) { + if (node->Op()->Type() == "send") { + auto send_var_name = node->Op()->Input("X")[0]; + auto send_varnames = boost::get>( + node->Op()->GetNullableAttr("send_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto height_section = boost::get>( + node->Op()->GetNullableAttr("sections")); + send_varname_to_ctx[send_var_name] = + operators::distributed::RpcContext(send_var_name, send_varnames, + epmap, height_section); + } else if (node->Op()->Type() == "recv") { + auto recv_var_name = node->Op()->Input("X")[0]; + auto recv_varnames = boost::get>( + node->Op()->GetNullableAttr("recv_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + recv_varname_to_ctx[recv_var_name] = + operators::distributed::RpcContext(recv_var_name, recv_varnames, + epmap, {}); + } + } + } + } + // init communicator here + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); +} + AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::vector graphs) diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index ffdfa38b12..44e2aa3be7 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ 
b/paddle/fluid/operators/distributed/communicator.h @@ -87,12 +87,12 @@ class BlockingQueue { std::condition_variable send_cv_; }; +using RpcCtxMap = std::unordered_map; + class Communicator { public: - Communicator( - const std::unordered_map& send_varname_to_ctx, - const std::unordered_map& recv_varname_to_ctx, - Scope* recv_scope) + Communicator(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) : send_varname_to_ctx_(send_varname_to_ctx), recv_varname_to_ctx_(recv_varname_to_ctx), recv_scope_(recv_scope) { @@ -128,14 +128,38 @@ class Communicator { std::unordered_map>>> send_varname_to_queue_; - std::unordered_map send_varname_to_ctx_; - std::unordered_map recv_varname_to_ctx_; + RpcCtxMap send_varname_to_ctx_; + RpcCtxMap recv_varname_to_ctx_; std::unique_ptr send_thread_; std::unique_ptr recv_thread_; Scope* recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + + // the following code is for initialize the commnunicator + public: + static void Init(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) { + InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); + } + + static Communicator* GetInstance() { return communicator_.get(); } + + private: + // Init is called by GetInstance. + static void InitImpl(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, + Scope* recv_scope) { + if (communicator_ == nullptr) { + communicator_.reset(new Communicator(send_varname_to_ctx, + recv_varname_to_ctx, recv_scope)); + } + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr communicator_; }; } // namespace distributed From e92ad8a2097ecffdfa412306b60dba4df68b8541 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 16:56:56 +0800 Subject: [PATCH 57/98] optimize test_async_ssa_graph_executor_mnist test=develop --- .../tests/unittests/test_async_ssa_graph_executor_mnist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 4fbda407f1..5e77ce9b81 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -178,8 +178,8 @@ class TestAsyncSSAGraphExecutor(unittest.TestCase): main_program=fluid.Program(), startup_program=fluid.Program()): test() - assert int(step_list[0] / 2) == int(step_list[1]) - assert int(step_list[1] / 2) == int(step_list[2]) + assert abs(int(step_list[0] / 2) - int(step_list[1])) < 5 + assert abs(int(step_list[1] / 2) - int(step_list[2])) < 5 if __name__ == "__main__": From f28c25845330cf47250f7f6cba67f6f4cdaae97d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 17:10:17 +0800 Subject: [PATCH 58/98] code clean test=develop --- .../framework/details/multi_devices_graph_pass.cc | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 109037c3e6..c8e9c5d687 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -167,10 +167,6 @@ 
std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( bool is_forwarding = true; bool insert_collection_ops = NeedCollectiveOps(); - if (strategy_.async_mode_) { - // async mode did not need to merge gradient - insert_collection_ops = false; - } for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -749,10 +745,6 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { - // in async_mode, each graph will send it's own gradient. - if (strategy_.async_mode_ && node->Op()->Type() == "send") { - return false; - } int op_dev_id = CreateRPCOp(result, node); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -768,11 +760,6 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, insert_op = true; need_broadcast_var_ = true; } else if (OpHaveRole(*node, OpRole::kDist)) { - // in async_mode, each graph will send it's own gradient, do not need to - // merge gradient. - if (strategy_.async_mode_ && node->Op()->Type() != "concat") { - return false; - } int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { // the input(block of parameter) of concat is on different device, @@ -844,7 +831,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - if (recv_param_grad.size() == 2U && !strategy_.async_mode_) { + if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] From c09477b05755da2c61862b37c82fc4031bbf04b1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 23:13:00 +0800 Subject: [PATCH 59/98] revert change --- python/paddle/fluid/parallel_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 84beb37c1d..2ebaab3b10 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -104,6 +104,7 @@ class ParallelExecutor(object): main_program = main_program if main_program is not None \ else framework.default_main_program() + self._compiled_program = compiler.CompiledProgram(main_program) self._compiled_program.with_data_parallel( loss_name=loss_name, From 4e218dabc5cb24c753186503389fd533087bae81 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Mar 2019 23:29:09 +0800 Subject: [PATCH 60/98] code format test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/details/build_strategy.h | 1 + paddle/fluid/framework/details/exception_holder.h | 1 + paddle/fluid/framework/details/multi_devices_graph_pass.cc | 3 +++ paddle/fluid/framework/details/multi_devices_graph_pass.h | 3 +++ paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 ++ paddle/fluid/framework/parallel_executor.h | 1 + paddle/fluid/operators/reader/blocking_queue.h | 1 + paddle/fluid/operators/reader/lod_tensor_blocking_queue.h | 1 + 10 files changed, 17 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 69f770afee..83fd8a50c3 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ 
b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" +#include +#include + #include "paddle/fluid/framework/variable_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 4c5384af61..c073f10d8c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8cb57ad674..9c807560f5 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 77ca03b86e..f8fd395bd9 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c8e9c5d687..8e4f049721 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -13,7 +13,10 @@ // limitations under the License. #include #include +#include #include +#include +#include #include #include diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 377ba50fcc..f7ec9d28de 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -14,7 +14,10 @@ #pragma once +#include #include +#include +#include #include #include diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 923e940884..778bbab505 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include #include diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 987f715066..9a9f4e08fe 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index fe3f2f4031..2b7cb16bc7 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -16,6 +16,7 @@ #include // NOLINT #include +#include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index eeba330d66..be044085f1 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/ddim.h" From 5e8de51409e52b9bc0210f32cf0759b5925995d4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 09:31:34 +0800 Subject: [PATCH 61/98] code format test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 --- paddle/fluid/framework/details/async_ssa_graph_executor.h | 2 ++ paddle/fluid/framework/parallel_executor.cc | 1 + paddle/fluid/framework/reader.h | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 83fd8a50c3..69f770afee 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -14,9 +14,6 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" -#include -#include - #include "paddle/fluid/framework/variable_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h index 7d7296772d..6aaf8f9a16 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "ThreadPool.h" diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ae7cd800ad..6c5f246f95 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 6cf0ec2937..4b400e72a4 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ddim.h" From 255b36dad2a3500a108977cee2b5eb041b086d2b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 14:39:14 +0800 Subject: [PATCH 62/98] can run --- .../details/async_ssa_graph_executor.cc | 13 +++++-- .../operators/distributed/CMakeLists.txt | 2 +- .../operators/distributed/communicator.cc | 6 ++++ .../operators/distributed/communicator.h | 2 +- .../fluid/operators/distributed/rpc_common.h | 36 ++++++++++++++++--- .../operators/distributed_ops/CMakeLists.txt | 4 +-- .../operators/distributed_ops/send_op.cc | 11 +++--- 7 files changed, 60 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 43391804c5..18fba0d19b 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -59,6 +59,8 @@ void ProcessGraph(std::vector graphs, Scope *scope) { send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(send_var_name, send_varnames, epmap, height_section); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; } else if (node->Op()->Type() == "recv") { auto recv_var_name = node->Op()->Input("X")[0]; auto recv_varnames = boost::get>( @@ -68,13 +70,19 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, epmap, {}); + graphs[i]->RemoveNode(node); + VLOG(3) << "find and remove an recv op: " + << recv_varname_to_ctx[recv_var_name]; } } } } // init communicator here - operators::distributed::Communicator::Init(send_varname_to_ctx, - recv_varname_to_ctx, scope); + if (send_varname_to_ctx.size() > 0) { + VLOG(3) << "this is distribute mode, will use "; + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); + } } AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( @@ -110,6 +118,7 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( for (auto *scope : local_scopes_) { NewTempScopeAndInitVars(var_infos_, scope); } + ProcessGraph(graphs_, local_scopes_[0]); } void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1301467fa7..6a269a4fbe 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES 
COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index a88b764474..e800cd5f41 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -63,6 +63,9 @@ static inline void MergeVars(const std::string &var_name, } } +std::unique_ptr Communicator::communicator_(nullptr); +std::once_flag Communicator::init_flag_; + void Communicator::SendThread() { while (running_) { std::vector> task_futures; @@ -117,6 +120,7 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { + VLOG(3) << "communicator send " << var_name; // push var into send queue by var_name auto *grad_var = scope.FindVar(var_name); PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); @@ -125,6 +129,8 @@ void Communicator::Send(const std::string &var_name, send_varname_to_queue_[var_name]->Push(tmp_grad_var); } +Communicator *Communicator::GetInstance() { return communicator_.get(); } + void Communicator::Start() { running_ = true; // start send and recv thread diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 44e2aa3be7..bc753bb75e 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -144,7 +144,7 @@ class Communicator { InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); } - static Communicator* GetInstance() { return communicator_.get(); } + static Communicator* GetInstance(); private: // Init is called by GetInstance. diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 39eb2d078c..3de89c2ae8 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include @@ -22,15 +23,17 @@ namespace operators { namespace distributed { struct RpcContext { - RpcContext(const std::string& name, const std::vector& names, - const std::vector& emap, - const std::vector& sections) + RpcContext() = default; + + RpcContext(const std::string &name, const std::vector &names, + const std::vector &emap, + const std::vector §ions) : var_name(name), splited_var_names(names), epmap(emap), height_sections(sections) {} - RpcContext(const RpcContext& ctx) { + RpcContext(const RpcContext &ctx) { var_name = ctx.var_name; splited_var_names = ctx.splited_var_names; epmap = ctx.epmap; @@ -43,6 +46,31 @@ struct RpcContext { std::vector height_sections; }; +inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { + os << "{"; + os << "var_name: " << rpc_ctx.var_name << "\n"; + + os << "splited_var_names: ["; + for (auto &name : rpc_ctx.splited_var_names) { + os << name << ", "; + } + os << "]\n"; + + os << "epmap: ["; + for (auto &ep : rpc_ctx.epmap) { + os << ep << ", "; + } + os << "]\n"; + + os << "height_sections: ["; + for (auto §ion : rpc_ctx.height_sections) { + os << section << ", "; + } + os << "]\n"; + os << "}"; + return os; +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3bcfc532e8..a1ef1af39f 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 8b09cf86d7..347395b7cc 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed/communicator.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/distributed/rpc_common.h" @@ -47,10 +48,12 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - auto send_functor = distributed::ParameterSend(); - auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); + // auto send_functor = distributed::ParameterSend(); + // auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, + // epmap, + // height_sections); + // send_functor(rpc_ctx, scope, static_cast(sync_send)); + distributed::Communicator::GetInstance()->Send(ins[0], scope); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 7d5dc4ef06dcfce01b7489f92ccb18c7ef7e67b4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 6 Mar 2019 15:47:20 +0800 Subject: [PATCH 63/98] fix cmake list --- paddle/fluid/operators/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 6a269a4fbe..750aac8dd0 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -54,7 +54,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} From a23f1ee85a0a08497fd372e28360e41a2818c14c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 09:46:40 +0800 Subject: [PATCH 64/98] optimize code --- .../details/async_ssa_graph_executor.cc | 21 +++++--- paddle/fluid/framework/parallel_executor.cc | 6 +-- .../operators/distributed/communicator.cc | 48 +++++++++++-------- .../operators/distributed/communicator.h | 6 +++ .../operators/distributed/variable_response.h | 6 ++- 5 files changed, 57 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 18fba0d19b..3f4d9f6ca4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -23,6 +23,7 @@ namespace details { inline void NewTempScopeAndInitVars(const std::vector &var_infos, Scope *scope) { + VLOG(3) << "NewTempScopeAndInitVars"; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; @@ -43,12 +44,15 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, // get RpcContext and remote send 
and recv op void ProcessGraph(std::vector graphs, Scope *scope) { using RpcCtxMap = operators::distributed::RpcCtxMap; + VLOG(3) << "ProcessGraph"; RpcCtxMap send_varname_to_ctx; RpcCtxMap recv_varname_to_ctx; for (auto i = 0; i < graphs.size(); ++i) { for (auto &node : graphs[i]->Nodes()) { - if (node->IsOp()) { - if (node->Op()->Type() == "send") { + VLOG(3) << "node name " << node->Name(); + std::vector nodes_to_delete; + if (node && node->IsOp()) { + if (node->Name() == "send") { auto send_var_name = node->Op()->Input("X")[0]; auto send_varnames = boost::get>( node->Op()->GetNullableAttr("send_varnames")); @@ -61,8 +65,8 @@ void ProcessGraph(std::vector graphs, Scope *scope) { epmap, height_section); VLOG(3) << "find and init an send op: " << send_varname_to_ctx[send_var_name]; - } else if (node->Op()->Type() == "recv") { - auto recv_var_name = node->Op()->Input("X")[0]; + } else if (node->Name() == "recv") { + auto recv_var_name = node->Op()->Output("Out")[0]; auto recv_varnames = boost::get>( node->Op()->GetNullableAttr("recv_varnames")); auto epmap = boost::get>( @@ -70,18 +74,23 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, epmap, {}); - graphs[i]->RemoveNode(node); + nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; } + VLOG(3) << "delete all recv ops"; + for (auto *node : nodes_to_delete) { + graphs[i]->RemoveNode(node); + } } } } // init communicator here if (send_varname_to_ctx.size() > 0) { - VLOG(3) << "this is distribute mode, will use "; + VLOG(3) << "this is distribute mode, will use communicator"; operators::distributed::Communicator::Init(send_varname_to_ctx, recv_varname_to_ctx, scope); + operators::distributed::Communicator::GetInstance()->Start(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6c5f246f95..6c710abd7a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -277,7 +277,7 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector async_graphs(places.size()); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, @@ -298,7 +298,7 @@ ParallelExecutor::ParallelExecutor( member_->nccl_ctxs_.get()); } #else - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use local async mode"; temp_owned_graph = build_strategy.Apply( std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, @@ -358,7 +358,7 @@ ParallelExecutor::ParallelExecutor( } } - if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { + if (build_strategy.async_mode_) { VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, async_graphs)); diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index e800cd5f41..b2bb8fb403 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -14,6 +14,9 @@ limitations under the License. 
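// Note (not part of the original patch): taken together, the hunks above wire
// async training into the Communicator. ProcessGraph() walks every trainer
// graph, turns each send/recv op's attributes (send_varnames / recv_varnames,
// epmap, height sections) into an RpcContext, then calls Communicator::Init()
// followed by Communicator::GetInstance()->Start(); recv ops themselves are no
// longer executed by the graph. ParallelExecutor now takes the async build
// path whenever async_mode_ is set, instead of also requiring
// !is_distribution_.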
*/ #include "paddle/fluid/operators/distributed/communicator.h" +#include // NOLINT +#include // NOLINT + #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable_helper.h" @@ -28,6 +31,7 @@ namespace distributed { static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { + VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to one"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; @@ -67,29 +71,32 @@ std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; void Communicator::SendThread() { + VLOG("SendThread start!"); while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); for (auto &iter : send_varname_to_queue_) { - auto send_task = [this, &iter] { - auto &var_name = iter.first; - VLOG(3) << "merge var " << var_name << " and send"; - auto &var_queue = iter.second; - std::vector> vars; - // TODO(qiao): need to be configurable - const size_t max_merge_var_num = 20; - size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - MergeVars(var_name, vars, send_scope_.get()); - auto send_functor = distributed::ParameterSend(); - auto &ctx = send_varname_to_ctx_.at(var_name); - send_functor(ctx, *send_scope_, true); - }; - task_futures.emplace_back( - send_threadpool_->enqueue(std::move(send_task))); + auto &var_name = iter.first; + auto &var_queue = iter.second; + if (var_queue->NotEmpty()) { // will block if queue is empty + auto send_task = [this, &var_name, &var_queue] { + VLOG(3) << "merge var " << var_name << " and send"; + std::vector> vars; + // TODO(qiao): need to be configurable + const size_t max_merge_var_num = 20; + size_t merged_var_num = 0; + while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + vars.push_back(var_queue->Pop()); + merged_var_num++; + } + MergeVars(var_name, vars, send_scope_.get()); + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + send_functor(ctx, *send_scope_, true); + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } } for (auto &task_f : task_futures) { task_f.wait(); @@ -98,6 +105,7 @@ void Communicator::SendThread() { } void Communicator::RecvThread() { + VLOG(3) << "RecvThread start!"; while (running_) { // parallel run recv graph std::vector> task_futures; @@ -115,6 +123,8 @@ void Communicator::RecvThread() { for (auto &task : task_futures) { task.wait(); } + // TODO(qiao) need to be configuable + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } } diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index bc753bb75e..c93ad02555 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -68,6 +68,12 @@ class BlockingQueue { return rc; } + bool NotEmpty() { + std::unique_lock lock(mutex_); + recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + return true; + } + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 3ecb696069..edc12e2091 100644 --- 
a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,12 +60,14 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = scope->NewTmpScope(); + local_scope_ = &scope->NewScope(); } } virtual ~VariableResponse() { - if (local_scope_) delete local_scope_; + if (local_scope_) { + scope_->DeleteScope(local_scope_); + } } int Parse(Source* source, const sendrecv::VariableMessage& meta) { From 446fdf95634df26dd18388a3834ff9a556764296 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 10:00:27 +0800 Subject: [PATCH 65/98] fix compile problem --- paddle/fluid/framework/details/build_strategy.cc | 6 +++--- paddle/fluid/operators/distributed/communicator.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 92b69334b8..22ce1b52c1 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -136,11 +136,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; - if (strategy_.is_distribution_) { + if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { VLOG(3) << "multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); - } else if (strategy_.async_mode_) { - multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { VLOG(3) << "multi devices collective mode with allreduce"; diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index b2bb8fb403..506c5fbebd 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -71,7 +71,7 @@ std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; void Communicator::SendThread() { - VLOG("SendThread start!"); + VLOG(3) << "SendThread start!"; while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); From fe6a8409241f69d52661e555fb02a1e1daca3cf7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 11:41:55 +0800 Subject: [PATCH 66/98] fix delete recv ops --- .../framework/details/async_ssa_graph_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 3f4d9f6ca4..e7cc14b0d1 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -48,9 +48,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { RpcCtxMap send_varname_to_ctx; RpcCtxMap recv_varname_to_ctx; for (auto i = 0; i < graphs.size(); ++i) { + std::vector nodes_to_delete; for (auto &node : graphs[i]->Nodes()) { VLOG(3) << "node name " << node->Name(); - std::vector nodes_to_delete; if (node && node->IsOp()) { if (node->Name() == "send") { auto send_var_name = node->Op()->Input("X")[0]; @@ -78,12 +78,12 @@ void ProcessGraph(std::vector graphs, Scope *scope) { VLOG(3) << "find and 
remove an recv op: " << recv_varname_to_ctx[recv_var_name]; } - VLOG(3) << "delete all recv ops"; - for (auto *node : nodes_to_delete) { - graphs[i]->RemoveNode(node); - } } } + VLOG(3) << "delete all recv ops"; + for (auto *node : nodes_to_delete) { + graphs[i]->RemoveNode(node); + } } // init communicator here if (send_varname_to_ctx.size() > 0) { From 3225e195912b1c467558bce192c6468d7f0e8540 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 7 Mar 2019 14:54:59 +0800 Subject: [PATCH 67/98] fix remove recv op --- .../details/async_ssa_graph_executor.cc | 21 +++++++++++++++++++ .../operators/distributed/communicator.cc | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e7cc14b0d1..b36ed8af9a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -82,6 +82,27 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } VLOG(3) << "delete all recv ops"; for (auto *node : nodes_to_delete) { + // delete input edge + for (auto *in : node->inputs) { + auto &in_outs = in->outputs; + for (auto iter = in_outs.begin(); iter != in_outs.end();) { + if (*iter == node) { + VLOG(3) << "delete input edge from " << in->Name() << " for " + << node->Name(); + iter = in_outs.erase(iter); + } else { + ++iter; + } + } + } + // delete output edge + for (auto *out : node->outputs) { + PADDLE_ENFORCE_EQ(out->outputs.size(), 0, "%s should have no outputs", + out->Name()); + VLOG(3) << "delete output edge to " << out->Name(); + graphs[i]->RemoveNode(out); + } + VLOG(3) << "delete node " << node->Name(); graphs[i]->RemoveNode(node); } } diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 506c5fbebd..f5d274b66d 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -31,7 +31,7 @@ namespace distributed { static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { - VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to one"; + VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to 1"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; From ff8054c5a7f4ea34f6f112c318c03a16adf37e64 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 10:23:54 +0800 Subject: [PATCH 68/98] can run --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 2 ++ paddle/fluid/framework/details/multi_devices_graph_pass.h | 4 ++++ paddle/fluid/operators/distributed_ops/recv_op.cc | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index b36ed8af9a..12822c64e9 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -80,6 +80,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } } } + /* VLOG(3) << "delete all recv ops"; for (auto *node : nodes_to_delete) { // delete input edge @@ -105,6 +106,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { VLOG(3) << "delete node " << node->Name(); graphs[i]->RemoveNode(node); } + */ } // init communicator here if (send_varname_to_ctx.size() > 0) { diff --git 
a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f7ec9d28de..0b9061ad60 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,6 +127,10 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool NeedCollectiveOps() const override { return false; } bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { + if (node->Op()->Type() == "recv") { + node->Op()->SetAttr("do_not_run", true); + node->Op()->Flush(); + } return false; } diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 680b484d41..afbf7a4a23 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -36,6 +36,11 @@ class RecvOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + bool do_not_run = Attr("do_not_run"); + if (do_not_run) { + VLOG(3) << "recv do not run!"; + return; + } std::vector epmap = Attr>("epmap"); std::vector varnames = Attr>("varnames"); @@ -126,6 +131,7 @@ This operator can get variables from server side. "(vector) " "the splited parameter varnames to be recved from pserver") .SetDefault(std::vector{}); + AddAttr("do_not_run", "").SetDefault(false); } }; From c0e5941e31000447c10dd64fe5dfc47309ec33c7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 10:35:01 +0800 Subject: [PATCH 69/98] add commnet for recv do_not_run --- paddle/fluid/operators/distributed_ops/recv_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index afbf7a4a23..3fd0700a07 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -131,7 +131,7 @@ This operator can get variables from server side. 
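// Note (not part of the original patch): do_not_run is how the async path
// keeps recv ops in the program, so variable dependencies stay intact, while
// skipping their execution: AsyncSSAGraphBuilder sets do_not_run=true on every
// recv op (multi_devices_graph_pass.h hunk above), RecvOp::RunImpl returns
// early when it sees the flag, and the Communicator's recv side pulls the
// parameters instead. A minimal sketch of the early return added in recv_op.cc
// above, assuming the usual Attr<bool> accessor:
//
//   if (Attr<bool>("do_not_run")) {
//     VLOG(3) << "recv do not run!";
//     return;  // parameters are fetched by the Communicator
//   }
//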
"(vector) " "the splited parameter varnames to be recved from pserver") .SetDefault(std::vector{}); - AddAttr("do_not_run", "").SetDefault(false); + AddAttr("do_not_run", "if recv need to really run").SetDefault(false); } }; From 63cd70a8b84905adc83d0fc082e4eaf15d91361b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Mar 2019 17:36:02 +0800 Subject: [PATCH 70/98] fix blocking problem --- .../operators/distributed/communicator.cc | 51 +++++++++++-------- .../operators/distributed/communicator.h | 38 +++++++------- .../operators/distributed/parameter_recv.cc | 2 + .../operators/distributed_ops/send_op.cc | 13 +++-- 4 files changed, 60 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index f5d274b66d..a7bce26234 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -75,10 +75,11 @@ void Communicator::SendThread() { while (running_) { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); + VLOG(3) << "run send graph"; for (auto &iter : send_varname_to_queue_) { auto &var_name = iter.first; auto &var_queue = iter.second; - if (var_queue->NotEmpty()) { // will block if queue is empty + if (var_queue->Size() > 0) { auto send_task = [this, &var_name, &var_queue] { VLOG(3) << "merge var " << var_name << " and send"; std::vector> vars; @@ -96,33 +97,41 @@ void Communicator::SendThread() { }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); + } else { + VLOG(3) << var_name << " queue empty"; } } for (auto &task_f : task_futures) { task_f.wait(); } + VLOG(3) << "run send graph done"; + RecvAll(); } } +void Communicator::RecvAll() { + VLOG(3) << "parallel run recv graph"; + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + recv_functor(iter.second, *recv_scope_); + }; + task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } + VLOG(3) << "run recv graph done"; +} + void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { - // parallel run recv graph - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto &var_name = iter.first; - VLOG(3) << "recv var " << var_name; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - }; - task_futures.emplace_back( - recv_threadpool_->enqueue(std::move(recv_task))); - } - for (auto &task : task_futures) { - task.wait(); - } + RecvAll(); // TODO(qiao) need to be configuable std::this_thread::sleep_for(std::chrono::milliseconds(200)); } @@ -136,7 +145,9 @@ void Communicator::Send(const std::string &var_name, PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); auto tmp_grad_var = std::make_shared(); framework::CopyVariable(*grad_var, tmp_grad_var.get()); - send_varname_to_queue_[var_name]->Push(tmp_grad_var); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); } Communicator *Communicator::GetInstance() { return communicator_.get(); } @@ -146,8 
+157,8 @@ void Communicator::Start() { // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); - recv_thread_.reset( - new std::thread(std::bind(&Communicator::RecvThread, this))); + // recv_thread_.reset( + // new std::thread(std::bind(&Communicator::RecvThread, this))); } } // namespace distributed diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index c93ad02555..3c98b36b74 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -43,37 +43,36 @@ class BlockingQueue { } bool Push(const T& elem) { - std::unique_lock lock(mutex_); - send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.push_back(elem); - recv_cv_.notify_one(); + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + } + cv_.notify_one(); return true; } bool Push(T&& elem) { - std::unique_lock lock(mutex_); - send_cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.emplace_back(std::move(elem)); - recv_cv_.notify_one(); + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + } + cv_.notify_one(); return true; } T Pop() { std::unique_lock lock(mutex_); - recv_cv_.wait(lock, [=] { return !queue_.empty(); }); + cv_.wait(lock, [=] { return !queue_.empty(); }); T rc(std::move(queue_.front())); queue_.pop_front(); + cv_.notify_one(); return rc; } - bool NotEmpty() { - std::unique_lock lock(mutex_); - recv_cv_.wait(lock, [=] { return !queue_.empty(); }); - return true; - } - size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; @@ -89,8 +88,7 @@ class BlockingQueue { std::deque queue_; mutable std::mutex mutex_; - std::condition_variable recv_cv_; - std::condition_variable send_cv_; + std::condition_variable cv_; }; using RpcCtxMap = std::unordered_map; @@ -127,6 +125,8 @@ class Communicator { void Send(const std::string& var_name, const framework::Scope& scope); private: + // recv all parameter + void RecvAll(); void SendThread(); void RecvThread(); diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index fecc76955d..c3238f28f6 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -41,6 +41,7 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { + VLOG(3) << "ParameterRecv in"; framework::Scope *local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -90,6 +91,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, } delete local_scope; + VLOG(3) << "ParameterRecv out"; } template struct ParameterRecv; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 347395b7cc..67de7b4185 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -48,12 +48,15 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { 
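// Note (not part of the original patch): after this change the async send
// path is, roughly:
//   SendOp::RunImpl          -> Communicator::Send(var_name, scope), which
//                               copies the gradient variable and pushes it
//                               into that variable's bounded BlockingQueue
//                               (single condition variable, see above);
//   Communicator::SendThread -> pops queued gradients, MergeVars() them in
//                               send_scope_, then ParameterSend ships the
//                               merged value to the pservers.
// The queue is the only hand-off between trainer threads and the send thread,
// which is why the hunk below routes SendOp through the Communicator instead
// of calling ParameterSend directly.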
PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - // auto send_functor = distributed::ParameterSend(); - // auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, - // epmap, - // height_sections); - // send_functor(rpc_ctx, scope, static_cast(sync_send)); + /* + auto send_functor = distributed::ParameterSend(); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, + height_sections); + send_functor(rpc_ctx, scope, static_cast(sync_send)); + */ + VLOG(3) << "send " << ins[0]; distributed::Communicator::GetInstance()->Send(ins[0], scope); + VLOG(3) << "send " << ins[0] << " done"; } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 0a828fef8286c6b9cd7a5ca2345d19057762dc79 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Mar 2019 23:16:50 +0800 Subject: [PATCH 71/98] add some flags for communicator --- .../operators/distributed/communicator.cc | 54 +++++++++++++++++-- .../operators/distributed/communicator.h | 23 +------- python/paddle/fluid/__init__.py | 4 ++ 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index a7bce26234..73b9800d43 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/communicator.h" +#include #include // NOLINT #include // NOLINT @@ -24,6 +25,13 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/parameter_send.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +DEFINE_bool(communicator_independent_recv_thread, true, + "use an independent to recv vars from parameter server"); +DEFINE_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); +DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); +DEFINE_int32(communicator_thread_pool_size, 5, "wait time between each recv"); + namespace paddle { namespace operators { namespace distributed { @@ -70,6 +78,38 @@ static inline void MergeVars(const std::string &var_name, std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; +Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, + const RpcCtxMap &recv_varname_to_ctx, + Scope *recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + VLOG(0) << "communicator_independent_recv_thread: " + << FLAGS_communicator_independent_recv_thread; + VLOG(0) << "communicator_send_queue_size: " + << FLAGS_communicator_send_queue_size; + VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; + VLOG(0) << "communicator_thread_pool_size: " + << FLAGS_communicator_thread_pool_size; + send_scope_.reset(new Scope()); + for (auto &iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>( + FLAGS_communicator_send_queue_size); + } + send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); + recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); +} + +Communicator::~Communicator() { + VLOG(3) << "~Communicator"; + running_ = false; + if (send_thread_) send_thread_->join(); + if (recv_thread_) recv_thread_->join(); + VLOG(3) << "~Communicator done"; +} + void Communicator::SendThread() 
{ VLOG(3) << "SendThread start!"; while (running_) { @@ -105,7 +145,9 @@ void Communicator::SendThread() { task_f.wait(); } VLOG(3) << "run send graph done"; - RecvAll(); + if (!FLAGS_communicator_independent_recv_thread) { + RecvAll(); + } } } @@ -132,8 +174,8 @@ void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { RecvAll(); - // TODO(qiao) need to be configuable - std::this_thread::sleep_for(std::chrono::milliseconds(200)); + std::this_thread::sleep_for( + std::chrono::milliseconds(FLAGS_communicator_recv_wait_ms)); } } @@ -157,8 +199,10 @@ void Communicator::Start() { // start send and recv thread send_thread_.reset( new std::thread(std::bind(&Communicator::SendThread, this))); - // recv_thread_.reset( - // new std::thread(std::bind(&Communicator::RecvThread, this))); + if (FLAGS_communicator_independent_recv_thread) { + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); + } } } // namespace distributed diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 3c98b36b74..4104cb20a3 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -96,28 +96,9 @@ using RpcCtxMap = std::unordered_map; class Communicator { public: Communicator(const RpcCtxMap& send_varname_to_ctx, - const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) - : send_varname_to_ctx_(send_varname_to_ctx), - recv_varname_to_ctx_(recv_varname_to_ctx), - recv_scope_(recv_scope) { - // get all send information from graph, build vars_to_send - send_scope_.reset(new Scope()); - for (auto& iter : send_varname_to_ctx_) { - send_varname_to_queue_[iter.first] = - std::make_shared>>(10); - } - // TODO(qiao): default 5, need to config - send_threadpool_.reset(new ::ThreadPool(5)); - recv_threadpool_.reset(new ::ThreadPool(5)); - } + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope); - ~Communicator() { - VLOG(3) << "~Communicator"; - running_ = false; - send_thread_->join(); - recv_thread_->join(); - VLOG(3) << "~Communicator done"; - } + ~Communicator(); void Start(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d12f04a6ab..8af5e1c509 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -150,6 +150,10 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + read_env_flags.append('communicator_independent_recv_thread') + read_env_flags.append('communicator_send_queue_size') + read_env_flags.append('communicator_recv_wait_ms') + read_env_flags.append('communicator_thread_pool_size') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size From eb6af305d62f233bc70a313f8c24ef5088d4bac6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 10 Mar 2019 23:18:09 +0800 Subject: [PATCH 72/98] change embedding interface addnremote_prefetch --- python/paddle/fluid/layers/nn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index efb400ccc6..48a46a0ff0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -306,7 +306,8 @@ def embedding(input, is_distributed=False, padding_idx=None, param_attr=None, - dtype='float32'): + dtype='float32', + remote_prefetch=False): """ **Embedding Layer** @@ -345,7 
+346,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = is_sparse and (not is_distributed) + remote_prefetch = is_sparse and (not is_distributed) and remote_prefetch if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( From ad5a2b3edfb437a225d7f42ab5c35b65a3b9d49e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 11:02:54 +0800 Subject: [PATCH 73/98] add some debug flags for communicator --- .../operators/distributed/communicator.cc | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 73b9800d43..06f7859f4f 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -30,7 +30,11 @@ DEFINE_bool(communicator_independent_recv_thread, true, DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); -DEFINE_int32(communicator_thread_pool_size, 5, "wait time between each recv"); +DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +DEFINE_bool(communicator_fake_rpc, false, + "fake mode does not really send any thing"); namespace paddle { namespace operators { @@ -92,6 +96,9 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_max_merge_var_num" + << FLAGS_communicator_max_merge_var_num; + VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; send_scope_.reset(new Scope()); for (auto &iter : send_varname_to_ctx_) { send_varname_to_queue_[iter.first] = @@ -123,17 +130,18 @@ void Communicator::SendThread() { auto send_task = [this, &var_name, &var_queue] { VLOG(3) << "merge var " << var_name << " and send"; std::vector> vars; - // TODO(qiao): need to be configurable - const size_t max_merge_var_num = 20; size_t merged_var_num = 0; - while (var_queue->Size() > 0 && merged_var_num < max_merge_var_num) { + while (var_queue->Size() > 0 && + merged_var_num < FLAGS_communicator_max_merge_var_num) { vars.push_back(var_queue->Pop()); merged_var_num++; } MergeVars(var_name, vars, send_scope_.get()); auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); - send_functor(ctx, *send_scope_, true); + if (!FLAGS_communicator_fake_rpc) { + send_functor(ctx, *send_scope_, true); + } }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); @@ -160,7 +168,9 @@ void Communicator::RecvAll() { auto &var_name = iter.first; VLOG(3) << "recv var " << var_name; auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); + if (!FLAGS_communicator_fake_rpc) { + recv_functor(iter.second, *recv_scope_); + } }; task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); } From 43378ad626460e11e7afd1cf8176c51fe592396b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 12:37:57 +0800 Subject: [PATCH 74/98] add flags to init --- paddle/fluid/operators/distributed/communicator.cc | 2 +- python/paddle/fluid/__init__.py | 5 +++++ 2 files changed, 6 
insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 06f7859f4f..6acb572de9 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -96,7 +96,7 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; - VLOG(0) << "communicator_max_merge_var_num" + VLOG(0) << "communicator_max_merge_var_num: " << FLAGS_communicator_max_merge_var_num; VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; send_scope_.reset(new Scope()); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8af5e1c509..c478c8ceee 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -143,6 +143,7 @@ def __bootstrap__(): read_env_flags.append('use_mkldnn') if core.is_compiled_with_dist(): + #env for rpc read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_path') read_env_flags.append('enable_rpc_profiler') @@ -150,10 +151,14 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + + # env for communicator read_env_flags.append('communicator_independent_recv_thread') read_env_flags.append('communicator_send_queue_size') read_env_flags.append('communicator_recv_wait_ms') read_env_flags.append('communicator_thread_pool_size') + read_env_flags.append('communicator_max_merge_var_num') + read_env_flags.append('communicator_fake_rpc') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size From d3a14377d5cf0376a5f0170406fecd336e3fc41a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 11 Mar 2019 15:08:38 +0800 Subject: [PATCH 75/98] add fake rpc to send --- .../operators/distributed/communicator.cc | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 6acb572de9..d3b77a758c 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -191,15 +191,17 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { - VLOG(3) << "communicator send " << var_name; - // push var into send queue by var_name - auto *grad_var = scope.FindVar(var_name); - PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); - auto tmp_grad_var = std::make_shared(); - framework::CopyVariable(*grad_var, tmp_grad_var.get()); - auto &queue = send_varname_to_queue_.at(var_name); - VLOG(3) << "send " << var_name << " queue size " << queue->Size(); - queue->Push(tmp_grad_var); + if (!FLAGS_communicator_fake_rpc) { + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); + } } Communicator 
*Communicator::GetInstance() { return communicator_.get(); } From 23d3929a4bb758b70c1aafe31b3eabedc5d2ea3d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 17:20:08 +0800 Subject: [PATCH 76/98] optimize merge vars --- .../operators/distributed/communicator.cc | 85 ++++++++++++++----- 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index d3b77a758c..91e2417d0c 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -18,12 +18,15 @@ limitations under the License. */ #include // NOLINT #include // NOLINT +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device_context.h" DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); @@ -40,28 +43,54 @@ namespace paddle { namespace operators { namespace distributed { +template +using EigenVector = framework::EigenVector; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + static inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope) { - VLOG(3) << "merge " << vars.size() << " vars " << var_name << " to 1"; PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; auto *out_var = scope->Var(var_name); if (var0->IsType()) { + VLOG(3) << "merge " << var_name << " LoDTensor" + << var0->Get().dims(); + + // init output tensor auto *out_t = out_var->GetMutable(); auto *out_ptr = out_t->mutable_data( var0->Get().dims(), cpu_place); auto numel = out_t->numel(); - for (auto i = 0; i < numel; ++i) { - out_ptr[i] = 0; - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); - out_ptr[i] += var_t.data()[i]; - } + + // check the input dims + for (auto &var : vars) { + auto &var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); + } + + // set output tensor to 0. 
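// Note (not part of the original patch): the added lines below replace the
// earlier per-element accumulation loop. The output tensor is zero-filled with
// math::SetConstant and every input is then added through Eigen, i.e.
// out = in_0 + in_1 + ... + in_{n-1} computed as whole-tensor vector ops on
// the CPU rather than a scalar loop over numel.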
+ auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto &var : vars) { + auto &in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; } } else if (var0->IsType()) { + auto &slr0 = var0->Get(); auto *out_slr = out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); @@ -74,6 +103,8 @@ static inline void MergeVars(const std::string &var_name, merge_add; auto dev_ctx = paddle::platform::CPUDeviceContext(); merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); } else { PADDLE_THROW("unsupported var type!"); } @@ -123,12 +154,13 @@ void Communicator::SendThread() { std::vector> task_futures; task_futures.reserve(send_varname_to_ctx_.size()); VLOG(3) << "run send graph"; + auto before_run_send_graph = GetCurrentUS(); for (auto &iter : send_varname_to_queue_) { auto &var_name = iter.first; auto &var_queue = iter.second; if (var_queue->Size() > 0) { auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << "merge var " << var_name << " and send"; + VLOG(3) << var_name << " merge and send"; std::vector> vars; size_t merged_var_num = 0; while (var_queue->Size() > 0 && @@ -136,12 +168,19 @@ void Communicator::SendThread() { vars.push_back(var_queue->Pop()); merged_var_num++; } + auto before_merge = GetCurrentUS(); MergeVars(var_name, vars, send_scope_.get()); + auto after_merge = GetCurrentUS(); + VLOG(3) << "merge " << var_name << " use time " + << after_merge - before_merge; auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); if (!FLAGS_communicator_fake_rpc) { send_functor(ctx, *send_scope_, true); } + auto after_send = GetCurrentUS(); + VLOG(3) << "send " << var_name << " use time " + << after_send - after_merge; }; task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); @@ -152,7 +191,9 @@ void Communicator::SendThread() { for (auto &task_f : task_futures) { task_f.wait(); } - VLOG(3) << "run send graph done"; + auto after_run_send_graph = GetCurrentUS(); + VLOG(3) << "run send graph use time " + << after_run_send_graph - before_run_send_graph; if (!FLAGS_communicator_independent_recv_thread) { RecvAll(); } @@ -161,6 +202,7 @@ void Communicator::SendThread() { void Communicator::RecvAll() { VLOG(3) << "parallel run recv graph"; + auto before_send = GetCurrentUS(); std::vector> task_futures; task_futures.reserve(recv_varname_to_ctx_.size()); for (auto &iter : recv_varname_to_ctx_) { @@ -177,7 +219,8 @@ void Communicator::RecvAll() { for (auto &task : task_futures) { task.wait(); } - VLOG(3) << "run recv graph done"; + auto after_recv = GetCurrentUS(); + VLOG(3) << "run recv graph use time " << after_recv - before_send; } void Communicator::RecvThread() { @@ -191,17 +234,15 @@ void Communicator::RecvThread() { void Communicator::Send(const std::string &var_name, const framework::Scope &scope) { - if (!FLAGS_communicator_fake_rpc) { - VLOG(3) << "communicator send " << var_name; - // push var into send queue by var_name - auto *grad_var = scope.FindVar(var_name); - PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); - auto tmp_grad_var = std::make_shared(); - framework::CopyVariable(*grad_var, tmp_grad_var.get()); - 
auto &queue = send_varname_to_queue_.at(var_name); - VLOG(3) << "send " << var_name << " queue size " << queue->Size(); - queue->Push(tmp_grad_var); - } + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); } Communicator *Communicator::GetInstance() { return communicator_.get(); } From 9b74707cbf293f17e3b8a84c319f14ee3370f53d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 17:24:05 +0800 Subject: [PATCH 77/98] fix compile problem --- paddle/fluid/operators/distributed/communicator.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 91e2417d0c..f17af56400 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -66,8 +66,6 @@ static inline void MergeVars(const std::string &var_name, // init output tensor auto *out_t = out_var->GetMutable(); - auto *out_ptr = out_t->mutable_data( - var0->Get().dims(), cpu_place); auto numel = out_t->numel(); // check the input dims @@ -83,7 +81,7 @@ static inline void MergeVars(const std::string &var_name, constant_functor(cpu_ctx, out_t, static_cast(0)); // sum all vars to out - auto result = EigenVector::Flatten(*out_t); + auto result = EigenVector::Flatten(*out_t); for (auto &var : vars) { auto &in_t = var->Get(); auto in = EigenVector::Flatten(in_t); From 0fcdae8418b8bbc06013ca540d8a7b8d2e4d790e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 12 Mar 2019 23:08:55 +0800 Subject: [PATCH 78/98] add communicator_test --- .../operators/distributed/CMakeLists.txt | 1 + .../operators/distributed/communicator.cc | 62 ---------- .../operators/distributed/communicator.h | 61 ++++++++++ .../distributed/communicator_test.cc | 110 ++++++++++++++++++ 4 files changed, 172 insertions(+), 62 deletions(-) create mode 100644 paddle/fluid/operators/distributed/communicator_test.cc diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 750aac8dd0..972b4f67a8 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -55,6 +55,7 @@ cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc mem cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) +cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index f17af56400..72f26e91b2 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -24,9 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); @@ -43,71 +40,12 @@ namespace paddle { namespace operators { namespace distributed { -template -using EigenVector = framework::EigenVector; - inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); return 1e+6 * time.tv_sec + time.tv_usec; } -static inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope) { - PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - VLOG(3) << "merge " << var_name << " LoDTensor" - << var0->Get().dims(); - - // init output tensor - auto *out_t = out_var->GetMutable(); - auto numel = out_t->numel(); - - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ(var_t.numel(), numel, "should have the same dims"); - } - - // set output tensor to 0. - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant - constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - math::scatter::MergeAdd - merge_add; - auto dev_ctx = paddle::platform::CPUDeviceContext(); - merge_add(dev_ctx, inputs, out_slr, false); - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims(); - } else { - PADDLE_THROW("unsupported var type!"); - } -} - std::unique_ptr Communicator::communicator_(nullptr); std::once_flag Communicator::init_flag_; diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 4104cb20a3..3fe2a21232 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -24,6 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -91,6 +93,65 @@ class BlockingQueue { std::condition_variable cv_; }; +template +using EigenVector = framework::EigenVector; + +inline void MergeVars(const std::string& var_name, + const std::vector>& vars, + Scope* scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto& var0 = vars[0]; + auto* out_var = scope->Var(var_name); + if (var0->IsType()) { + auto dims = var0->Get().dims(); + VLOG(3) << "merge " << var_name << " LoDTensor " << dims; + + // init output tensor + auto* out_t = out_var->GetMutable(); + out_t->mutable_data(dims, cpu_place); + + // check the input dims + for (auto& var : vars) { + auto& var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims"); + } + + // set output tensor to 0. + auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto& var : vars) { + auto& in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; + } + } else if (var0->IsType()) { + auto& slr0 = var0->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto& var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + using RpcCtxMap = std::unordered_map; class Communicator { diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc new file mode 100644 index 0000000000..5294ac33d1 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
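// Note (not part of the original patch): the two tests that follow exercise
// MergeVars for both supported variable types. The LoDTensor case merges ten
// dense inputs filled with the constants 0..9 and expects every output element
// to equal their sum (45); the SelectedRows case relies on MergeAdd to union
// the rows and add values for duplicate row ids, so merged row r holds
// r * (10 - r).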
+ +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/communicator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +TEST(communicator, merge_lod_tensors) { + auto cpu_place = platform::CPUPlace(); + auto dims = framework::make_ddim({2, 3}); + std::vector> in_vars; + float out_value = 0; + for (auto i = 0; i < 10; ++i) { + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *tensor = var->GetMutable(); + auto *data = tensor->mutable_data(dims, cpu_place); + for (auto j = 0; j < tensor->numel(); ++j) { + data[j] = static_cast(i); + } + out_value += static_cast(i); + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_tensor = scope->FindVar(out_name)->Get(); + auto *out_data = out_tensor.data(); + ASSERT_EQ(out_tensor.dims(), dims); + for (auto i = 0; i < out_tensor.numel(); ++i) { + ASSERT_EQ(out_data[i], out_value); + } +} + +TEST(communicator, merge_selected_rows) { + auto cpu_place = platform::CPUPlace(); + int64_t width = 10; + std::vector> in_vars; + const int64_t height = 100; + for (auto i = 0; i < 10; ++i) { + std::vector rows; + for (auto k = 0; k <= i; ++k) { + rows.push_back(k); + } + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *slr = var->GetMutable(); + slr->set_height(height); + slr->set_rows(rows); + auto dims = + framework::make_ddim({static_cast(rows.size()), width}); + auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); + for (auto i = 0; i < rows.size(); ++i) { + for (auto j = 0; j < width; ++j) { + data[i * width + j] = static_cast(rows[i]); + } + } + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_slr = scope->FindVar(out_name)->Get(); + auto &out_t = out_slr.value(); + auto *out_data = out_t.data(); + ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); + std::vector out_values; + out_values.reserve(10); + for (auto i = 0; i < 10; ++i) { + out_values.push_back(static_cast(i * (10 - i))); + } + for (auto i = 0; i < out_slr.rows().size(); ++i) { + ASSERT_EQ(out_slr.rows()[i], i); + for (auto j = 0; j < width; ++j) { + ASSERT_EQ(out_data[i * width + j], out_values[i]); + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle From c567debcd94e4d5aaf46dddccb1d17f06b992c89 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 13 Mar 2019 19:01:53 +0800 Subject: [PATCH 79/98] optimize log --- paddle/fluid/operators/distributed/communicator.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 72f26e91b2..3661c2763d 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -128,8 +128,11 @@ void Communicator::SendThread() { task_f.wait(); } auto after_run_send_graph = GetCurrentUS(); - VLOG(3) << "run send graph use time " - << after_run_send_graph - before_run_send_graph; + auto send_graph_use_time = after_run_send_graph - before_run_send_graph; + if (send_graph_use_time > 10) { + VLOG(1) 
<< "run send graph use time " + << after_run_send_graph - before_run_send_graph; + } if (!FLAGS_communicator_independent_recv_thread) { RecvAll(); } @@ -156,7 +159,7 @@ void Communicator::RecvAll() { task.wait(); } auto after_recv = GetCurrentUS(); - VLOG(3) << "run recv graph use time " << after_recv - before_send; + VLOG(1) << "run recv graph use time " << after_recv - before_send; } void Communicator::RecvThread() { From 347178bd977eb1323402d10a64bc3c3f6b157ae6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 14 Mar 2019 15:50:08 +0800 Subject: [PATCH 80/98] fix pserver memory leak --- paddle/fluid/operators/distributed/grpc/grpc_server.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index a0ed79201d..f32681738c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -107,9 +107,11 @@ class RequestSend final : public RequestBase { int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; + /* if (!request_handler_->sync_mode()) { request_->ReleaseOwnershipOfLocalScope(); } + */ request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } From 065b68b6ca53b3eb140a9f3ebe95b8cdd856fef4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 14 Mar 2019 23:34:25 +0800 Subject: [PATCH 81/98] clean code --- .../fluid/operators/distributed/grpc/grpc_server.cc | 6 ------ paddle/fluid/operators/distributed/parameter_send.cc | 6 +++--- paddle/fluid/operators/distributed/request_handler.h | 6 +----- .../operators/distributed/request_handler_impl.cc | 11 ++--------- .../fluid/operators/distributed/variable_response.h | 11 +++-------- .../fluid/operators/distributed_ops/send_recv_util.h | 1 + 6 files changed, 10 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index f32681738c..b86f0a53c4 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -106,12 +106,6 @@ class RequestSend final : public RequestBase { auto invar = request_->GetVar(); int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; - - /* - if (!request_handler_->sync_mode()) { - request_->ReleaseOwnershipOfLocalScope(); - } - */ request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 3fe3be193a..388bc781c1 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -80,7 +80,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &send_slr = send_var->Get(); auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); - auto send_rows = send_slr.rows(); + auto &send_rows = send_slr.rows(); std::vector> outs_rows_idx; std::vector> outs_dense_idx; @@ -88,7 +88,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, outs_dense_idx.resize(out_num); auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto src = send_slr.value().data(); + auto *src = send_slr.value().data(); // create output var in local scope std::vector outs; @@ -110,8 +110,8 @@ void ParameterSend::operator()(const RpcContext 
&rpc_ctx, outs[i]->set_height(rpc_ctx.height_sections[i]); auto dims = send_slr.GetCompleteDims(); dims[0] = rows_idx.size(); - outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); outs[i]->mutable_rows()->clear(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); if (rows_idx.size() > 0) { for (auto idx : rows_idx) { outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index e777d515ce..991158ac72 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -71,15 +71,13 @@ class VarHandle { VarHandle(const std::string ep, const std::string& method, const std::string& name, const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr, - bool delete_local_scope = false) + const framework::Scope* p_scope = nullptr) : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; name_ = name; method_ = method; - delete_local_scope_ = delete_local_scope; } virtual ~VarHandle() {} @@ -101,7 +99,6 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } - if (delete_local_scope_ && scope_) delete scope_; VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } @@ -128,7 +125,6 @@ class VarHandle { std::string name_; // RPC method name. std::string method_; - bool delete_local_scope_; protected: std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index e5318f98ca..e289ec929d 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -59,15 +59,8 @@ bool RequestSendHandler::Handle(const std::string& varname, "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); } - - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - delete scope; - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index edc12e2091..eb3265e092 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,13 +60,14 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = &scope->NewScope(); + local_scope_ = scope->NewTmpScope(); } } virtual ~VariableResponse() { if (local_scope_) { - scope_->DeleteScope(local_scope_); + delete local_scope_; + local_scope_ = nullptr; } } @@ -86,12 +87,6 @@ class VariableResponse { inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } inline std::string TableName() const { return meta_.table_name(); } - inline void ReleaseOwnershipOfLocalScope() { - PADDLE_ENFORCE(create_scope_, - "only when create_scope_ is true can you release the " - "ownership of local scope"); - local_scope_ = nullptr; - } // should call parse first. 
framework::Variable* GetVar() { diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index 1e91f0dd51..01caee9a92 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -54,6 +54,7 @@ inline int FindOutIdx(int row, const std::vector& abs_sections) { return i - 1; } } + PADDLE_ENFORCE_LT(row, abs_sections.back(), "row should be less then max id"); return abs_sections.size() - 1; } From ea0df4e8a2cf291a0e6626771c58d1d75635b3c1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 16 Mar 2019 15:11:45 +0800 Subject: [PATCH 82/98] add some check --- .../fluid/operators/distributed/parameter_recv.cc | 3 +++ .../fluid/operators/distributed/parameter_send.cc | 2 +- .../operators/distributed_ops/send_recv_util.h | 10 ---------- paddle/fluid/operators/split_selected_rows_op.h | 13 +++++++++---- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index c3238f28f6..ae6516b246 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -80,7 +80,9 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, framework::Tensor *recv_tensor = recv_var->GetMutable(); auto dev_ctx = paddle::platform::CPUDeviceContext(); + int64_t recv_numel = 0; for (auto *in : recved_tensors) { + recv_numel += in->numel(); auto in_stride = framework::stride_numel(in->dims()); auto out_stride = framework::stride_numel(recv_tensor->dims()); StridedNumelCopyWithAxis( @@ -88,6 +90,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, in->data(), in_stride, in_stride[0]); output_offset += in_stride[0]; } + PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); } delete local_scope; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 388bc781c1..ec2884c252 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -99,7 +99,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, // split rows index into output sparse vars for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = FindOutIdx(send_rows[i], abs_sections); + int out_idx = GetSectionIndex(send_rows[i], abs_sections); outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index 01caee9a92..c05a1ff1da 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -48,16 +48,6 @@ inline bool NeedSend(const framework::Scope& scope, return false; } -inline int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - PADDLE_ENFORCE_LT(row, abs_sections.back(), "row should be less then max id"); - return abs_sections.size() - 1; -} - inline std::vector ToAbsoluteSection( const std::vector& height_sections) { std::vector abs_sections; diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index c29065649e..9ec459e2a6 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ 
b/paddle/fluid/operators/split_selected_rows_op.h @@ -32,7 +32,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { auto abs_sections = ToAbsoluteSection(height_sections); - auto x_rows = x->rows(); + auto& x_rows = x->rows(); + auto height = x->height(); std::vector> outs_rows_idx; std::vector> outs_dense_idx; @@ -44,8 +45,10 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { // split rows index into output sparse vars for (size_t i = 0; i < x_rows.size(); ++i) { - int out_idx = FindOutIdx(x_rows[i], abs_sections); - outs_rows_idx[out_idx].push_back(x_rows[i]); + auto& id = x_rows[i]; + PADDLE_ENFORCE_LT(id, height); + int out_idx = GetSectionIndex(id, abs_sections); + outs_rows_idx[out_idx].push_back(id); outs_dense_idx[out_idx].push_back(i); } auto place = ctx.GetPlace(); @@ -59,7 +62,9 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { outs[i]->mutable_rows()->clear(); if (rows_idx.size() > 0) { for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + auto id_offset = idx - abs_sections[i]; + PADDLE_ENFORCE_LT(id_offset, height_sections[i]); + outs[i]->mutable_rows()->push_back(id_offset); } auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); for (size_t j = 0; j < rows_idx.size(); j++) { From 039d783db5ed14a5eabadb3177c800697afec39d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 18 Mar 2019 13:35:37 +0800 Subject: [PATCH 83/98] change communicator_recv_wait_ms to communicator_max_send_grad_num_before_recv --- .../operators/distributed/communicator.cc | 23 ++++++++++++++----- .../operators/distributed/communicator.h | 2 ++ python/paddle/fluid/__init__.py | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 3661c2763d..eba18c6777 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -29,7 +29,8 @@ DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); -DEFINE_int32(communicator_recv_wait_ms, 200, "wait time between each recv"); +DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, + "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); DEFINE_int32(communicator_max_merge_var_num, 20, "max var num to merge and send"); @@ -60,7 +61,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, << FLAGS_communicator_independent_recv_thread; VLOG(0) << "communicator_send_queue_size: " << FLAGS_communicator_send_queue_size; - VLOG(0) << "communicator_recv_wait_ms: " << FLAGS_communicator_recv_wait_ms; + VLOG(0) << "communicator_max_send_grad_num_before_recv: " + << FLAGS_communicator_max_send_grad_num_before_recv; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; VLOG(0) << "communicator_max_merge_var_num: " @@ -102,6 +104,10 @@ void Communicator::SendThread() { while (var_queue->Size() > 0 && merged_var_num < FLAGS_communicator_max_merge_var_num) { vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } merged_var_num++; } auto before_merge = GetCurrentUS(); @@ -129,7 +135,7 @@ void 
Communicator::SendThread() { } auto after_run_send_graph = GetCurrentUS(); auto send_graph_use_time = after_run_send_graph - before_run_send_graph; - if (send_graph_use_time > 10) { + if (send_graph_use_time > 100) { VLOG(1) << "run send graph use time " << after_run_send_graph - before_run_send_graph; } @@ -165,9 +171,14 @@ void Communicator::RecvAll() { void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { - RecvAll(); - std::this_thread::sleep_for( - std::chrono::milliseconds(FLAGS_communicator_recv_wait_ms)); + auto grad_num = grad_num_.load(); + if (grad_num > FLAGS_communicator_max_send_grad_num_before_recv) { + VLOG(1) << "current grad num " << grad_num; + RecvAll(); + grad_num_.store(0); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } } } diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 3fe2a21232..859c0a7f51 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -184,6 +185,7 @@ class Communicator { std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv // the following code is for initialize the commnunicator public: diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c478c8ceee..97ac7fd97b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,7 +155,7 @@ def __bootstrap__(): # env for communicator read_env_flags.append('communicator_independent_recv_thread') read_env_flags.append('communicator_send_queue_size') - read_env_flags.append('communicator_recv_wait_ms') + read_env_flags.append('communicator_max_send_grad_num_before_recv') read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_fake_rpc') From 37f6b9ab7a24ace68167b68bfc3bce746a8abf7a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 12:20:38 +0800 Subject: [PATCH 84/98] fix build test=develop --- .../fluid/framework/details/multi_devices_graph_pass.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 1d9ce17c50..21b0687f63 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool UseGPU() const; - bool NeedCollectiveForGrad(const std::string &grad_name, - std::vector ops) const; + virtual bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; bool IsScaleLossOp(ir::Node *node) const; @@ -117,7 +117,10 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const override {} - bool NeedCollectiveOps() const override { return false; } + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const { + return false; + } bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() 
== "recv") { From d640c6cfa93179a592b662df36025e6e57c6fb17 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 12:55:51 +0800 Subject: [PATCH 85/98] fix pylint --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 4ddfc084e0..41e5f47976 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1401,8 +1401,9 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops if 'Param' in op.input_names and - op.input("Param")[0] == self.table_name + op for op in self.optimize_ops + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ From 392e97aae5451b5135ff3c971b4d8cc95ec9ae99 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 13:04:00 +0800 Subject: [PATCH 86/98] fix cpplint test=develop --- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 2 +- paddle/fluid/framework/ir/pass.cc | 4 ++++ paddle/fluid/operators/distributed/communicator.h | 2 ++ paddle/fluid/operators/distributed/grpc/grpc_server.cc | 1 + paddle/fluid/operators/distributed/parameter_prefetch.cc | 1 + paddle/fluid/operators/hierarchical_sigmoid_op.h | 3 +++ 6 files changed, 12 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 51f625cf2d..ec0a0064c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -25,7 +25,7 @@ #include #include // ThreadPool in thrird party -#include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 823697495e..a03ba10b94 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" + +#include +#include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 859c0a7f51..41155bfc31 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -18,6 +18,8 @@ limitations under the License. */ #include #include #include +#include +#include #include #include diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index b86f0a53c4..0eb313f75d 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include #include #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 539a038099..a1eba34662 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/operators/distributed/parameter_prefetch.h" diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 751091478e..ed97878240 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include +#include #include #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" From b542639dc04f55584a70cb44413ca4ba9c8f2abe Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 15:58:52 +0800 Subject: [PATCH 87/98] code clean test=develop --- .../details/async_ssa_graph_executor.cc | 27 ------------------- .../operators/distributed_ops/send_op.cc | 17 ++++++------ 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 12822c64e9..5ca676ccde 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -80,33 +80,6 @@ void ProcessGraph(std::vector graphs, Scope *scope) { } } } - /* - VLOG(3) << "delete all recv ops"; - for (auto *node : nodes_to_delete) { - // delete input edge - for (auto *in : node->inputs) { - auto &in_outs = in->outputs; - for (auto iter = in_outs.begin(); iter != in_outs.end();) { - if (*iter == node) { - VLOG(3) << "delete input edge from " << in->Name() << " for " - << node->Name(); - iter = in_outs.erase(iter); - } else { - ++iter; - } - } - } - // delete output edge - for (auto *out : node->outputs) { - PADDLE_ENFORCE_EQ(out->outputs.size(), 0, "%s should have no outputs", - out->Name()); - VLOG(3) << "delete output edge to " << out->Name(); - graphs[i]->RemoveNode(out); - } - VLOG(3) << "delete node " << node->Name(); - graphs[i]->RemoveNode(node); - } - */ } // init communicator here if (send_varname_to_ctx.size() > 0) { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 67de7b4185..47688d0ad4 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -48,15 +48,14 @@ class SendOp : public framework::OperatorBase { if (send_varnames.size() > 0) { PADDLE_ENFORCE_EQ(ins.size(), 1, ""); - /* - auto send_functor = distributed::ParameterSend(); - auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); - */ - VLOG(3) << "send " << ins[0]; - distributed::Communicator::GetInstance()->Send(ins[0], scope); - VLOG(3) << "send " << ins[0] << " done"; + if (distributed::Communicator::GetInstance() == nullptr) { + auto send_functor = distributed::ParameterSend(); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, 
+ height_sections); + send_functor(rpc_ctx, scope, static_cast(sync_send)); + } else { + distributed::Communicator::GetInstance()->Send(ins[0], scope); + } } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); From 33be014535609d3e4d58a36bf5243390cd8cc265 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 17:12:58 +0800 Subject: [PATCH 88/98] fix distribute compile problem test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 6 +++++- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a3a10eade8..9c4634bcbc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -89,7 +89,11 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) -cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor communicator) +set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor) +if(WITH_DISTRIBUTE) + list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator) +endif() +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 5ca676ccde..e9aad5d264 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/variable_helper.h" + +#ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/fluid/operators/distributed/communicator.h" +#endif namespace paddle { namespace framework { @@ -43,6 +46,7 @@ inline void NewTempScopeAndInitVars(const std::vector &var_infos, // get RpcContext and remote send and recv op void ProcessGraph(std::vector graphs, Scope *scope) { +#ifdef PADDLE_WITH_DISTRIBUTE using RpcCtxMap = operators::distributed::RpcCtxMap; VLOG(3) << "ProcessGraph"; RpcCtxMap send_varname_to_ctx; @@ -88,6 +92,7 @@ void ProcessGraph(std::vector graphs, Scope *scope) { recv_varname_to_ctx, scope); operators::distributed::Communicator::GetInstance()->Start(); } +#endif } AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( From b68f84090bfc00c2c73aa49aca5f760bd2859352 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 27 Mar 2019 19:09:58 +0800 Subject: [PATCH 89/98] fix test_split_selected_rows_op test=develop --- .../paddle/fluid/tests/unittests/test_split_selected_rows_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index f8847e1570..d8c57d964d 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -38,7 +38,7 @@ class TestSpliteSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() rows = [0, 5, 7, 4, 20] - height = 20 + height = 21 row_numel 
= 2 # initialize input variable X From 34890fd3b129f85f28489453ddd1d5f62dd526f7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 28 Mar 2019 09:07:50 +0800 Subject: [PATCH 90/98] fix gpu build for lookup_table_op test=develop --- paddle/fluid/operators/lookup_table_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 0af8b9e69c..a863af4af9 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -84,7 +84,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); if (!epmap.empty()) { From 61912e879d23811e966fc6dae8eeaf080056b4e4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 29 Mar 2019 21:24:29 +0800 Subject: [PATCH 91/98] test_dist_base set runtime_split_send_recv to false test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 9fd2fe739e..a5d8cd4660 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -52,7 +52,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd - config.runtime_split_send_recv = True + # config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, From a1821a04493152facc8ff63a2bcd6b339028d7a5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 30 Mar 2019 22:52:19 +0800 Subject: [PATCH 92/98] remote remote_prefetch in embedding layer test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 3 +++ paddle/fluid/operators/lookup_table_op.h | 3 ++- python/paddle/fluid/layers/nn.py | 5 ++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e9aad5d264..8fe4cdc709 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,6 +81,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; + } else if (node->Name() == "lookup_table") { + VLOG(0) << "set lookup_table op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); } } } diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 524565a439..62e298e066 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -50,11 +50,12 @@ class LookupTableKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); + auto remote_prefetch = context.Attr("remote_prefetch"); auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, 
then the parameter will be fetched from remote // parameter // server diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9743cfa727..f2413f6033 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -341,8 +341,7 @@ def embedding(input, is_distributed=False, padding_idx=None, param_attr=None, - dtype='float32', - remote_prefetch=False): + dtype='float32'): """ **Embedding Layer** @@ -381,7 +380,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = is_sparse and (not is_distributed) and remote_prefetch + remote_prefetch = is_sparse and (not is_distributed) if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( From df45c8c538bddc1d43f933438413d4143c588fce Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 30 Mar 2019 23:00:17 +0800 Subject: [PATCH 93/98] update nce and hierarchical_sigmoid remote_prefetch test=develop --- paddle/fluid/framework/details/async_ssa_graph_executor.cc | 5 +++-- paddle/fluid/operators/hierarchical_sigmoid_op.h | 3 ++- paddle/fluid/operators/nce_op.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 8fe4cdc709..52641260a6 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,8 +81,9 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; - } else if (node->Name() == "lookup_table") { - VLOG(0) << "set lookup_table op remote_prefetch to false"; + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); } } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index ed97878240..82c8171ca5 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -68,8 +68,9 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { size_t num_classes = static_cast(ctx.Attr("num_classes")); // for remote prefetch + auto remote_prefetch = ctx.Attr("remote_prefetch"); auto epmap = ctx.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 25b6ed851b..12f3118ec7 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -156,9 +156,10 @@ class NCEKernel : public framework::OpKernel { auto input_mat = EigenMatrix::From(*(context.Input("Input"))); // for remote prefetch + auto remote_prefetch = context.Attr("remote_prefetch"); auto epmap = context.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server From 8342f12e3159c74cb6753be15c6661a3bf5ac789 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 31 Mar 2019 09:02:50 +0800 Subject: [PATCH 94/98] fix set remote_prefetch test=develop --- 
paddle/fluid/framework/details/async_ssa_graph_executor.cc | 4 ---- paddle/fluid/framework/details/multi_devices_graph_pass.h | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 52641260a6..e9aad5d264 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -81,10 +81,6 @@ void ProcessGraph(std::vector graphs, Scope *scope) { nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; - } else if (node->Name() == "lookup_table" || node->Name() == "nce" || - node->Name() == "hierarchical_sigmoid") { - VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; - node->Op()->SetAttr("remote_prefetch", false); } } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index a3fe9e8b13..82d003fad7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,8 +127,13 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() == "recv") { + VLOG(0) << "set recv op do_not_run to true"; node->Op()->SetAttr("do_not_run", true); node->Op()->Flush(); + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); } return false; } From 9db1a9e1288433878128ba40f88a32e4ef5a1691 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 31 Mar 2019 12:03:08 +0800 Subject: [PATCH 95/98] change log level test=develop --- paddle/fluid/framework/details/multi_devices_graph_pass.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 82d003fad7..26fc8dc198 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -127,12 +127,12 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() == "recv") { - VLOG(0) << "set recv op do_not_run to true"; + VLOG(1) << "set recv op do_not_run to true"; node->Op()->SetAttr("do_not_run", true); node->Op()->Flush(); } else if (node->Name() == "lookup_table" || node->Name() == "nce" || node->Name() == "hierarchical_sigmoid") { - VLOG(0) << "set " << node->Name() << " op remote_prefetch to false"; + VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); } return false; From fb6cc3a1bd40378b3a9d560bd975ab22b730eb2d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 09:06:33 +0800 Subject: [PATCH 96/98] follow commnet, optimize code and add comment test=develop --- .../framework/details/multi_devices_graph_pass.h | 3 +++ paddle/fluid/framework/scope.h | 4 ++++ .../fluid/operators/distributed/parameter_send.cc | 13 ++++++------- paddle/fluid/operators/distributed_ops/send_op.cc | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git 
a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 26fc8dc198..7cc68dd2d5 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -132,8 +132,11 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { node->Op()->Flush(); } else if (node->Name() == "lookup_table" || node->Name() == "nce" || node->Name() == "hierarchical_sigmoid") { + // in async_mode, we do not need remote prefetch, because communicator + // will do async parameter recv. VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; node->Op()->SetAttr("remote_prefetch", false); + node->Op()->Flush(); } return false; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index cd752077d6..6665458d4c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -52,6 +52,10 @@ class Scope { /// Mark it to const because that new kid scope cannot change parent scope. Scope& NewScope() const; + /// Create a sub-scope for current scope but do not record it in the kids to + /// avoid performance problems. + /// Note!!! You should delete the result pointer yourself to avoid memory + /// leak! Scope* NewTmpScope() const; /// Create a variable with given name if it doesn't exist. diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index ec2884c252..4858dbe84e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -81,8 +81,8 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); auto &send_rows = send_slr.rows(); - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; outs_rows_idx.resize(out_num); outs_dense_idx.resize(out_num); @@ -99,7 +99,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, // split rows index into output sparse vars for (size_t i = 0; i < send_rows.size(); ++i) { - int out_idx = GetSectionIndex(send_rows[i], abs_sections); + size_t out_idx = GetSectionIndex(send_rows[i], abs_sections); outs_rows_idx[out_idx].push_back(send_rows[i]); outs_dense_idx[out_idx].push_back(i); } @@ -160,10 +160,9 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, } } - // note!! 
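A minimal usage sketch for the temporary-scope contract documented in the hunk above (illustrative only, not part of the patch): the caller owns the scope returned by NewTmpScope() and must free it itself, because the child is not recorded in the parent's kids. The helper function and variable name below are hypothetical; note that PATCH 97 later in this series changes the return type to std::unique_ptr<Scope>, which removes the manual delete.

    // Illustrative sketch, assuming the raw-pointer NewTmpScope() shown above.
    #include "paddle/fluid/framework/scope.h"

    void RunInTmpScope(const paddle::framework::Scope &parent) {
      // The temporary scope is not tracked by the parent, so the parent will
      // never free it for us.
      paddle::framework::Scope *tmp = parent.NewTmpScope();
      tmp->Var("tmp_out");  // scratch variables live only in the tmp scope
      // ... run whatever work needs the scratch scope here ...
      delete tmp;  // required, otherwise the scope leaks
    }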
only support sync send now - if (true || sync) { - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + if (sync) { + for (auto &handle : rets) { + PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); } } diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 47688d0ad4..b08cd0942f 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -52,7 +52,7 @@ class SendOp : public framework::OperatorBase { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, height_sections); - send_functor(rpc_ctx, scope, static_cast(sync_send)); + send_functor(rpc_ctx, scope, true); } else { distributed::Communicator::GetInstance()->Send(ins[0], scope); } From 9861a92f6f014b826050b1c292eff3fb1b6ea5dc Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 12:19:40 +0800 Subject: [PATCH 97/98] change the return type of NewTempScope to unique ptr test=develop --- paddle/fluid/framework/scope.cc | 4 +++- paddle/fluid/framework/scope.h | 4 +--- .../operators/distributed/parameter_prefetch.cc | 13 ++++++------- .../fluid/operators/distributed/parameter_recv.cc | 5 ++--- .../fluid/operators/distributed/parameter_send.cc | 10 ++++------ .../fluid/operators/distributed/variable_response.h | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index aa1039baf0..49e22a5ad3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -59,7 +59,9 @@ Scope& Scope::NewScope() const { return *child; } -Scope* Scope::NewTmpScope() const { return new Scope(this); } +std::unique_ptr Scope::NewTmpScope() const { + return std::unique_ptr(new Scope(this)); +} Variable* Scope::Var(const std::string& name) { SCOPE_VARS_WRITER_LOCK diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 6665458d4c..5f3d106e09 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -54,9 +54,7 @@ class Scope { /// Create a sub-scope for current scope but do not record it in the kids to /// avoid performance problems. - /// Note!!! You should delete the result pointer yourself to avoid memory - /// leak! - Scope* NewTmpScope() const; + std::unique_ptr NewTmpScope() const; /// Create a variable with given name if it doesn't exist. /// Caller doesn't own the returned Variable. 
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index a686672813..7c33153ba7 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -160,7 +160,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - framework::Scope* local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -206,7 +206,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, auto splited_ids = SplitIds(ids_vector, height_sections); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, - local_scope); + local_scope.get()); // create output var in local scope for (auto& name : out_var_names) { @@ -215,12 +215,12 @@ void prefetch(const std::string& id_name, const std::string& out_name, std::vector rets; for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope, in_var_names[i])) { + if (NeedSend(*local_scope.get(), in_var_names[i])) { VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] << " to get " << out_var_names[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], cpu_ctx, *local_scope, in_var_names[i], out_var_names[i], - table_names[i])); + epmap[i], cpu_ctx, *local_scope.get(), in_var_names[i], + out_var_names[i], table_names[i])); } else { VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; } @@ -232,8 +232,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, local_scope, &actual_ctx); - delete local_scope; + context, local_scope.get(), &actual_ctx); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index ae6516b246..2466be3254 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -42,7 +42,7 @@ template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { VLOG(3) << "ParameterRecv in"; - framework::Scope *local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -64,7 +64,7 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, recved_tensors.push_back(t); VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope, recv_var_name, + *local_scope.get(), recv_var_name, recv_var_name)); } for (size_t i = 0; i < rets.size(); i++) { @@ -93,7 +93,6 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); } - delete local_scope; VLOG(3) << "ParameterRecv out"; } diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 4858dbe84e..c8a00cce7e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ 
b/paddle/fluid/operators/distributed/parameter_send.cc @@ -40,7 +40,7 @@ using DDim = framework::DDim; template void ParameterSend::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, bool sync) { - framework::Scope *local_scope = scope.NewTmpScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -150,10 +150,10 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { auto &send_var_name = rpc_ctx.splited_var_names[i]; auto &endpoint = rpc_ctx.epmap[i]; - if (NeedSend(*local_scope, send_var_name)) { + if (NeedSend(*local_scope.get(), send_var_name)) { VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar(endpoint, cpu_ctx, *local_scope, - send_var_name)); + rets.push_back(rpc_client->AsyncSendVar( + endpoint, cpu_ctx, *local_scope.get(), send_var_name)); } else { VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.splited_var_names[i]; @@ -165,8 +165,6 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); } } - - delete local_scope; } template struct ParameterSend; diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index eb3265e092..3cabcd22cd 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -60,7 +60,7 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = scope->NewTmpScope(); + local_scope_ = scope->NewTmpScope().release(); } } From 4031c1a7b1248f0f909dc30dd852aacedb4a4daa Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 1 Apr 2019 13:25:58 +0800 Subject: [PATCH 98/98] fix ci build test=develop --- paddle/fluid/operators/distributed/parameter_prefetch.cc | 1 + paddle/fluid/operators/distributed/parameter_recv.cc | 1 + paddle/fluid/operators/distributed/parameter_send.cc | 1 + 3 files changed, 3 insertions(+) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 7c33153ba7..0e8d877e08 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 2466be3254..e7d4c262aa 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index c8a00cce7e..9ce4244452 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include
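The series above (in particular PATCH 79 and PATCH 83) moves the communicator from a fixed recv wait interval to a counter-based scheme: the send thread counts how many merged gradients have gone out, and the recv thread pulls parameters back only after that counter passes FLAGS_communicator_max_send_grad_num_before_recv, then resets it. The sketch below is a simplified, self-contained model of that coordination pattern; it is illustrative only, uses plain std::thread instead of the actual Communicator and ThreadPool classes, and replaces the gflags with hypothetical constants.

    // Simplified model of the send/recv coordination from PATCH 83 (illustrative only).
    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    int main() {
      const unsigned max_send_grad_num_before_recv = 20;  // stand-in for the flag
      std::atomic<unsigned> grad_num{0};  // gradients sent since the last recv
      std::atomic<bool> running{true};

      std::thread send_thread([&] {
        for (int step = 0; step < 100; ++step) {
          // pretend one merged gradient of the "first" grad var was sent
          grad_num.fetch_add(1, std::memory_order_relaxed);
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
        running = false;
      });

      std::thread recv_thread([&] {
        while (running) {
          unsigned n = grad_num.load();
          if (n > max_send_grad_num_before_recv) {
            std::cout << "recv parameters after " << n << " sends\n";
            grad_num.store(0);  // reset, as RecvThread does after RecvAll()
          } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
          }
        }
      });

      send_thread.join();
      recv_thread.join();
      return 0;
    }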