Merge pull request #11616 from chengduoZH/fix_parallel_exe

Enhance Parallel Executor stability
wangkuiyi-patch-3
Yu Yang 7 years ago committed by GitHub
commit 9b3f48d7e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() {
int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device; int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
std::vector<std::function<void()>> broadcast_calls; std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToNCCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
for (auto out_var_handle : out_var_handles) { for (auto out_var_handle : out_var_handles) {
Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
->FindVar(out_var_handle->name_); ->FindVar(out_var_handle->name_);
@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() {
send_recv_buffer = const_cast<void *>(in_tensor.data<void>()); send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
out_handle = out_var_handle; out_handle = out_var_handle;
} else { } else {
send_recv_buffer = send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
VariableVisitor::GetMutableTensor(out_var).mutable_data( .Resize(in_tensor.dims())
out_var_handle->place_); .mutable_data(out_var_handle->place_);
} }
int type = platform::ToNCCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
broadcast_calls.emplace_back( broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] { [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE(platform::dynload::ncclBcast(
@ -102,23 +103,50 @@ void BroadcastOpHandle::RunImpl() {
}); });
} }
this->RunAndRecordEvent([&] { // FIXME(zcd): a temporary fix for some language model that has sparse
{ // parameter.
platform::NCCLGroupGuard guard; bool use_mutex = true;
for (auto &call : broadcast_calls) { if (in_var->IsType<paddle::framework::SelectedRows>()) {
call(); use_mutex = false;
}
if (use_mutex) {
this->RunAndRecordEvent([&] {
{
platform::NCCLGroupGuard guard;
for (auto &call : broadcast_calls) {
call();
}
} }
}
if (!out_handle->IsTheSameVar(*in_var_handle)) { if (!out_handle->IsTheSameVar(*in_var_handle)) {
auto out_var = var_scopes.at(in_var_handle->scope_idx_) auto out_var = var_scopes.at(in_var_handle->scope_idx_)
->FindVar(out_var_handles[0]->name_); ->FindVar(out_var_handles[0]->name_);
paddle::framework::TensorCopy( paddle::framework::TensorCopy(
in_tensor, in_var_handle->place_, in_tensor, in_var_handle->place_,
*(dev_ctxes_.at(in_var_handle->place_)), *(dev_ctxes_.at(in_var_handle->place_)),
&VariableVisitor::GetMutableTensor(out_var)); &VariableVisitor::GetMutableTensor(out_var));
} }
}); });
} else {
this->RunAndRecordEventNoMutex([&] {
{
platform::NCCLGroupGuard guard;
for (auto &call : broadcast_calls) {
call();
}
}
if (!out_handle->IsTheSameVar(*in_var_handle)) {
auto out_var = var_scopes.at(in_var_handle->scope_idx_)
->FindVar(out_var_handles[0]->name_);
paddle::framework::TensorCopy(
in_tensor, in_var_handle->place_,
*(dev_ctxes_.at(in_var_handle->place_)),
&VariableVisitor::GetMutableTensor(out_var));
}
});
}
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW("CUDA is not enabled.");
#endif #endif

@ -351,7 +351,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
auto &prev_grad = vars.back(); auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get()); op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p); auto var = new VarHandle(vars.size(), i, og, p);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
} }
@ -447,8 +447,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
op_handle->AddInput(prev_grad.get()); op_handle->AddInput(prev_grad.get());
} }
auto &vars = result->vars_[dst_dev_id][og]; auto &vars = result->vars_[dst_dev_id][og];
auto var = auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
return var; return var;

@ -139,6 +139,29 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#endif #endif
} }
// Runs `callback` and records this op's CUDA events without the locking done
// by RecordEvent (it delegates to CUDADeviceContext::RecordEventNoMutex).
// FIXME(zcd)-style temporary path for ops handling sparse parameters; see the
// matching declaration comment. Falls back to a plain callback() call when
// built without CUDA or when no events are registered.
void OpHandleBase::RunAndRecordEventNoMutex(
    const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
  if (!events_.empty()) {  // Use event
    // Wrap `callback` once per device context, inside-out: invoking the
    // outermost `method` runs the original callback exactly once, then
    // records this op's per-device event on every context in dev_ctxes_.
    std::function<void()> method = callback;
    for (auto &p : dev_ctxes_) {
      method = [method, p, this]() {
        // p.first is the place (keyed by CUDA device id into events_),
        // p.second the corresponding device context.
        static_cast<platform::CUDADeviceContext *>(p.second)
            ->RecordEventNoMutex(
                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
                method);
      };
    }
    method();
  } else {
#endif
    // No events registered (or non-CUDA build): just run the callback.
    callback();
#ifdef PADDLE_WITH_CUDA
  }
#endif
}
void OpHandleBase::RunAndRecordEvent(platform::Place p, void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) { const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA

@ -85,6 +85,10 @@ class OpHandleBase {
protected: protected:
void RunAndRecordEvent(const std::function<void()> &callback); void RunAndRecordEvent(const std::function<void()> &callback);
// FIXME(zcd): A temporary fix for some language model that has sparse
// parameter.
void RunAndRecordEventNoMutex(const std::function<void()> &callback);
void RunAndRecordEvent(platform::Place p, void RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback); const std::function<void()> &callback);

@ -80,7 +80,9 @@ void ReduceOpHandle::RunImpl() {
} }
if (pre_in_var->IsType<framework::SelectedRows>()) { if (pre_in_var->IsType<framework::SelectedRows>()) {
this->RunAndRecordEvent([&] { // FIXME(zcd): A temporary fix for some language model that has sparse
// parameter.
this->RunAndRecordEventNoMutex([&] {
std::vector<const SelectedRows *> in_selected_rows = std::vector<const SelectedRows *> in_selected_rows =
GetInputValues<SelectedRows>(in_var_handles, var_scopes); GetInputValues<SelectedRows>(in_var_handles, var_scopes);
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p, GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,

@ -106,6 +106,14 @@ class CUDADeviceContext : public DeviceContext {
PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
} }
// FIXME(zcd): A temporary fix for some language model that has sparse
// parameter.
template <typename Callback>
void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
private: private:
CUDAPlace place_; CUDAPlace place_;

Loading…
Cancel
Save