@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
       auto &dims = main_tensor.dims();
       if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+        std::vector<void *> buffers;
         size_t numel = main_tensor.numel();
         ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-        platform::NCCLGroupGuard guard;
         for (size_t i = 0; i < member_->places_.size(); ++i) {
           auto place = member_->places_[i];
           void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
             t->Resize(dims);
             buffer = t->mutable_data(place, main_tensor.type());
           }
-          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
+          buffers.push_back(buffer);
         }
-        member_->nccl_ctxs_->WaitAll();
+
+        PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                          "variables' buffer size to bcast NOT equal to places");
+
+        {
+          platform::NCCLGroupGuard guard;
+          for (size_t i = 0; i < member_->places_.size(); ++i) {
+            auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+            platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                         nccl_ctx.comm_, nccl_ctx.stream());
+          }
+          member_->nccl_ctxs_->WaitAll();
+        }
 #else
       PADDLE_THROW("Not compiled with CUDA");
 #endif
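
In short: instead of launching each `ncclBcast` as its destination buffer is allocated, the diff first collects every per-device buffer, checks (via `PADDLE_ENFORCE_EQ`) that one buffer was gathered per place, and only then issues all broadcasts inside a single `NCCLGroupGuard` scope, waiting once at the end. Below is a minimal standalone sketch of that grouped-broadcast pattern, assuming `NCCLGroupGuard` simply pairs `ncclGroupStart()` with `ncclGroupEnd()`; the function name `BcastToAllDevices` and the raw comm/stream vectors are illustrative, not Paddle's API.

```cpp
// Hypothetical sketch of the grouped-broadcast pattern the diff adopts
// (not the Paddle implementation itself).
#include <cuda_runtime.h>
#include <nccl.h>

#include <cstddef>
#include <vector>

void BcastToAllDevices(const std::vector<void *> &buffers,  // one per device
                       size_t numel, ncclDataType_t dtype,
                       const std::vector<ncclComm_t> &comms,
                       const std::vector<cudaStream_t> &streams) {
  ncclGroupStart();  // ~ what NCCLGroupGuard's constructor is assumed to do
  for (size_t i = 0; i < buffers.size(); ++i) {
    // root = 0: device 0 already holds the source parameter tensor.
    ncclBcast(buffers[i], numel, dtype, /*root=*/0, comms[i], streams[i]);
  }
  ncclGroupEnd();  // ~ guard's destructor: launch the whole group at once
}
```

The point of the restructuring is that host-side work such as `mutable_data` allocation no longer happens between collective launches: all buffers exist before the group opens, so the `ncclBcast` calls can be fused and synchronized with a single `WaitAll`.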