[Prepare for MultiProcess xpu] unified gen nccl id, refine imperative reducer (#30455)

5 years ago · 572c466d19
parent 549855ac20
commit 572c466d19
17 changed files with 599 additions and 444 deletions
--- a/paddle/fluid/imperative/all_reduce.cc
+++ b/paddle/fluid/imperative/all_reduce.cc
@ -16,8 +16,24 @@
 #include "paddle/fluid/imperative/all_reduce.h"
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <nccl.h>
 #include <string>
 #include <utility>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/nccl_context.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace imperative {
 static const platform::Place &GetVarPlace(const framework::Variable &src) {
  if (src.IsType<framework::LoDTensor>()) {
    return src.Get<framework::LoDTensor>().place();
--- a/paddle/fluid/imperative/all_reduce.h
+++ b/paddle/fluid/imperative/all_reduce.h
@ -16,21 +16,6 @@
 #ifdef PADDLE_WITH_NCCL
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <nccl.h>
 #include <string>
 #include <utility>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/nccl_context.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace framework {
 class Variable;
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
--- a/paddle/fluid/imperative/nccl_context.h
+++ b/paddle/fluid/imperative/nccl_context.h
@ -13,73 +13,20 @@
 // limitations under the License.
 #pragma once
-// network header files
+#include <memory>
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include <arpa/inet.h>
 #include <netdb.h>
 #include <netinet/in.h>
 #include <stdlib.h>
 #include <sys/socket.h>
 #endif
 #include <string>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
 #if defined(PADDLE_WITH_NCCL)
-#include "paddle/fluid/imperative/all_reduce.h"
+#include "paddle/fluid/platform/cuda_resource_pool.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
 #include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace imperative {
 // Plain configuration record handed to ParallelContext's constructor.
 // Every field has an in-class default, so a default-constructed strategy
 // has one rank, local rank 0, no endpoints and one ring.
 struct ParallelStrategy {
  // Number of ranks; defaults to 1. (Presumably the number of participating
  // trainers -- confirm against the code that fills this in.)
  int nranks_{1};
  // Rank of this process; defaults to 0.
  int local_rank_{0};
  // Endpoint strings of the trainers; empty by default. The exact format is
  // not visible here (presumably "ip:port" -- TODO confirm).
  std::vector<std::string> trainer_endpoints_{};
  // Endpoint string of the current process; empty by default.
  std::string current_endpoint_{""};
  // TODO(shenliang03): support multi stream communication
  // Value returned by ParallelContext::GetNRings(); defaults to 1.
  int nrings_{1};
 };
 // Abstract base class for a parallel communication context. It stores a copy
 // of the strategy and the place it was constructed with; all communication
 // behaviour is supplied by subclasses through the pure-virtual methods.
 class ParallelContext {
 public:
  // Copies `strategy` and `place` into the context.
  explicit ParallelContext(const ParallelStrategy& strategy,
                           const platform::Place& place)
      : strategy_(strategy), place_(place) {}
  // Virtual so that subclasses can be destroyed through a base pointer.
  virtual ~ParallelContext() {}
  // Subclass-defined initialization.
  virtual void Init() = 0;
  // All-reduce from `src` into `dst`. `ring_id` defaults to 0 and
  // `use_calc_stream` defaults to false; their exact semantics are defined
  // by the implementing subclass.
  virtual void AllReduceByStream(const framework::Variable& src,
                                 framework::Variable* dst, int ring_id = 0,
                                 bool use_calc_stream = false) = 0;
 #if defined(PADDLE_WITH_NCCL)
  // Only declared in NCCL builds: returns the CUDA device context associated
  // with `ring_id`.
  virtual paddle::platform::CUDADeviceContext* GetDeviceContext(
      int ring_id) = 0;
 #endif
  // Returns the nrings_ value of the stored strategy.
  inline int GetNRings() { return strategy_.nrings_; }
 protected:
  // Copy of the strategy passed to the constructor.
  ParallelStrategy strategy_;
  // Copy of the place passed to the constructor.
  platform::Place place_;
 };
 #if defined(PADDLE_WITH_NCCL)
 class NCCLParallelContext : public ParallelContext {
 public:
@ -87,7 +34,7 @@ class NCCLParallelContext : public ParallelContext {
                               const platform::Place& place)
      : ParallelContext(strategy, place) {}
-  ~NCCLParallelContext() {}
+  ~NCCLParallelContext() override = default;
  void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, int root);  // NOLINT
@ -97,14 +44,18 @@ class NCCLParallelContext : public ParallelContext {
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override;
-  paddle::platform::CUDADeviceContext* GetDeviceContext(int ring_id) override;
+  paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
  void WaitCompute(int ring_id) override;
  void WaitComm(int ring_id) override;
- protected:
+ private:
-  void RecvNCCLID(const std::string& endpoint,
+  // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
-                  std::vector<ncclUniqueId>& nccl_ids);  // NOLINT
+  std::vector<std::shared_ptr<platform::CudaEventObject>> compute_events_;
-  void SendNCCLID(const std::string& endpoint,
+  // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
-                  const std::vector<ncclUniqueId>& nccl_ids);
+  std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
 };
 #endif
--- a/paddle/fluid/imperative/parallel_context.h
+++ b/paddle/fluid/imperative/parallel_context.h
@ -0,0 +1,75 @@
 // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace platform {
 class DeviceContext;
 }  // namespace platform
 namespace framework {
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 namespace paddle {
 namespace imperative {
 // Plain configuration record handed to ParallelContext's constructor.
 // Every field has an in-class default, so a default-constructed strategy
 // has one rank, local rank 0, no endpoints and one ring.
 struct ParallelStrategy {
  // Number of ranks; defaults to 1. (Presumably the number of participating
  // trainers -- confirm against the code that fills this in.)
  int nranks_{1};
  // Rank of this process; defaults to 0.
  int local_rank_{0};
  // Endpoint strings of the trainers; empty by default. The exact format is
  // not visible here (presumably "ip:port" -- TODO confirm).
  std::vector<std::string> trainer_endpoints_{};
  // Endpoint string of the current process; empty by default.
  std::string current_endpoint_{""};
  // Value returned by ParallelContext::GetNRings(); defaults to 1.
  int nrings_{1};
 };
 // Device-agnostic abstract base class for a parallel communication context.
 // It stores a copy of the strategy and the place it was constructed with;
 // initialization, all-reduce and stream synchronization are supplied by
 // subclasses through the pure-virtual methods below.
 class ParallelContext {
 public:
  // Copies `strategy` and `place` into the context.
  explicit ParallelContext(const ParallelStrategy& strategy,
                           const platform::Place& place)
      : strategy_(strategy), place_(place) {}
  // Virtual so that subclasses can be destroyed through a base pointer.
  virtual ~ParallelContext() = default;
  // Subclass-defined initialization.
  virtual void Init() = 0;
  // All-reduce from `src` into `dst` on ring `ring_id`. Neither `ring_id`
  // nor `use_calc_stream` has a default here, so callers pass both.
  virtual void AllReduceByStream(const framework::Variable& src,
                                 framework::Variable* dst, int ring_id,
                                 bool use_calc_stream) = 0;
  // Returns the device context associated with `ring_id`, typed as the
  // generic platform::DeviceContext.
  virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0;
  // comm_stream[ring_id] wait compute_stream.
  // if CPU, should do nothing.
  virtual void WaitCompute(int ring_id) = 0;
  // compute_stream wait comm_stream[ring_id]
  // if CPU, should do nothing.
  virtual void WaitComm(int ring_id) = 0;
  // Returns the nrings_ value of the stored strategy.
  inline int GetNRings() const { return strategy_.nrings_; }
 protected:
  // Copy of the strategy passed to the constructor.
  ParallelStrategy strategy_;
  // Copy of the place passed to the constructor.
  platform::Place place_;
 };
 }  //  namespace imperative
 }  //  namespace paddle
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@ -24,60 +24,27 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/imperative/op_base.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/variable_wrapper.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/string/string_helper.h"
-#if defined(PADDLE_WITH_NCCL)
+namespace paddle {
-#include "paddle/fluid/imperative/all_reduce.h"
+namespace platform {
-#include "paddle/fluid/operators/math/concat_and_split.h"
+class DeviceContext;
-#include "paddle/fluid/operators/strided_memcpy.h"
+}  // namespace platform
-#include "paddle/fluid/platform/cuda_resource_pool.h"
+
-#endif
+namespace imperative {
 class ParallelContext;
 class VarBase;
 class VariableWrapper;
 }  // namespace imperative
 }  // namespace paddle
 namespace paddle {
 namespace imperative {
 #if defined(PADDLE_WITH_NCCL)
 // Concatenates `dense_tensors_` along axis 0 into the LoDTensor held by
 // `p_dense_contents`. `context` is forwarded to the ConcatFunctor call.
 template <typename T>
 void ConcatTensorsForAllReduce(
    const platform::CUDADeviceContext& context,
    const std::vector<framework::Tensor>& dense_tensors_,
    framework::Variable* p_dense_contents) {
  auto* fused_output = p_dense_contents->GetMutable<framework::LoDTensor>();
  operators::math::ConcatFunctor<platform::CUDADeviceContext, T> concat;
  concat(context, dense_tensors_, 0, fused_output);
 }
 // Splits the LoDTensor held by `p_dense_contents` along axis 0 back into
 // the tensors of `p_dense_tensors`. Each destination tensor also serves as
 // its own shape reference for the split.
 template <typename T>
 void SplitTensorsForAllReduce(const platform::CUDADeviceContext& context,
                              framework::Variable* p_dense_contents,
                              std::vector<framework::Tensor>* p_dense_tensors) {
  auto* fused_input = p_dense_contents->GetMutable<framework::LoDTensor>();
  const size_t tensor_count = p_dense_tensors->size();

  std::vector<framework::Tensor*> destinations;
  std::vector<const framework::Tensor*> shape_references;
  destinations.reserve(tensor_count);
  shape_references.reserve(tensor_count);
  for (auto& dense_tensor : *p_dense_tensors) {
    destinations.emplace_back(&dense_tensor);
    shape_references.emplace_back(&dense_tensor);
  }

  // With 10 or more tensors the SplitFunctor is used.
  if (tensor_count >= 10) {
    operators::math::SplitFunctor<platform::CUDADeviceContext, T> splitter;
    splitter(context, *fused_input, shape_references, 0, &destinations);
    return;
  }
  // Sometimes direct copies will be faster
  operators::StridedMemcpyWithAxis0<T>(context, *fused_input,
                                       shape_references, &destinations);
 }
 class Group {
 public:
  // Here, we use dense_contents_ & sparse_contents_ to
@ -104,10 +71,10 @@ class Group {
  framework::proto::VarType::Type dtype_;
  // context is used to select the stream for concat
-  void ConcatTensors(const platform::CUDADeviceContext& context);
+  void ConcatTensors(const platform::DeviceContext& context);
  // context is used to select the stream for split
-  void SplitTensors(const platform::CUDADeviceContext& context);
+  void SplitTensors(const platform::DeviceContext& context);
  friend std::ostream& operator<<(std::ostream&, const Group&);
 };
@ -155,8 +122,6 @@ class Reducer {
  std::vector<std::vector<size_t>> RebuildGruops();
  void CreateGroupEvents(int group_num);
  inline bool NeedRebuildGroup() { return !has_rebuilt_group_; }
  // Reducer Singleton
@ -193,11 +158,6 @@ class Reducer {
  std::shared_ptr<imperative::ParallelContext> parallel_ctx_;
  std::vector<VariableLocator> variable_locators_;
  // Following variables are to help sync stream
  std::vector<std::shared_ptr<platform::CudaEventObject>> group_events_;
  std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
  cudaStream_t compute_stream_;
  std::vector<cudaStream_t> comm_streams_;
  int nrings_ = 1;
  // Following variables are to help rebuild group
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <thread>  // NOLINT
 #include "paddle/fluid/imperative/nccl_context.h"
 #include "gtest/gtest.h"
--- a/paddle/fluid/imperative/tests/test_group.cc
+++ b/paddle/fluid/imperative/tests/test_group.cc
@ -60,6 +60,109 @@ TEST(TestGroup, TestPrintGroupMessage) {
  ASSERT_STREQ(stream2.str().c_str(), head.c_str());
 }
 // Round-trip check for Group::ConcatTensors / Group::SplitTensors.
 //
 // Builds `size` one-dimensional tensors on `place` (tensor i has i + 1
 // elements holding 0, 1, ..., i), concatenates them through the group,
 // verifies the fused buffer, negates it, writes it back, splits it and
 // verifies that every original tensor now holds the negated values.
 //
 // T     - element type of the tensors.
 // Place - place type the tensors are allocated on; the device context is
 //         looked up from DeviceContextPool with this place.
 template <typename T, typename Place>
 void GroupConcatSplit(Place place, size_t size) {
  platform::CPUPlace cpu_place;
  Group group;
  // [[0.0], [0.0, 1.0], [0.0, 1.0, 2.0] .. ]
  std::vector<framework::Variable> vars;
  vars.resize(size);
  for (size_t i = 0; i < size; ++i) {
    auto len = i + 1;
    auto* tensor = vars[i].GetMutable<framework::LoDTensor>();
    tensor->Resize({static_cast<int64_t>(len)});
    auto* data = tensor->mutable_data<T>(place);
    // Host-side staging buffer with the expected values 0 .. len - 1.
    std::vector<T> value;
    for (size_t j = 0; j < len; ++j) {
      value.push_back(static_cast<T>(1.0 * j));
    }
    // The CUDAPlace copy takes one extra trailing argument (0) -- presumably
    // the stream; confirm against memory::Copy's overloads.
    if (std::is_same<Place, platform::CUDAPlace>::value) {
      paddle::memory::Copy(place, data, cpu_place, value.data(),
                           sizeof(T) * value.size(), 0);
    } else {
      paddle::memory::Copy(place, data, cpu_place, value.data(),
                           sizeof(T) * value.size());
    }
    // Register a tensor sharing the variable's data with the group and keep
    // the group's total length and dtype up to date.
    framework::Tensor tmp;
    tmp.ShareDataWith(*tensor).Resize({static_cast<int64_t>(len)});
    group.dense_tensors_.push_back(std::move(tmp));
    group.all_length_ += len;
    group.dtype_ = tensor->type();
  }
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);
  {  // concat
    group.ConcatTensors(*dev_ctx);
    auto* tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
    // Bring the fused buffer to the CPU so it can be inspected and edited.
    framework::Tensor tmp;
    framework::TensorCopySync(*tensor, cpu_place, &tmp);
    auto* data = tmp.data<T>();
    size_t offset = 0;
    for (size_t i = 0; i < size; ++i) {
      auto len = i + 1;
      for (size_t j = 0; j < len; ++j) {
        EXPECT_EQ(data[offset + j], static_cast<T>(1.0 * j));
        // [[-0.0], [-0.0, -1.0], [-0.0, -1.0, -2.0] .. ]
        data[offset + j] = -data[offset + j];
      }
      offset += len;
    }
    // Write the negated values back into the group's fused buffer so the
    // split below has something new to distribute.
    framework::TensorCopySync(tmp, place, tensor);
  }
  {  // split
    group.SplitTensors(*dev_ctx);
    for (size_t i = 0; i < size; ++i) {
      auto len = i + 1;
      auto& tensor = group.dense_tensors_[i];
      framework::Tensor tmp;
      framework::TensorCopySync(tensor, cpu_place, &tmp);
      auto* data = tmp.data<T>();
      for (size_t j = 0; j < len; ++j) {
        EXPECT_EQ(data[j], static_cast<T>(-1.0 * j));
      }
    }
  }
 }
 // Runs the concat/split round trip on the CPU place and on CUDA device 0
 // for float, double and float16, first with 3 tensors and then with 15.
 TEST(TestGroup, TestConcatSplit) {
  platform::CUDAPlace cuda_place(0);
  platform::CPUPlace cpu_place;

  const int tensor_counts[] = {3, 15};
  for (int size : tensor_counts) {
    GroupConcatSplit<float>(cpu_place, size);
    GroupConcatSplit<double>(cpu_place, size);
    GroupConcatSplit<platform::float16>(cpu_place, size);

    GroupConcatSplit<float>(cuda_place, size);
    GroupConcatSplit<double>(cuda_place, size);
    GroupConcatSplit<platform::float16>(cuda_place, size);
  }
 }
 // Expects the concat/split round trip to throw when the tensors are placed
 // on CUDAPinnedPlace (3 tensors, float elements).
 TEST(TestGroup, TestConcatSplitException) {
  platform::CUDAPinnedPlace place;
  int size = 3;
  ASSERT_ANY_THROW(GroupConcatSplit<float>(place, size));
 }
 #endif
 }  // namespace imperative
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@ -15,9 +15,8 @@ register_operators(EXCLUDES c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DE
 if(WITH_NCCL)
    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
-    cc_library(gen_nccl_id_op_helper SRCS gen_nccl_id_op_helper.cc DEPS nccl_common)
+    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
-    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
+    op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
    op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
 endif()
 if(WITH_GLOO)
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@ -23,11 +23,32 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
 namespace paddle {
 namespace operators {
 static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
  for (size_t i = 0; i < nccl_ids->size(); ++i) {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
  }
 }
 // Copies nccl_ids[i] into the scope variable named func(i), for every i.
 // The variable is looked up with scope.FindVar and enforced to be non-null
 // (NotFound error otherwise) before its ncclUniqueId payload is overwritten.
 static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
                            std::function<std::string(size_t)> func,
                            const framework::Scope& scope) {
  size_t index = 0;
  for (const auto& source_id : nccl_ids) {
    std::string var_name = func(index);
    ++index;
    auto* target_var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        target_var,
        platform::errors::NotFound("Variable with name %s is not found",
                                   var_name.c_str()));
    auto* target_id = target_var->GetMutable<ncclUniqueId>();
    memcpy(target_id, &source_id, sizeof(ncclUniqueId));
  }
 }
 class CGenNCCLIdOp : public framework::OperatorBase {
 public:
  CGenNCCLIdOp(const std::string& type,
@ -45,14 +66,20 @@ class CGenNCCLIdOp : public framework::OperatorBase {
      return Output("Out");
    };
    std::vector<ncclUniqueId> nccl_ids;
    nccl_ids.resize(1);
    if (rank == 0) {
      GenNCCLID(&nccl_ids);
      std::vector<std::string> endpoint_list =
          Attr<std::vector<std::string>>("other_endpoints");
-      SendBroadCastNCCLID(endpoint_list, 1, func, local_scope);
+      platform::SendBroadCastCommID(endpoint_list, &nccl_ids);
    } else {
      std::string endpoint = Attr<std::string>("endpoint");
-      RecvBroadCastNCCLID(endpoint, 1, func, local_scope);
+      platform::RecvBroadCastCommID(endpoint, &nccl_ids);
    }
    CopyNCCLIDToVar(nccl_ids, func, scope);
    scope.DeleteScope(&local_scope);
  }
 };
--- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc
@ -27,11 +27,32 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
-#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
 namespace paddle {
 namespace operators {
 static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
  for (size_t i = 0; i < nccl_ids->size(); ++i) {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
  }
 }
 // Copies nccl_ids[i] into the scope variable named func(i), for every i.
 // The variable is looked up with scope.FindVar and enforced to be non-null
 // (NotFound error otherwise) before its ncclUniqueId payload is overwritten.
 static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
                            std::function<std::string(size_t)> func,
                            const framework::Scope& scope) {
  size_t index = 0;
  for (const auto& source_id : nccl_ids) {
    std::string var_name = func(index);
    ++index;
    auto* target_var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        target_var,
        platform::errors::NotFound("Variable with name %s is not found",
                                   var_name.c_str()));
    auto* target_id = target_var->GetMutable<ncclUniqueId>();
    memcpy(target_id, &source_id, sizeof(ncclUniqueId));
  }
 }
 class GenNCCLIdOp : public framework::OperatorBase {
 public:
  GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
@ -98,19 +119,25 @@ class GenNCCLIdOp : public framework::OperatorBase {
            << ", trainers:" << ss.str();
    int server_fd = -1;
    std::vector<ncclUniqueId> nccl_ids;
    nccl_ids.resize(nccl_comm_num);
    /// 1. init flat
    std::function<std::string(size_t)> func = platform::GetFlatNCCLVarName;
    // broadcast unique id
    if (trainer_id == 0) {
      GenNCCLID(&nccl_ids);
      // server endpoints
      std::vector<std::string> flat_endpoints;
      flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
                            trainers.end());
-      SendBroadCastNCCLID(flat_endpoints, nccl_comm_num, func, scope);
+      platform::SendBroadCastCommID(flat_endpoints, &nccl_ids);
    } else {
-      server_fd = CreateListenSocket(endpoint);
+      server_fd = platform::CreateListenSocket(endpoint);
-      RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
+      platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
    }
    CopyNCCLIDToVar(nccl_ids, func, scope);
    /// 2. hierarchical inter ncclid
    func = platform::GetHierarchicalInterNCCLVarName;
@ -127,10 +154,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
      }
      VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
-      SendBroadCastNCCLID(inter_endpoints, nccl_comm_num, func, scope);
+      GenNCCLID(&nccl_ids);
      platform::SendBroadCastCommID(inter_endpoints, &nccl_ids);
      CopyNCCLIDToVar(nccl_ids, func, scope);
    } else if (inter_trainer_id > 0) {
      VLOG(1) << "Hierarchical inter ring";
-      RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
+      platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
      CopyNCCLIDToVar(nccl_ids, func, scope);
    }
    /// 3. hierarchical exter ncclid
@ -146,15 +176,18 @@ class GenNCCLIdOp : public framework::OperatorBase {
      }
      VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
-      SendBroadCastNCCLID(exter_endpoints, nccl_comm_num, func, scope);
+      GenNCCLID(&nccl_ids);
      platform::SendBroadCastCommID(exter_endpoints, &nccl_ids);
      CopyNCCLIDToVar(nccl_ids, func, scope);
    } else if (exter_trainer_id > 0) {
      VLOG(1) << "Hierarchical exter ring";
-      RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
+      platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
      CopyNCCLIDToVar(nccl_ids, func, scope);
    }
    // close socket server
    if (trainer_id != 0) {
-      CloseSocket(server_fd);
+      platform::CloseSocket(server_fd);
    }
  }
 };
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -101,7 +101,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
-cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto  device_context enforce)
+cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto  device_context enforce)
 if(WITH_GPU)
    cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
--- a/paddle/fluid/operators/collective/gen_nccl_id_op_helper.cc
+++ b/paddle/fluid/operators/collective/gen_nccl_id_op_helper.cc
@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include <arpa/inet.h>
 #include <netdb.h>
@ -31,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/string/split.h"
 namespace paddle {
-namespace operators {
+namespace platform {
 constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
@ -257,26 +258,29 @@ static int ConnectAddr(const std::string& ep, const char* head) {
  return sock;
 }
-static void RecvNCCLID(int conn, ncclUniqueId* nccl_id) {
+template <typename CommUniqueId>
 static void RecvCommID(int conn, CommUniqueId* nccl_id) {
  char buffer[1024] = {0};
-  static_assert(NCCL_UNIQUE_ID_BYTES <= 1024,
+  static_assert(sizeof(CommUniqueId) <= 1024,
                "nccl id bytes must <= buffer size");
-  CHECK_SYS_CALL(SocketRecv(conn, buffer, NCCL_UNIQUE_ID_BYTES), "recv ncc id");
+  CHECK_SYS_CALL(SocketRecv(conn, buffer, sizeof(CommUniqueId)),
-  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
+                 "recv comm unique id");
  memcpy(nccl_id, buffer, sizeof(CommUniqueId));
 }
-static void SendNCCLID(int conn, ncclUniqueId* nccl_id) {
+template <typename CommUniqueId>
 static void SendCommID(int conn, CommUniqueId* nccl_id) {
  char buffer[1024] = {0};
-  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
+  memcpy(buffer, nccl_id, sizeof(CommUniqueId));
-  CHECK_SYS_CALL(SocketSend(conn, buffer, NCCL_UNIQUE_ID_BYTES),
+  CHECK_SYS_CALL(SocketSend(conn, buffer, sizeof(CommUniqueId)),
-                 "send nccl id");
+                 "send comm unique id");
 }
-void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
+template <typename CommUniqueId>
-                         std::function<std::string(size_t)> func,
+void SendBroadCastCommID(std::vector<std::string> servers,
-                         const framework::Scope& scope) {
+                         std::vector<CommUniqueId>* nccl_ids) {
  // connect with server
  std::vector<int> connects;
  for (auto server : servers) {
@ -286,23 +290,13 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
  }
  VLOG(3) << "connecting completed...";
-  for (int i = 0; i < nccl_comm_num; ++i) {
+  for (size_t i = 0; i < nccl_ids->size(); ++i) {
    std::string var_name = func(i);
    auto var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        var, platform::errors::NotFound("Variable with name %s is not found",
                                        var_name.c_str()));
    auto nccl_id = var->GetMutable<ncclUniqueId>();
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(nccl_id));
    int j = 0;
    for (auto conn : connects) {
-      VLOG(3) << "sending nccl_id_var: " << var_name << " to " << servers[j]
+      VLOG(3) << "sending comm_id to " << servers[j] << " nccl_comm_no: " << i;
-              << " nccl_comm_no: " << i;
+      SendCommID(conn, &(*nccl_ids)[i]);
      SendNCCLID(conn, nccl_id);
      ++j;
    }
    VLOG(3) << "sending completed...";
  }
  // close client
@ -311,34 +305,43 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
  }
 }
-void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
+template <typename CommUniqueId>
-                         std::function<std::string(size_t)> func,
+void RecvBroadCastCommID(std::string endpoint,
-                         const framework::Scope& scope) {
+                         std::vector<CommUniqueId>* nccl_ids) {
  int server = CreateListenSocket(endpoint);
-  RecvBroadCastNCCLID(server, endpoint, nccl_comm_num, func, scope);
+  RecvBroadCastCommID(server, endpoint, nccl_ids);
  CloseSocket(server);
 }
-void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
+template <typename CommUniqueId>
-                         std::function<std::string(size_t)> func,
+void RecvBroadCastCommID(int server_fd, std::string endpoint,
-                         const framework::Scope& scope) {
+                         std::vector<CommUniqueId>* nccl_ids) {
  int client = SocketAccept(server_fd, COMM_HEAD);
-  for (int i = 0; i < nccl_comm_num; ++i) {
+  for (size_t i = 0; i < nccl_ids->size(); ++i) {
-    std::string var_name = func(i);
+    VLOG(3) << "trainer: " << endpoint
-    auto var = scope.FindVar(var_name);
+            << " receiving comm_id from trainer 0, nccl_comm_no: " << i;
-    PADDLE_ENFORCE_NOT_NULL(
+    RecvCommID(client, &(*nccl_ids)[i]);
        var, platform::errors::NotFound("Variable with name %s is not found",
                                        var_name.c_str()));
    auto nccl_id = var->GetMutable<ncclUniqueId>();
    VLOG(3) << "trainer: " << endpoint << " receiving nccl_id_var: " << var_name
            << " from trainer 0, nccl_comm_no: " << i;
    RecvNCCLID(client, nccl_id);
  }
  VLOG(3) << "receiving completed...";
  CloseSocket(client);
 }
-}  // namespace operators
+/// template instantiation
 // Explicitly instantiates, for one communicator id type, every function
 // template that callers outside this translation unit use:
 //   - SendBroadCastCommID(servers, ids)
 //   - RecvBroadCastCommID(endpoint, ids)
 //   - RecvBroadCastCommID(server_fd, endpoint, ids)
 // The template definitions live in this source file only, so an overload
 // missing from this list has no guaranteed symbol for other translation
 // units to link against. The server_fd overload is called directly by
 // callers that manage their own listen socket, so it must be listed too
 // rather than relying on the implicit instantiation triggered by the
 // endpoint overload.
 #define INSTANT_TEMPLATE(Type)                                                 \
  template void SendBroadCastCommID<Type>(std::vector<std::string> servers,    \
                                          std::vector<Type> * nccl_ids);       \
  template void RecvBroadCastCommID<Type>(std::string endpoint,                \
                                          std::vector<Type> * nccl_ids);       \
  template void RecvBroadCastCommID<Type>(int server_fd, std::string endpoint, \
                                          std::vector<Type> * nccl_ids);
 #ifdef PADDLE_WITH_NCCL
 INSTANT_TEMPLATE(ncclUniqueId)
 #endif
 #ifdef PADDLE_WITH_XPU_BKCL
 INSTANT_TEMPLATE(bkclUniqueId)
 #endif
 }  // namespace platform
 }  // namespace paddle
 #endif
--- a/paddle/fluid/operators/collective/gen_nccl_id_op_helper.h
+++ b/paddle/fluid/operators/collective/gen_nccl_id_op_helper.h
@ -14,35 +14,31 @@ limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_NCCL
 #include <functional>
 #include <string>
 #include <vector>
 namespace paddle {
-namespace framework {
+namespace platform {
 class Scope;
 }  // namespace framework
 }  // namespace paddle
 namespace paddle {
 namespace operators {
 int CreateListenSocket(const std::string& ep);
 void CloseSocket(int fd);
-void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
+template <typename CommUniqueId>
-                         std::function<std::string(size_t)> func,
+void SendBroadCastCommID(std::vector<std::string> servers,
-                         const framework::Scope& scope);
+                         std::vector<CommUniqueId>* nccl_ids);
-// server listen on endpoint, then recv nccl id
+template <typename CommUniqueId>
-void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
+void RecvBroadCastCommID(std::string endpoint,
-                         std::function<std::string(size_t)> func,
+                         std::vector<CommUniqueId>* nccl_ids);
                         const framework::Scope& scope);
 // recv nccl id from socket
-void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
+template <typename CommUniqueId>
-                         std::function<std::string(size_t)> func,
+void RecvBroadCastCommID(int server_fd, std::string endpoint,
-                         const framework::Scope& scope);
+                         std::vector<CommUniqueId>* nccl_ids);
-}  // namespace operators
+}  // namespace platform
 }  // namespace paddle
 #endif
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #ifdef PADDLE_WITH_NCCL
 #pragma once
 #ifdef PADDLE_WITH_NCCL
 #include <stdio.h>
 #include <memory>
 #include <string>
--- a/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py
@ -14,10 +14,11 @@
 import unittest
 import os
 import copy
 from launch_function_helper import wait, _find_free_port
-from multiprocessing import Pool, Process
+from threading import Thread
-os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10")
+os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10,gen_comm_id*=10")
 import paddle
 from paddle.fluid import core
@ -29,8 +30,8 @@ def run_gen_ncc_id(attr):
    nccl_comm_num = attr['nccl_comm_num']
    use_hallreduce = attr['use_hierarchical_allreduce']
-    startup_program = paddle.static.default_startup_program()
+    startup_program = paddle.static.Program()
-    main_program = paddle.static.default_main_program()
+    main_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        nccl_id_var = startup_program.global_block().create_var(
@ -60,9 +61,10 @@ def run_gen_ncc_id(attr):
            attrs=attr)
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
-    exe.run(startup_program)
+    scope = paddle.static.Scope()
    with paddle.static.scope_guard(scope):
        exe.run(startup_program)
 class TestGenNcclIdOp(unittest.TestCase):
@ -97,16 +99,19 @@ class TestGenNcclIdOp(unittest.TestCase):
        procs = []
        for i in range(nranks):
            attr['trainer_id'] = i
-            p = Process(target=run_gen_ncc_id, args=(attr, ))
+            # NOTE. multiprocessing cannot be covered by coverage
            p = Thread(target=run_gen_ncc_id, args=(copy.copy(attr), ))
            p.start()
            procs.append(p)
-        wait(procs, timeout=120)
+        for p in procs:
            p.join()
    def test_flat(self):
        print(">>> test gen flat nccl id")
        self.gen_nccl_id(2)
        print("<<< end test gen flat nccl id")
        print()
    def test_hierarchical(self):
        print(">>> test gen hierarchical nccl id")