[Prepare for MultiProcess xpu] unified gen nccl id, refine imperative reducer (#30455)

WangXi 4 years ago committed by GitHub
parent 549855ac20
commit 572c466d19

@@ -16,8 +16,24 @@
#include "paddle/fluid/imperative/all_reduce.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <string>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace imperative {
static const platform::Place &GetVarPlace(const framework::Variable &src) {
if (src.IsType<framework::LoDTensor>()) {
return src.Get<framework::LoDTensor>().place();

@@ -16,21 +16,6 @@
#ifdef PADDLE_WITH_NCCL
#include <cuda.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <string>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class Variable;

File diff suppressed because it is too large

@@ -13,73 +13,20 @@
// limitations under the License.
#pragma once
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/imperative/parallel_context.h"
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
// TODO(shenliang03): support multi stream communication
int nrings_{1};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() {}
virtual void Init() = 0;
virtual void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst, int ring_id = 0,
bool use_calc_stream = false) = 0;
#if defined(PADDLE_WITH_NCCL)
virtual paddle::platform::CUDADeviceContext* GetDeviceContext(
int ring_id) = 0;
#endif
inline int GetNRings() { return strategy_.nrings_; }
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
#if defined(PADDLE_WITH_NCCL)
class NCCLParallelContext : public ParallelContext {
public:
@@ -87,7 +34,7 @@ class NCCLParallelContext : public ParallelContext {
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
~NCCLParallelContext() override = default;
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, int root); // NOLINT
@@ -97,14 +44,18 @@ class NCCLParallelContext : public ParallelContext {
framework::Variable* dst, int ring_id,
bool use_calc_stream) override;
paddle::platform::CUDADeviceContext* GetDeviceContext(int ring_id) override;
paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
void WaitCompute(int ring_id) override;
void WaitComm(int ring_id) override;
protected:
void RecvNCCLID(const std::string& endpoint,
std::vector<ncclUniqueId>& nccl_ids); // NOLINT
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std::vector<std::shared_ptr<platform::CudaEventObject>> compute_events_;
void SendNCCLID(const std::string& endpoint,
const std::vector<ncclUniqueId>& nccl_ids);
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
};
#endif
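
The compute_events_/comm_events_ members above exist purely for cross-stream ordering. A minimal sketch of that pattern with plain CUDA runtime calls (the helper name is illustrative, not the commit's actual code):

#include <cuda_runtime.h>

// Sketch: queue a wait so `waiting_stream` does not run past work already
// submitted to `waited_stream`. WaitCompute uses this as
// compute_stream-->event-->comm_stream[ring_id]; WaitComm swaps the streams.
inline void StreamWaitSketch(cudaStream_t waiting_stream,
                             cudaStream_t waited_stream,
                             cudaEvent_t event) {
  cudaEventRecord(event, waited_stream);          // mark the current point
  cudaStreamWaitEvent(waiting_stream, event, 0);  // non-blocking device-side wait
}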

@@ -0,0 +1,75 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
int nrings_{1};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() = default;
virtual void Init() = 0;
virtual void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst, int ring_id,
bool use_calc_stream) = 0;
virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0;
// comm_stream[ring_id] wait compute_stream.
// if CPU, should do nothing.
virtual void WaitCompute(int ring_id) = 0;
// compute_stream wait comm_stream[ring_id]
// if CPU, should do nothing.
virtual void WaitComm(int ring_id) = 0;
inline int GetNRings() const { return strategy_.nrings_; }
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
} // namespace imperative
} // namespace paddle
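
For a backend with no separate communication stream, the two waits above degenerate to no-ops, which is what the "if CPU, should do nothing" comments anticipate. A hypothetical subclass sketch (not part of this commit), assuming only the interface declared above plus DeviceContextPool from device_context.h:

// Hypothetical CPU-style context: no streams, so the waits do nothing.
class CPUParallelContextSketch : public ParallelContext {
 public:
  using ParallelContext::ParallelContext;
  void Init() override {}
  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override {
    // single-stream backend: reduce/copy synchronously; body omitted
  }
  platform::DeviceContext* GetDeviceContext(int ring_id) override {
    return platform::DeviceContextPool::Instance().Get(place_);
  }
  void WaitCompute(int ring_id) override {}  // nothing to order on CPU
  void WaitComm(int ring_id) override {}     // nothing to order on CPU
};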

File diff suppressed because it is too large

@@ -24,60 +24,27 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/imperative/variable_wrapper.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/cuda_resource_pool.h"
#endif
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
namespace imperative {
class ParallelContext;
class VarBase;
class VariableWrapper;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_NCCL)
template <typename T>
void ConcatTensorsForAllReduce(
const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& dense_tensors_,
framework::Variable* p_dense_contents) {
operators::math::ConcatFunctor<platform::CUDADeviceContext, T>
concat_functor_;
concat_functor_(context, dense_tensors_, 0,
p_dense_contents->GetMutable<framework::LoDTensor>());
}
template <typename T>
void SplitTensorsForAllReduce(const platform::CUDADeviceContext& context,
framework::Variable* p_dense_contents,
std::vector<framework::Tensor>* p_dense_tensors) {
auto* in = p_dense_contents->GetMutable<framework::LoDTensor>();
std::vector<framework::Tensor*> outs;
std::vector<const framework::Tensor*> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto& tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
// Sometimes direct copies will be faster
if (p_dense_tensors->size() < 10) {
operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
} else {
operators::math::SplitFunctor<platform::CUDADeviceContext, T>
split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
}
class Group {
public:
// Here, we use dense_contents_ & sparse_contents_ to
@@ -104,10 +71,10 @@ class Group {
framework::proto::VarType::Type dtype_;
// context is used to select the stream for concat
void ConcatTensors(const platform::CUDADeviceContext& context);
void ConcatTensors(const platform::DeviceContext& context);
// context is used to select the stream for split
void SplitTensors(const platform::CUDADeviceContext& context);
void SplitTensors(const platform::DeviceContext& context);
friend std::ostream& operator<<(std::ostream&, const Group&);
};
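
Since the two methods now take the base platform::DeviceContext, the place carried by the context decides the backend path. A plausible dispatch shape, with hypothetical helper names (the real bodies live in reducer.cc, whose diff is suppressed above):

void Group::ConcatTensors(const platform::DeviceContext& context) {
  if (platform::is_gpu_place(context.GetPlace())) {
#if defined(PADDLE_WITH_NCCL)
    // ConcatTensorsOnCUDA is a hypothetical name for the CUDA-typed path
    ConcatTensorsOnCUDA(
        static_cast<const platform::CUDADeviceContext&>(context), this);
#endif
  } else {
    // ConcatTensorsOnCPU is likewise hypothetical
    ConcatTensorsOnCPU(
        static_cast<const platform::CPUDeviceContext&>(context), this);
  }
}
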
@@ -155,8 +122,6 @@ class Reducer {
std::vector<std::vector<size_t>> RebuildGruops();
void CreateGroupEvents(int group_num);
inline bool NeedRebuildGroup() { return !has_rebuilt_group_; }
// Reducer Singleton
@@ -193,11 +158,6 @@ class Reducer {
std::shared_ptr<imperative::ParallelContext> parallel_ctx_;
std::vector<VariableLocator> variable_locators_;
// Following variables are to help sync stream
std::vector<std::shared_ptr<platform::CudaEventObject>> group_events_;
std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
cudaStream_t compute_stream_;
std::vector<cudaStream_t> comm_streams_;
int nrings_ = 1;
// Following variables are to help rebuild group

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread> // NOLINT
#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"

@@ -60,6 +60,109 @@ TEST(TestGroup, TestPrintGroupMessage) {
ASSERT_STREQ(stream2.str().c_str(), head.c_str());
}
template <typename T, typename Place>
void GroupConcatSplit(Place place, size_t size) {
platform::CPUPlace cpu_place;
Group group;
// [[0.0], [0.0, 1.0], [0.0, 1.0, 2.0] .. ]
std::vector<framework::Variable> vars;
vars.resize(size);
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
auto* tensor = vars[i].GetMutable<framework::LoDTensor>();
tensor->Resize({static_cast<int64_t>(len)});
auto* data = tensor->mutable_data<T>(place);
std::vector<T> value;
for (size_t j = 0; j < len; ++j) {
value.push_back(static_cast<T>(1.0 * j));
}
if (std::is_same<Place, platform::CUDAPlace>::value) {
paddle::memory::Copy(place, data, cpu_place, value.data(),
sizeof(T) * value.size(), 0);
} else {
paddle::memory::Copy(place, data, cpu_place, value.data(),
sizeof(T) * value.size());
}
framework::Tensor tmp;
tmp.ShareDataWith(*tensor).Resize({static_cast<int64_t>(len)});
group.dense_tensors_.push_back(std::move(tmp));
group.all_length_ += len;
group.dtype_ = tensor->type();
}
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
{ // concat
group.ConcatTensors(*dev_ctx);
auto* tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
framework::Tensor tmp;
framework::TensorCopySync(*tensor, cpu_place, &tmp);
auto* data = tmp.data<T>();
size_t offset = 0;
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
for (size_t j = 0; j < len; ++j) {
EXPECT_EQ(data[offset + j], static_cast<T>(1.0 * j));
// [[-0.0], [-0.0, -1.0], [-0.0, -1.0, -2.0] .. ]
data[offset + j] = -data[offset + j];
}
offset += len;
}
framework::TensorCopySync(tmp, place, tensor);
}
{ // split
group.SplitTensors(*dev_ctx);
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
auto& tensor = group.dense_tensors_[i];
framework::Tensor tmp;
framework::TensorCopySync(tensor, cpu_place, &tmp);
auto* data = tmp.data<T>();
for (size_t j = 0; j < len; ++j) {
EXPECT_EQ(data[j], static_cast<T>(-1.0 * j));
}
}
}
}
TEST(TestGroup, TestConcatSplit) {
platform::CUDAPlace cuda_place(0);
platform::CPUPlace cpu_place;
int size = 3;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<double>(cpu_place, size);
GroupConcatSplit<platform::float16>(cpu_place, size);
GroupConcatSplit<float>(cuda_place, size);
GroupConcatSplit<double>(cuda_place, size);
GroupConcatSplit<platform::float16>(cuda_place, size);
size = 15;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<double>(cpu_place, size);
GroupConcatSplit<platform::float16>(cpu_place, size);
GroupConcatSplit<float>(cuda_place, size);
GroupConcatSplit<double>(cuda_place, size);
GroupConcatSplit<platform::float16>(cuda_place, size);
}
TEST(TestGroup, TestConcatSplitException) {
platform::CUDAPinnedPlace place;
int size = 3;
ASSERT_ANY_THROW(GroupConcatSplit<float>(place, size));
}
#endif
} // namespace imperative

@@ -15,9 +15,8 @@ register_operators(EXCLUDES c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DE
if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
cc_library(gen_nccl_id_op_helper SRCS gen_nccl_id_op_helper.cc DEPS nccl_common)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_GLOO)

@@ -23,11 +23,32 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace paddle {
namespace operators {
static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
for (size_t i = 0; i < nccl_ids->size(); ++i) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
}
}
static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < nccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId));
}
}
class CGenNCCLIdOp : public framework::OperatorBase {
public:
CGenNCCLIdOp(const std::string& type,
@@ -45,14 +66,20 @@ class CGenNCCLIdOp : public framework::OperatorBase {
return Output("Out");
};
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(1);
if (rank == 0) {
GenNCCLID(&nccl_ids);
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
SendBroadCastNCCLID(endpoint_list, 1, func, local_scope);
platform::SendBroadCastCommID(endpoint_list, &nccl_ids);
} else {
std::string endpoint = Attr<std::string>("endpoint");
RecvBroadCastNCCLID(endpoint, 1, func, local_scope);
platform::RecvBroadCastCommID(endpoint, &nccl_ids);
}
CopyNCCLIDToVar(nccl_ids, func, scope);
scope.DeleteScope(&local_scope);
}
};

@@ -27,11 +27,32 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace paddle {
namespace operators {
static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
for (size_t i = 0; i < nccl_ids->size(); ++i) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
}
}
static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < nccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId));
}
}
class GenNCCLIdOp : public framework::OperatorBase {
public:
GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
@@ -98,19 +119,25 @@ class GenNCCLIdOp : public framework::OperatorBase {
<< ", trainers:" << ss.str();
int server_fd = -1;
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(nccl_comm_num);
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatNCCLVarName;
// broadcast unique id
if (trainer_id == 0) {
GenNCCLID(&nccl_ids);
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
trainers.end());
SendBroadCastNCCLID(flat_endpoints, nccl_comm_num, func, scope);
platform::SendBroadCastCommID(flat_endpoints, &nccl_ids);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
server_fd = platform::CreateListenSocket(endpoint);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
}
CopyNCCLIDToVar(nccl_ids, func, scope);
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterNCCLVarName;
@@ -127,10 +154,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastNCCLID(inter_endpoints, nccl_comm_num, func, scope);
GenNCCLID(&nccl_ids);
platform::SendBroadCastCommID(inter_endpoints, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
}
/// 3. hierarchical exter ncclid
@@ -146,15 +176,18 @@ class GenNCCLIdOp : public framework::OperatorBase {
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastNCCLID(exter_endpoints, nccl_comm_num, func, scope);
GenNCCLID(&nccl_ids);
platform::SendBroadCastCommID(exter_endpoints, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
platform::CloseSocket(server_fd);
}
}
};

@@ -101,7 +101,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)

@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
#include <netdb.h>
@@ -31,7 +32,7 @@ limitations under the License. */
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace operators {
namespace platform {
constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
@@ -257,26 +258,29 @@ static int ConnectAddr(const std::string& ep, const char* head) {
return sock;
}
static void RecvNCCLID(int conn, ncclUniqueId* nccl_id) {
template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) {
char buffer[1024] = {0};
static_assert(NCCL_UNIQUE_ID_BYTES <= 1024,
static_assert(sizeof(CommUniqueId) <= 1024,
"nccl id bytes must <= buffer size");
CHECK_SYS_CALL(SocketRecv(conn, buffer, NCCL_UNIQUE_ID_BYTES), "recv ncc id");
memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
CHECK_SYS_CALL(SocketRecv(conn, buffer, sizeof(CommUniqueId)),
"recv comm unique id");
memcpy(nccl_id, buffer, sizeof(CommUniqueId));
}
static void SendNCCLID(int conn, ncclUniqueId* nccl_id) {
template <typename CommUniqueId>
static void SendCommID(int conn, CommUniqueId* nccl_id) {
char buffer[1024] = {0};
memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
memcpy(buffer, nccl_id, sizeof(CommUniqueId));
CHECK_SYS_CALL(SocketSend(conn, buffer, NCCL_UNIQUE_ID_BYTES),
"send nccl id");
CHECK_SYS_CALL(SocketSend(conn, buffer, sizeof(CommUniqueId)),
"send comm unique id");
}
void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void SendBroadCastCommID(std::vector<std::string> servers,
std::vector<CommUniqueId>* nccl_ids) {
// connect with server
std::vector<int> connects;
for (auto server : servers) {
@@ -286,23 +290,13 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
}
VLOG(3) << "connecting completed...";
for (int i = 0; i < nccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(nccl_id));
for (size_t i = 0; i < nccl_ids->size(); ++i) {
int j = 0;
for (auto conn : connects) {
VLOG(3) << "sending nccl_id_var: " << var_name << " to " << servers[j]
<< " nccl_comm_no: " << i;
SendNCCLID(conn, nccl_id);
VLOG(3) << "sending comm_id to " << servers[j] << " nccl_comm_no: " << i;
SendCommID(conn, &(*nccl_ids)[i]);
++j;
}
VLOG(3) << "sending completed...";
}
// close client
@@ -311,34 +305,43 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
}
}
void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void RecvBroadCastCommID(std::string endpoint,
std::vector<CommUniqueId>* nccl_ids) {
int server = CreateListenSocket(endpoint);
RecvBroadCastNCCLID(server, endpoint, nccl_comm_num, func, scope);
RecvBroadCastCommID(server, endpoint, nccl_ids);
CloseSocket(server);
}
void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void RecvBroadCastCommID(int server_fd, std::string endpoint,
std::vector<CommUniqueId>* nccl_ids) {
int client = SocketAccept(server_fd, COMM_HEAD);
for (int i = 0; i < nccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
VLOG(3) << "trainer: " << endpoint << " receiving nccl_id_var: " << var_name
<< " from trainer 0, nccl_comm_no: " << i;
RecvNCCLID(client, nccl_id);
for (size_t i = 0; i < nccl_ids->size(); ++i) {
VLOG(3) << "trainer: " << endpoint
<< " receiving comm_id from trainer 0, nccl_comm_no: " << i;
RecvCommID(client, &(*nccl_ids)[i]);
}
VLOG(3) << "receiving completed...";
CloseSocket(client);
}
} // namespace operators
/// template instantiation
#define INSTANT_TEMPLATE(Type) \
template void SendBroadCastCommID<Type>(std::vector<std::string> servers, \
std::vector<Type> * nccl_ids); \
template void RecvBroadCastCommID<Type>(std::string endpoint, \
std::vector<Type> * nccl_ids);
#ifdef PADDLE_WITH_NCCL
INSTANT_TEMPLATE(ncclUniqueId)
#endif
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(bkclUniqueId)
#endif
} // namespace platform
} // namespace paddle
#endif
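
With the explicit instantiations above, the template definitions can stay in this .cc while callers include only the header declarations. A caller-side sketch assuming PADDLE_WITH_NCCL, mirroring the CGenNCCLIdOp flow shown earlier (rank, other_endpoints, and my_endpoint are illustrative variables):

// Sketch: rank 0 generates the ids and pushes them out; every other rank
// listens on its own endpoint and receives them.
std::vector<ncclUniqueId> ids(1);
if (rank == 0) {
  for (auto& id : ids) {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(&id));
  }
  platform::SendBroadCastCommID(other_endpoints, &ids);  // peers' endpoints
} else {
  platform::RecvBroadCastCommID(my_endpoint, &ids);  // this rank's endpoint
}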

@@ -14,35 +14,31 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NCCL
#include <functional>
#include <string>
#include <vector>
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
namespace platform {
int CreateListenSocket(const std::string& ep);
void CloseSocket(int fd);
void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
template <typename CommUniqueId>
void SendBroadCastCommID(std::vector<std::string> servers,
std::vector<CommUniqueId>* nccl_ids);
// server listen on endpoint, then recv nccl id
void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
template <typename CommUniqueId>
void RecvBroadCastCommID(std::string endpoint,
std::vector<CommUniqueId>* nccl_ids);
// recv nccl id from socket
void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
} // namespace operators
template <typename CommUniqueId>
void RecvBroadCastCommID(int server_fd, std::string endpoint,
std::vector<CommUniqueId>* nccl_ids);
} // namespace platform
} // namespace paddle
#endif

@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_NCCL
#pragma once
#ifdef PADDLE_WITH_NCCL
#include <stdio.h>
#include <memory>
#include <string>

@@ -14,10 +14,11 @@
import unittest
import os
import copy
from launch_function_helper import wait, _find_free_port
from multiprocessing import Pool, Process
from threading import Thread
os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10")
os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10,gen_comm_id*=10")
import paddle
from paddle.fluid import core
@@ -29,8 +30,8 @@ def run_gen_ncc_id(attr):
nccl_comm_num = attr['nccl_comm_num']
use_hallreduce = attr['use_hierarchical_allreduce']
startup_program = paddle.static.default_startup_program()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
nccl_id_var = startup_program.global_block().create_var(
@@ -60,8 +61,9 @@ def run_gen_ncc_id(attr):
attrs=attr)
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
scope = paddle.static.Scope()
with paddle.static.scope_guard(scope):
exe.run(startup_program)
@@ -97,16 +99,19 @@ class TestGenNcclIdOp(unittest.TestCase):
procs = []
for i in range(nranks):
attr['trainer_id'] = i
p = Process(target=run_gen_ncc_id, args=(attr, ))
# NOTE. multiprocessing cannot be covered by coverage
p = Thread(target=run_gen_ncc_id, args=(copy.copy(attr), ))
p.start()
procs.append(p)
wait(procs, timeout=120)
for p in procs:
p.join()
def test_flat(self):
print(">>> test gen flat nccl id")
self.gen_nccl_id(2)
print("<<< end test gen flat nccl id")
print()
def test_hierarchical(self):
print(">>> test gen hierarchical nccl id")
