add allreduce and broadcast without test (#31024)

add allreduce and broadcast without test
5 years ago · 9fcdaeba5e
parent 5618f14047
commit 9fcdaeba5e
29 changed files with 1895 additions and 19 deletions
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@ -62,6 +62,7 @@ endif()
 if(WITH_ASCEND_CL)
  set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)

+  set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
  set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
  set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
  set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
@ -73,6 +74,9 @@ if(WITH_ASCEND_CL)
  ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
  SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})

+  ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
+
  ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
  SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
  add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -205,8 +205,11 @@ elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
    SET(PROTOBUF_REPOSITORY  https://gitee.com/tianjianhe/protobuf.git)
    SET(PROTOBUF_TAG         v3.8.0)
 else()
-    SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
-    SET(PROTOBUF_TAG         9f75c5aa851cd877fb0d93ccc31b8567a6706546)
+    SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
+    SET(PROTOBUF_TAG         9f75c5aa851cd877fb0d93ccc31b8567a6706546)
 endif()

    cache_third_party(${TARGET_NAME}
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -151,6 +151,8 @@ set(COMMON_FLAGS
    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
+    -Wno-error=nonnull-compare # Warning in boost gcc 7.2
+    -Wno-error=address # Warning in boost gcc 7.2
    ${fsanitize}
 )

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@ -79,6 +79,7 @@ class AllocatorFacadePrivate {
        InitNaiveBestFitCUDAPinnedAllocator();
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
+        VLOG(3) << "npu num: " <<platform::GetNPUDeviceCount();
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
@ -141,6 +142,7 @@ class AllocatorFacadePrivate {
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : allocators_)
                  : zero_size_allocators_);
+        VLOG(3) <<size;
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter, allocators.end(),
                      platform::errors::NotFound(
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@ -24,6 +24,7 @@ namespace memory {
 namespace allocation {

 static AllocatorStrategy GetStrategyFromFlag() {
+  VLOG(3) << "FLAGS_allocator_strategy" << FLAGS_allocator_strategy;
  if (FLAGS_allocator_strategy == "naive_best_fit") {
    return AllocatorStrategy::kNaiveBestFit;
  }
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@ -11,24 +11,29 @@ foreach(src ${OPS})
    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
 endforeach()

-register_operators(EXCLUDES c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})

 if(WITH_NCCL)
    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
-    cc_library(gen_nccl_id_op_helper SRCS gen_nccl_id_op_helper.cc DEPS nccl_common)
-    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
-    op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
+    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+    op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()

-if(WITH_ASCEND)
-    op_library(gen_nccl_id_op)
-    op_library(c_gen_nccl_id_op)
+if(WITH_GLOO)
+    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
 endif()

+if(WITH_XPU_BKCL)
+    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+    op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
+    op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
+endif()

-if(WITH_GLOO)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
+if(WITH_ASCEND_CL)
+    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
 endif()

 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
+
+# The HCCL operator test links against ascend_hccl, an imported target that is
+# only created when WITH_ASCEND_CL is enabled, so register the test only then.
+if(WITH_ASCEND_CL)
+    cc_test(c_hcom_op_npu_test SRCS c_hcom_op_npu_test.cc
+        DEPS op_registry c_broadcast_op c_allreduce_sum_op c_comm_init_hccl_op
+             c_create_group_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader
+             dynload_warpctc scope device_context enforce executor)
+endif()
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the NPU kernels of c_allreduce_max, one per element type.
+REGISTER_OP_NPU_KERNEL(c_allreduce_max,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the NPU kernels of c_allreduce_min, one per element type.
+REGISTER_OP_NPU_KERNEL(c_allreduce_min,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@ -30,6 +30,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {

@ -105,6 +110,88 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
  }
 };

+// NPU (Ascend) kernel shared by the c_allreduce_* operators.
+//
+// Reads the "X" input tensor, resizes "Out" to the same dims and launches
+// hcom_all_reduce with the reduction selected by `red_type`. The call is
+// issued on the device context's stream when the "use_calc_stream" attribute
+// is true, otherwise on the stream of the HCCL communicator.
+//
+// Throws InvalidArgument for an unknown `red_type` and PreconditionNotMet when
+// Paddle is built without PADDLE_WITH_ASCEND_CL.
+template <ReduceType red_type, typename T>
+class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto in = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+
+    auto place = ctx.GetPlace();
+    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+    int64_t numel = in->numel();
+
+    // hcom_all_reduce is handed a mutable void* for the send buffer, hence
+    // the const_cast on the input data pointer.
+    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    std::string tag = ctx.Attr<std::string>("tag");
+    int ring_id = ctx.Attr<int>("ring_id");
+    // The group name is derived from the ring id, the same way the
+    // c_broadcast NPU kernel builds it.
+    std::string group =
+        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+
+    auto comm = paddle::platform::HCCLCommContext::Instance().Get();
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    // Map Paddle's reduce type onto the HCCL reduction enum.
+    hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
+    switch (red_type) {
+      case kRedSum:
+        hccl_red_type = HCCL_REP_OP_SUM;
+        break;
+
+      case kRedMax:
+        hccl_red_type = HCCL_REP_OP_MAX;
+        break;
+
+      case kRedMin:
+        hccl_red_type = HCCL_REP_OP_MIN;
+        break;
+
+      case kRedProd:
+        hccl_red_type = HCCL_REP_OP_PROD;
+        break;
+
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid reduce type: %d", red_type));
+    }
+
+    VLOG(3) << "begin hccl allreduce, parameter is: "
+            << "input num: " << numel << ", dtype: " << dtype
+            << ", hccl_red_type: " << hccl_red_type
+            << ", group is: " << group << ", tag is " << tag
+            << ", sendbuff: " << sendbuff << ", recvbuff: " << recvbuff;
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
+        tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type,
+        group.c_str(), static_cast<void*>(stream)));
+
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
 template <ReduceType red_type, typename T>
 class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 public:
@ -114,7 +201,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
    auto out = ctx.Output<framework::Tensor>("Out");

    auto place = ctx.GetPlace();
-    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
    int64_t numel = in->numel();
    const void* sendbuff = in->data<void>();
    out->Resize(in->dims());
@ -170,6 +257,11 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(Tensor) the allreduced result.");
    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
        .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    // The NPU kernel forwards this attribute to hcom_all_reduce as its tag,
+    // so it is only declared for Ascend builds.
+    AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
+        .SetDefault("tag");
+#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the NPU kernels of c_allreduce_prod, one per element type.
+REGISTER_OP_NPU_KERNEL(c_allreduce_prod,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the NPU kernels of c_allreduce_sum, one per element type.
+REGISTER_OP_NPU_KERNEL(c_allreduce_sum,
+    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>);
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cc
@ -42,6 +42,11 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(0);
    AddAttr<int>("root", "(int default 0) root id for broadcasting.")
        .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    // The NPU kernel forwards this attribute to hcom_broadcast as its tag,
+    // so it is only declared for Ascend builds.
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@ -0,0 +1,94 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// NPU (Ascend) kernel of c_broadcast.
+//
+// Calls hcom_broadcast on the data buffer of the "X" input, using the "root",
+// "ring_id" and "tag" attributes, then copies X into "Out" when they are
+// different tensors and gives Out the dims and LoD of X. The call is issued on
+// the device context's stream when "use_calc_stream" is true, otherwise on the
+// stream of the HCCL communicator.
+//
+// Throws PreconditionNotMet when Paddle is built without
+// PADDLE_WITH_ASCEND_CL.
+template <typename T>
+class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    // hcom_broadcast is handed a mutable void*, hence the const_cast.
+    void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
+    // Keep the full 64-bit element count instead of narrowing it to int.
+    int64_t numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+
+    auto place = ctx.GetPlace();
+    auto comm = paddle::platform::HCCLCommContext::Instance().Get();
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    int root = ctx.Attr<int>("root");
+    int ring_id = ctx.Attr<int>("ring_id");
+    std::string group =
+        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    std::string tag = ctx.Attr<std::string>("tag");
+
+    VLOG(3) << "begin hccl broadcast, parameter is: "<< "root " << root
+      << ", group is " << group
+      << ", tag is " << tag;
+
+    // Every rank issues the same call; only the log line below differs
+    // between the root and the other ranks.
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(
+        tag.c_str(), ptr, numel, dtype, static_cast<uint32_t>(root),
+        group.c_str(), static_cast<void*>(stream)));
+    if (root == static_cast<int>(comm->rank())) {
+      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
+              << x->numel();
+    } else {
+      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
+              << framework::product(out->dims());
+    }
+
+    // The broadcast works on X's buffer; mirror it into Out when the two are
+    // distinct tensors.
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(c_broadcast,
+                        ops::CBroadcastOpASCENDKernel<float>,
+                        ops::CBroadcastOpASCENDKernel<int>,
+                        ops::CBroadcastOpASCENDKernel<int8_t>,
+                        ops::CBroadcastOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
@ -0,0 +1,79 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+// Operator (derived from OperatorBase) that creates the HCCL communicator for
+// this process: it reads the "rank_table_file", "rank_id" and "device_id"
+// attributes and hands them to HCCLCommContext::CreateHCCLComm.
+class CCommInitOpNPU : public framework::OperatorBase {
+ public:
+  CCommInitOpNPU(const std::string& type,
+              const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Neither `scope` nor `place` is used; everything comes from attributes.
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::string rank_table_file = Attr<std::string>("rank_table_file");
+    // The attributes are declared as int and converted to uint32_t here.
+    // NOTE(review): a negative attribute value would wrap around — confirm
+    // callers never pass one.
+    uint32_t rank_id = Attr<int>("rank_id");
+    uint32_t device_id = Attr<int>("device_id");
+
+    VLOG(3) << "begin init hccl, parameter is: "
+            << "rank_table_file " << rank_table_file 
+            << " rank_id " << rank_id
+            << " device_id " << device_id;
+    
+    platform::HCCLCommContext::Instance().CreateHCCLComm(rank_table_file, rank_id, device_id);
+  }
+};
+
+// Declares the proto of c_comm_init_hccl: no inputs or outputs, and three
+// attributes without default values (rank_table_file, rank_id, device_id).
+class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddComment(R"DOC(
+CCommInit operator on NPU
+
+Initialize collective communication context within this trainer
+)DOC");
+    AddAttr<std::string>("rank_table_file",
+        "(string) path to rank_table_file");
+    AddAttr<int>("rank_id", "(int) world rank id of the process");
+    AddAttr<int>("device_id", "(int) device id of the process/thread");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpNPU,
+   ops::CCommInitOpNPUMaker);
+
+#endif
--- a/paddle/fluid/operators/collective/c_create_group_op.cc
+++ b/paddle/fluid/operators/collective/c_create_group_op.cc
@ -0,0 +1,76 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+// Operator (derived from OperatorBase) that creates an HCCL communication
+// group: it reads the "group_name", "nranks" and "rank_ids" attributes and
+// hands them to HCCLCommContext::CreateHCCLGroup.
+class CCreateGroupOpNPU : public framework::OperatorBase {
+ public:
+  CCreateGroupOpNPU(const std::string& type,
+              const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Neither `scope` nor `place` is used; everything comes from attributes.
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::string group_name = Attr<std::string>("group_name");
+    int nranks = Attr<int>("nranks");
+    std::vector<int> rank_ids = Attr<std::vector<int>>("rank_ids");
+    // The int attributes are converted to the uint32_t types that
+    // CreateHCCLGroup takes.
+    paddle::platform::HCCLCommContext::Instance().CreateHCCLGroup(
+        group_name, (uint32_t)nranks,
+        std::vector<uint32_t>(rank_ids.begin(), rank_ids.end()));
+  }
+};
+
+// Declares the proto of c_create_group: no inputs or outputs, and three
+// attributes without default values (group_name, nranks, rank_ids).
+class CCreateGroupOpNPUMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddComment(R"DOC(
+CCreateGroup operator on NPU
+
+Create collective communication group on NPU
+)DOC");
+    AddAttr<std::string>("group_name",
+        "(string) name of the collective communication group");
+    AddAttr<int>("nranks", "(int) number of the group");
+    AddAttr<std::vector<int>>("rank_ids",
+                 "(list of int) The world rank id of the group members");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_create_group, ops::CCreateGroupOpNPU,
+    ops::CCreateGroupOpNPUMaker);
+
+#endif
--- a/paddle/fluid/operators/collective/c_hcom_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_hcom_op_npu_test.cc
@ -0,0 +1,192 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_broadcast);
+USE_OP(c_allreduce_sum);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_NO_KERNEL_OP(c_create_group);
+USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
+USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
+
+// Initializes HCCL for the test process: runs c_comm_init_hccl with the values
+// of the RANK_TABLE_FILE, RANK_ID and DEVICE_ID environment variables, then
+// runs c_create_group for a group of ranks {0, 1} named
+// HCOM_GROUP_PREFIX + "0".
+//
+// Records a fatal gtest failure and returns early when one of the environment
+// variables is not set.
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+  // getenv returns nullptr for an unset variable; feeding that to std::string
+  // or atoi is undefined behavior, so check before use.
+  const char* rank_table_env = getenv("RANK_TABLE_FILE");
+  const char* rank_id_env = getenv("RANK_ID");
+  const char* device_id_env = getenv("DEVICE_ID");
+  ASSERT_TRUE(rank_table_env != nullptr) << "RANK_TABLE_FILE is not set";
+  ASSERT_TRUE(rank_id_env != nullptr) << "RANK_ID is not set";
+  ASSERT_TRUE(device_id_env != nullptr) << "DEVICE_ID is not set";
+
+  std::string rank_table_file = rank_table_env;
+  int rank_id = atoi(rank_id_env);
+  int device_id = atoi(device_id_env);
+
+  printf("rank_table_file: %s, rank_id = %d, device_id = %d\n", rank_table_file.c_str(), rank_id, device_id);
+
+  f::AttributeMap attrs;
+  attrs["rank_table_file"] = rank_table_file;
+  attrs["rank_id"] = rank_id;
+  attrs["device_id"] = device_id;
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hccl", {}, {}, attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  f::AttributeMap create_attrs;
+  create_attrs["group_name"] = HCOM_GROUP_PREFIX + std::to_string(0);
+  create_attrs["nranks"] = 2;
+  std::vector<int> rank_ids{0, 1};
+  create_attrs["rank_ids"] = rank_ids;
+  auto create_group_op = f::OpRegistry::CreateOp("c_create_group", {}, {}, create_attrs);
+  create_group_op->Run(*scope, place);
+  ctx.Wait();
+}
+// Runs c_broadcast (root 0, ring 0) on a 2x2 float tensor filled with
+// 1.0 + RANK_ID and checks that "Out" holds 1.0 everywhere, i.e. the value
+// the root rank (rank 0) started with.
+//
+// Records a fatal gtest failure and returns early when RANK_ID is not set.
+void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  int num = 2;
+  std::vector<float> init;
+  // getenv returns nullptr for an unset variable; atoi(nullptr) is undefined
+  // behavior, so check before converting.
+  const char* rank_id_env = getenv("RANK_ID");
+  ASSERT_TRUE(rank_id_env != nullptr) << "RANK_ID is not set";
+  int rank_id = atoi(rank_id_env);
+  std::cout<< "rank_id:" << rank_id<<std::endl;
+  for (int64_t i = 0; i < num * num; ++i) {
+    init.push_back(1.0 + rank_id);
+    std::cout<< init[0];
+  }
+  std::cout<<std::endl;
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num, num});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num, num});
+  tensor_out->mutable_data<float>(place);  // allocate
+
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["root"]=0;
+  attrs["ring_id"]=0;
+
+  auto op =
+      f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
+                              {{"Out", {"Out"}}}, attrs);
+
+  op->Run(*scope, place);
+
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+
+  ctx.Wait();
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 1.0);
+  }
+}
+
+// Runs c_allreduce_sum (ring 0) on a 1x4 float tensor filled with 1.0 and
+// checks that "Out" holds 2.0 everywhere.
+// NOTE(review): the expected 2.0 assumes exactly two ranks take part in the
+// reduction — confirm against the launch configuration.
+//
+// Records a fatal gtest failure and returns early when RANK_ID is not set.
+void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<float> init;
+  // getenv returns nullptr for an unset variable; atoi(nullptr) is undefined
+  // behavior, so check before converting.
+  const char* rank_id_env = getenv("RANK_ID");
+  ASSERT_TRUE(rank_id_env != nullptr) << "RANK_ID is not set";
+  int rank_id = atoi(rank_id_env);
+  std::cout<< "rank_id:" << rank_id<<std::endl;
+
+  int num1 = 1;
+  int num2 = 4;
+
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0);
+    std::cout<< init[0];
+  }
+  std::cout<<std::endl;
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["ring_id"]=0;
+
+  auto op =
+      f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
+                              {{"Out", {"Out"}}}, attrs);
+
+  op->Run(*scope, place);
+
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+
+  ctx.Wait();
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 2.0);
+  }
+}
+// Test entry point: builds an NPU device context for the device named by the
+// FLAGS_selected_npus environment variable, initializes HCCL and runs the
+// all-reduce check.
+TEST(c_broadcast, NPU) {
+  f::Scope scope;
+  // getenv returns nullptr for an unset variable; atoi(nullptr) is undefined
+  // behavior, so fail the test with a clear message instead.
+  const char* npu_id = getenv("FLAGS_selected_npus");
+  ASSERT_TRUE(npu_id != nullptr) << "FLAGS_selected_npus is not set";
+
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+
+  // Stop here if HCCL initialization reported a fatal failure.
+  ASSERT_NO_FATAL_FAILURE(Prepare(&scope, ctx));
+  // NOTE(review): TestHCCLBroadcastOp is defined in this file but is not
+  // invoked here — TODO confirm whether it can be enabled.
+  TestHCCLAllReduceOp(&scope, ctx);
+}
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -128,7 +128,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})

-cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto  device_context enforce)
+cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc DEPS framework_proto  device_context enforce)

 if(WITH_GPU)
    cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@ -14,20 +14,21 @@

 #pragma once

-#if defined(PADDLE_WITH_NCCL)
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>

 #include "boost/variant.hpp"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/dynload/hccl.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace platform {

+#if defined(PADDLE_WITH_NCCL)
 // In order to apply hierarchical communication with NCCL, we need
 // a communication ring contains NCCL communicators associated to a global
 // ncclUniqueId. E.g. for a hierarchical case,
@ -47,6 +48,8 @@ namespace platform {
 //
 // The NCCLComm instance is created and reversed in the NCCLCommContext
 // singleton with a global user specified group id.
+class CUDADeviceContext;
+
 class NCCLComm {
 public:
  virtual int ring_id() const = 0;
@ -120,8 +123,162 @@ class NCCLCommContext {
  NCCLCommContext() = default;
  DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
 };
+#endif

-}  // namespace platform
-}  // namespace paddle
+#if defined(PADDLE_WITH_ASCEND_CL)
+// In order to apply hierarchical communication with HCCL, we need
+// a communication ring contains HCCL communicators associated to a global
+// HCCLUniqueId. E.g. for a hierarchical case,
+//
+//    11 - 12   21 - 22
+//     |    |    |    |
+//    13 - 14 - 23 - 24
+//          |    |
+//    31 - 32 - 41 - 42
+//     |    |    |    |
+//    33 - 34   43 - 44
+//
+// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
+// (31,32,33,34), (41,42,43,44) as bottoms respectively.
+//
+// We could also use a single communication ring for the flatten case
+//
+// The HCCLComm instance is created and reserved in the HCCLCommContext
+// singleton with a global user specified group id.
+class NPUDeviceContext;
+
+// Abstract, read-only view of one HCCL communicator: the rank table file,
+// rank and device id it was configured with, and the device context / stream
+// associated with it.
+class HCCLComm {
+ public:
+  // Rank table file recorded for this communicator.
+  virtual std::string rank_table_file() const = 0;
+  // Rank recorded for this communicator.
+  virtual uint32_t rank() const = 0;
+  // Device id recorded for this communicator.
+  virtual uint32_t device_id() const = 0;
+  // Stream associated with this communicator.
+  virtual aclrtStream stream() const = 0;
+  // Device context associated with this communicator.
+  virtual NPUDeviceContext* dev_context() const = 0;
+  virtual ~HCCLComm() = default;
+};
+
+// A singleton HCCL communicator context reserves communication ring ids
+class HCCLCommContext {
+ public:
+  static HCCLCommContext& Instance() {
+    static HCCLCommContext comm_ctx;
+    return comm_ctx;
+  }
+
+  HCCLComm* CreateHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
+
+  void CreateHCCLGroup(const std::string& group_name, uint32_t nranks, const std::vector<uint32_t>& rank_ids);
+
+  // retrieve a communicator by the ring id and place
+  HCCLComm* Get() const {
+    return comm_.get();
+  }
+ private:
+  std::once_flag once_flag_;
+  std::mutex comm_map_mutex_;
+  std::unique_ptr<HCCLComm> comm_;
+
+  HCCLComm* AssignHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
+
+  HCCLCommContext() = default;
+  DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
+};
+#endif
+
+#if defined(PADDLE_WITH_XPU_BKCL)
+// In order to apply hierarchical communication with BKCL, we need
+// a communication ring contains BKCL communicators associated to a global
+// BKCLUniqueId. E.g. for a hierarchical case,
+//
+//    11 - 12   21 - 22
+//     |    |    |    |
+//    13 - 14 - 23 - 24
+//          |    |
+//    31 - 32 - 41 - 42
+//     |    |    |    |
+//    33 - 34   43 - 44
+//
+// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
+// (31,32,33,34), (41,42,43,44) as bottoms respectively.
+//
+// We could also use a single communication ring for the flatten case
+//
+// The BKCLComm instance is created and reserved in the BKCLCommContext
+// singleton with a global user specified group id.
+// Abstract accessor interface for one BKCL communicator: its ring id, rank
+// information, device id, BKCL context handle, stream and device context.
+class BKCLComm {
+ public:
+  virtual int ring_id() const = 0;
+  virtual int nranks() const = 0;
+  virtual int rank() const = 0;
+  virtual int device_id() const = 0;
+  virtual BKCLContext_t comm() const = 0;
+  virtual XPUStream stream() const = 0;
+  virtual XPUDeviceContext* dev_context() const = 0;
+  virtual ~BKCLComm() = default;
+};
+
+// A singleton BKCL communicator context reserves communication ring ids
+class BKCLCommContext {
+ public:
+  static BKCLCommContext& Instance() {
+    static BKCLCommContext comm_ctx;
+    return comm_ctx;
+  }
+
+  BKCLComm* CreateBKCLComm(BKCLUniqueId* bkcl_id, int nranks, int rank,
+                           int dev_id, int ring_id = 0);
+
+  void CreateAllBKCLComms(const std::vector<int>& dev_ids, int ring_id = 0);
+
+  // a latter comm with the same dev_id and the same ring_id
+  // will override the former
+  BKCLComm* AssignBKCLComm(BKCLContext_t comm, int nranks, int rank, int dev_id,
+                           int ring_id = 0);
+
+  // retrieve a communicator by the ring id in multiprocessing mode
+  BKCLComm* Get(int ring_id) const {
+    PADDLE_ENFORCE_GT(
+        comm_map_.count(ring_id), 0,
+        platform::errors::InvalidArgument(
+            "Communicator in ring id %d has not been initialized.", ring_id));
+    PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
+                      platform::errors::InvalidArgument(
+                          "One device id should be specified to retrieve from "
+                          "multiple communicators."));
+    return comm_map_.at(ring_id).begin()->second.get();
+  }
+
+  // retrieve a communicator by the ring id and the device id
+  BKCLComm* Get(int ring_id, int dev_id) const {
+    PADDLE_ENFORCE_GT(
+        comm_map_.count(ring_id), 0,
+        platform::errors::InvalidArgument(
+            "Communicator of ring id %d has not been initialized.", ring_id));
+    PADDLE_ENFORCE_GT(
+        comm_map_.at(ring_id).count(dev_id), 0,
+        platform::errors::InvalidArgument(
+            "Communicator at device id %d has not been initialized in ring %d.",
+            dev_id, ring_id));
+    return comm_map_.at(ring_id).at(dev_id).get();
+  }
+
+  // retrieve a communicator by the ring id and place
+  BKCLComm* Get(int ring_id, Place place) const {
+    return Get(ring_id, BOOST_GET_CONST(XPUPlace, place).device);
+  }
+
+ private:
+  std::once_flag once_flag_;
+  std::mutex comm_map_mutex_;
+  // ring id to dev-BKCLComm
+  std::map<int, std::map<int, std::unique_ptr<BKCLComm>>> comm_map_;

+  void ReleaseBKCLComms();
+
+  BKCLCommContext() = default;
+  DISABLE_COPY_AND_ASSIGN(BKCLCommContext);
+};
 #endif
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/collective_helper_npu.cc
+++ b/paddle/fluid/platform/collective_helper_npu.cc
@ -0,0 +1,111 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include <utility>
+
+namespace paddle {
+namespace platform {
+
+// Concrete HCCLComm that stores the values handed to it through its setters
+// and owns the NPUDeviceContext whose stream it exposes.
+class HCCLCommImpl : public HCCLComm {
+ public:
+  void set_rank_table_file(const std::string& rank_table_file) {
+    rank_table_file_ = rank_table_file;
+  }
+  std::string rank_table_file() const override { return rank_table_file_; }
+
+  void set_rank(uint32_t rank) { rank_ = rank; }
+  uint32_t rank() const override { return rank_; }
+
+  void set_device_id(uint32_t device_id) { device_id_ = device_id; }
+  uint32_t device_id() const override { return device_id_; }
+
+  // Requires set_dev_ctx() to have been called first: dev_ctx_ is
+  // dereferenced unconditionally.
+  aclrtStream stream() const override { return dev_ctx_->stream(); }
+
+  // Takes ownership of the given device context.
+  void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
+    dev_ctx_ = std::move(dev_ctx);
+  }
+  NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); }
+
+ private:
+  std::string rank_table_file_;
+  // Zero-initialized so rank()/device_id() never return indeterminate values
+  // when called before the corresponding setter.
+  uint32_t rank_{0};
+  uint32_t device_id_{0};
+  std::unique_ptr<NPUDeviceContext> dev_ctx_;
+};
+
+// Creates the HCCL communicator held by this context.
+//
+// Builds the communicator wrapper (and its NPUDeviceContext) through
+// AssignHCCLComm, initializes HCOM with the rank table file and the rank
+// formatted as a decimal string, then binds the wrapper's stream.
+//
+// rank_table_file: path handed to hcom_init.
+// rank:            rank of this process; sent to hcom_init as a string.
+// device_id:       NPU device the communicator's device context is created on.
+//
+// Returns a non-owning pointer to the wrapper stored in this context.
+// Raises an External error when hcom_init does not report HCCL_SUCCESS.
+HCCLComm* HCCLCommContext::CreateHCCLComm(const std::string& rank_table_file,
+                                          uint32_t rank, uint32_t device_id) {
+  // rank and device_id are unsigned and rank_table_file is a reference, so
+  // the ">= 0" / "not null" checks that used to sit here (commented out) could
+  // never fire. The HCOM return codes are checked instead.
+  auto* comm_wrapper = AssignHCCLComm(rank_table_file, rank, device_id);
+
+  const std::string rank_str = std::to_string(rank);
+  const auto init_ret =
+      platform::dynload::hcom_init(rank_table_file.c_str(), rank_str.c_str());
+  PADDLE_ENFORCE_EQ(
+      static_cast<int>(init_ret), static_cast<int>(HCCL_SUCCESS),
+      platform::errors::External(
+          "hcom_init failed with error code %d (rank table file: %s, rank: "
+          "%s).",
+          static_cast<int>(init_ret), rank_table_file, rank_str));
+
+  // NOTE(review): the stream is passed as both arguments, exactly as before;
+  // confirm against the hcom_bind_model declaration that this is intended.
+  // Because of that open question a failure here is reported, not fatal.
+  const auto bind_ret = platform::dynload::hcom_bind_model(
+      comm_wrapper->stream(), comm_wrapper->stream());
+  if (bind_ret != HCCL_SUCCESS) {
+    LOG(WARNING) << "hcom_bind_model returned error code "
+                 << static_cast<int>(bind_ret) << " for rank " << rank;
+  }
+
+  VLOG(1) << "hccl communicator of rank " << rank << " has been created";
+  return comm_wrapper;
+}
+
+// Builds a fresh HCCLCommImpl for the given rank table file, rank and device,
+// stores it as this context's communicator (replacing any previous one) and
+// returns a non-owning pointer to it.
+HCCLComm* HCCLCommContext::AssignHCCLComm(const std::string& rank_table_file,
+                                          uint32_t rank, uint32_t device_id) {
+  std::unique_ptr<NPUDeviceContext> npu_ctx(
+      new NPUDeviceContext(NPUPlace(device_id)));
+
+  VLOG(3) << "device_id" << device_id;
+  VLOG(3) << "dev_ctx->stream()" << npu_ctx->stream();
+
+  std::unique_ptr<HCCLCommImpl> impl(new HCCLCommImpl);
+  impl->set_rank_table_file(rank_table_file);
+  impl->set_rank(rank);
+  impl->set_device_id(device_id);
+  impl->set_dev_ctx(std::move(npu_ctx));
+
+  // Keep a raw handle before ownership moves into the context.
+  HCCLCommImpl* handle = impl.get();
+  comm_ = std::move(impl);
+  return handle;
+}
+
+// Creates an HCCL group through hcom_create_group.
+//
+// group_name: name of the group to create.
+// nranks:     number of entries of rank_ids handed to HCOM.
+// rank_ids:   rank ids of the group members; must hold at least nranks
+//             entries.
+//
+// Raises InvalidArgument when nranks exceeds rank_ids.size(), and an External
+// error when hcom_create_group does not report HCCL_SUCCESS.
+void HCCLCommContext::CreateHCCLGroup(const std::string& group_name,
+                                      uint32_t nranks,
+                                      const std::vector<uint32_t>& rank_ids) {
+  // HCOM is given a raw pointer plus a count, so a count larger than the
+  // vector would make it read past the end of the buffer.
+  PADDLE_ENFORCE_LE(
+      static_cast<size_t>(nranks), rank_ids.size(),
+      platform::errors::InvalidArgument(
+          "Expected nranks <= number of rank ids. But received nranks is %d "
+          "and the number of rank ids is %d.",
+          nranks, rank_ids.size()));
+
+  // The HCOM API takes a non-const pointer; const_cast makes the dropped
+  // qualifier explicit instead of hiding it behind a C-style cast.
+  const auto ret = platform::dynload::hcom_create_group(
+      group_name.c_str(), nranks, const_cast<uint32_t*>(rank_ids.data()));
+  PADDLE_ENFORCE_EQ(
+      static_cast<int>(ret), static_cast<int>(HCCL_SUCCESS),
+      platform::errors::External(
+          "hcom_create_group failed with error code %d (group name: %s).",
+          static_cast<int>(ret), group_name));
+
+  VLOG(1) << "hccl group with name " << group_name << " has been created";
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@ -9,7 +9,7 @@ endif()
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
 if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+    list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
  if (WITH_NCCL)
    list(APPEND CUDA_SRCS nccl.cc)
  endif()
@ -32,6 +32,8 @@ endif(CUPTI_FOUND)
 if(WITH_ROCM_PLATFORM)
  hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
  hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+elseif (WITH_ASCEND_CL)
+  cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc)
 else()
  nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
--- a/paddle/fluid/platform/dynload/base.h
+++ b/paddle/fluid/platform/dynload/base.h
@ -0,0 +1,127 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file base.h
+ * @brief HCOM data type definition 
+ * 
+ */
+
+#ifndef HCCL_BASE_H_
+#define HCCL_BASE_H_
+
+#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef signed char s8;
+typedef signed short s16;
+typedef signed int s32;
+typedef signed long long s64;
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+/**
+ * @brief HCOM functions return value definition
+ */
+typedef enum tagHcclResult {
+    HCCL_SUCCESS = 0,               /**< success */
+    HCCL_E_PARA = 1,                /**< parameter error */
+    HCCL_E_PTR = 2,                 /**< empty pointer */
+    HCCL_E_MEMORY = 3,              /**< memory error */
+    HCCL_E_INTERNAL = 4,            /**< internal error */
+    HCCL_E_NOT_SUPPORT = 5,         /**< not support feature */
+    HCCL_E_NOT_FOUND = 6,           /**< not found specific resource */
+    HCCL_E_UNAVAIL = 7,             /**< resource unavailable */
+    HCCL_E_SYSCALL = 8,             /**< call system interface error */
+    HCCL_E_TIMEOUT = 9,             /**< timeout */
+    HCCL_E_OPEN_FILE_FAILURE = 10,  /**< open file fail */
+    HCCL_E_TCP_CONNECT = 11,        /**< tcp connect fail */
+    HCCL_E_ROCE_CONNECT = 12,       /**< roce connect fail */
+    HCCL_E_TCP_TRANSFER = 13,       /**< tcp transfer fail */
+    HCCL_E_ROCE_TRANSFER = 14,      /**< roce transfer fail */
+    HCCL_E_RUNTIME = 15,            /**< call runtime api fail */
+    HCCL_E_DRV = 16,                /**< call driver api fail */
+    HCCL_E_PROFILING = 17,          /**< call profiling api fail */
+    HCCL_E_CCE = 18,                /**< call cce api fail */
+    HCCL_E_NETWORK = 19,            /**< call network api fail */
+    HCCL_E_RESERVED                 /**< reserved */
+} hcclResult_t;
+
+/* handle to communicator */
+typedef void *hcclComm_t;
+
+/**
+ * @brief HCCL Reduction operation
+ */
+typedef enum tagHcclRedOp {
+    HCCL_REP_OP_SUM = 0,    /**< sum */
+    HCCL_REP_OP_PROD = 1,   /**< prod */
+    HCCL_REP_OP_MAX = 2,    /**< max */
+    HCCL_REP_OP_MIN = 3,    /**< min */
+    HCCL_REP_OP_RESERVED    /**< reserved */
+} hcclRedOp_t;
+
+/**
+ * @brief HCCL data type
+ */
+typedef enum tagHcclDataType {
+    HCCL_DATA_TYPE_INT8 = 0,  /**< int8 */
+    HCCL_DATA_TYPE_INT16 = 1,  /**< int16 */
+    HCCL_DATA_TYPE_INT32 = 2,   /**< int32 */
+    HCCL_DATA_TYPE_FP16 = 3,  /**< fp16 */
+    HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */
+    HCCL_DATA_TYPE_INT64 = 5, /**< int64 */
+    HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
+    HCCL_DATA_TYPE_RESERVED   /**< reserved */
+} hcclDataType_t;
+
+const u32 HCCL_MAX_SEGMENT_NUM = 8;   // The max number of gradient segments.
+
+/**
+ * @brief the feature of the model
+ */
+struct model_feature {
+    const char *model_name;  /**< The model name */
+    u32 gradient_num;        /**< The number of gradients */
+    float *gradient_size;    /**< The size of each gradient */
+    float *gradient_time;    /**< The BP computation time of each gradient */
+};
+
+enum GradSplitForceMode {
+    FORCE_NONE,     /**< no force */
+    FORCE_SIZE,     /**< force split gradient by size */
+    FORCE_RESERVED  /**< reserved */
+};
+
+/**
+* @brief stream handle.
+*/
+typedef void *rtStream_t;
+
+/**
+* @brief model handle.
+*/
+typedef void *rtModel_t;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // HCCL_BASE_H_
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@ -21,6 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"

+
+
+
+
 DEFINE_string(cudnn_dir, "",
              "Specify path for loading libcudnn.so. For instance, "
              "/usr/local/cudnn/lib. If empty [default], dlopen "
@ -36,6 +40,11 @@ DEFINE_string(nccl_dir, "",
              "For instance, /usr/local/cuda/lib64. If default, "
              "dlopen will search cuda from LD_LIBRARY_PATH");

+DEFINE_string(hccl_dir, "",
+              "Specify path for loading hccl library, such as libhccl.so. "
+              "For instance, /usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If default, "
+              "dlopen will search hccl from LD_LIBRARY_PATH");
+
 DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");

 DEFINE_string(
@ -383,6 +392,26 @@ void* GetNCCLDsoHandle() {
                                    warning_msg);
 #endif
 }
+// Returns the handle of the HCCL shared library (libhccl.so), using
+// FLAGS_hccl_dir as the search root.
+//
+// HCCL has no NCCL/RCCL or macOS flavour, so unlike GetNCCLDsoHandle there is
+// a single library name here; the earlier per-platform branches loaded
+// libnccl/librccl, which do not export the hcom_* symbols.
+void* GetHCCLDsoHandle() {
+  std::string warning_msg(
+      "You may need to install 'hccl2' from the Huawei official website "
+      "before installing PaddlePaddle.");
+  return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
+                                    warning_msg);
+}
+
+

 void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle();
 void* GetCUDADsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
+void* GetHCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetOpDsoHandle(const std::string& dso_name);
--- a/paddle/fluid/platform/dynload/hccl.cc
+++ b/paddle/fluid/platform/dynload/hccl.cc
@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/hccl.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+// Guards the one-time lookup of the HCCL library handle below.
+std::once_flag hccl_dso_flag;
+// Handle of the HCCL shared library, filled in on first wrapper call.
+void *hccl_dso_handle;
+
+// Defines the global wrapper object declared `extern` in hccl.h.
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+// NOTE(review): HCCL_VERSION_CODE is not defined in this file or in hccl.h.
+// An undefined identifier evaluates to 0 in #if, so unless an included header
+// defines it the two sections below are compiled out - confirm whether they
+// are still needed.
+#if HCCL_VERSION_CODE >= 2212
+HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
+#endif
+
+#if HCCL_VERSION_CODE >= 2703
+HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/hccl.h
+++ b/paddle/fluid/platform/dynload/hccl.h
@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+// #include <hccl/hccl.h>
+// #include <hccl/hccl_types.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/platform/dynload/hcom.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag hccl_dso_flag;
+extern void* hccl_dso_handle;
+
+// Declares a callable wrapper object named after the HCOM function `__name`.
+// Calling it obtains the HCCL library handle once (via GetHCCLDsoHandle and
+// hccl_dso_flag), resolves the symbol with dlsym on the first call of that
+// wrapper, and forwards the arguments to it.
+// NOTE(review): the dlsym result is not checked, so a missing symbol leads to
+// a call through a null pointer.
+#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name)                           \
+  struct DynLoad__##__name {                                             \
+    template <typename... Args>                                          \
+    auto operator()(Args... args) -> decltype(__name(args...)) {         \
+      using HCCL_func = decltype(&::__name);                             \
+      std::call_once(hccl_dso_flag, []() {                               \
+        hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \
+      });                                                                \
+      static void* p_##__name = dlsym(hccl_dso_handle, #__name);         \
+      return reinterpret_cast<HCCL_func>(p_##__name)(args...);           \
+    }                                                                    \
+  };                                                                     \
+  extern DynLoad__##__name __name
+
+#define HCCL_RAND_ROUTINE_EACH(__macro)         \
+  __macro(hcom_init);                           \
+  __macro(hcom_destroy);                        \
+  __macro(hcom_bind_model);                     \
+  __macro(hcom_unbind_model);                   \
+  __macro(hcom_send);                           \
+  __macro(hcom_receive);                        \
+  __macro(hcom_broadcast);                      \
+  __macro(hcom_all_gather);                     \
+  __macro(hcom_all_reduce);                     \
+  __macro(hcom_reduce_scatter);                 \
+  __macro(hcom_create_group);                   \
+  __macro(hcom_destroy_group);                  \
+  __macro(hcom_get_rank_id);                    \
+  __macro(hcom_get_local_rank_id);              \
+  __macro(hcom_get_local_rank_size);            \
+  __macro(hcom_get_split_strategy);             \
+  __macro(hcom_set_split_strategy_by_size);     \
+  __macro(hcom_set_split_strategy_by_index);    \
+  __macro(hcom_get_group_rank_from_world_rank); \
+  __macro(hcom_get_world_rank_from_group_rank); 
+
+
+HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
+
+#if HCCL_VERSION_CODE >= 2212
+#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast);
+HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
+#endif
+
+#if HCCL_VERSION_CODE >= 2703
+#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
+  __macro(HCCLSend);                               \
+  __macro(HCCLRecv);
+HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
--- a/Show More
+++ b/Show More