add allreduce and broadcast without test (#31024)

add allreduce and broadcast without test
revert-31562-mean
lw921014 5 years ago committed by GitHub
parent 5618f14047
commit 9fcdaeba5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -62,6 +62,7 @@ endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
@ -73,6 +74,9 @@ if(WITH_ASCEND_CL)
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)

@ -205,8 +205,11 @@ elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}

@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)

@ -79,6 +79,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(3) << "npu num: " <<platform::GetNPUDeviceCount();
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
@ -141,6 +142,7 @@ class AllocatorFacadePrivate {
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: allocators_)
: zero_size_allocators_);
VLOG(3) <<size;
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
platform::errors::NotFound(

@ -24,6 +24,7 @@ namespace memory {
namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
VLOG(3) << "FLAGS_allocator_strategy" << FLAGS_allocator_strategy;
if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
}

@ -11,24 +11,29 @@ foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
endforeach()
register_operators(EXCLUDES c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
cc_library(gen_nccl_id_op_helper SRCS gen_nccl_id_op_helper.cc DEPS nccl_common)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_ASCEND)
op_library(gen_nccl_id_op)
op_library(c_gen_nccl_id_op)
if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
endif()
if(WITH_XPU_BKCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
if(WITH_ASCEND_CL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
cc_test(c_hcom_op_npu_test SRCS c_hcom_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_comm_init_hccl_op c_create_group_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)

@ -0,0 +1,31 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
// Forward declarations of platform types named in this translation unit.
namespace paddle {
namespace platform {
// NOTE(review): ASCENDPlace is declared here but not used in this file —
// confirm whether the declaration is still needed.
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Register the NPU kernels of the c_allreduce_max op: the kRedMax
// instantiation of CAllReduceOpASCENDKernel for float, int, int8_t and
// float16 element types.
REGISTER_OP_NPU_KERNEL(c_allreduce_max,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)

@ -0,0 +1,31 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
// Forward declarations of platform types named in this translation unit.
namespace paddle {
namespace platform {
// NOTE(review): ASCENDPlace is declared here but not used in this file —
// confirm whether the declaration is still needed.
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Register the NPU kernels of the c_allreduce_min op: the kRedMin
// instantiation of CAllReduceOpASCENDKernel for float, int, int8_t and
// float16 element types.
REGISTER_OP_NPU_KERNEL(c_allreduce_min,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)

@ -30,6 +30,11 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
@ -105,6 +110,88 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}
};
// All-reduce kernel for Ascend NPU, built on HCCL's hcom_all_reduce.
//
// Template parameters:
//   red_type - reduction to apply (kRedSum, kRedMax, kRedMin or kRedProd).
//   T        - element type used to access the input and output buffers.
//
// Reads input "X" and writes output "Out", which is resized to X's dims and
// allocated on the current place. Uses the attributes "tag", "ring_id" and
// "use_calc_stream".
//
// Throws InvalidArgument for an unknown red_type, and PreconditionNotMet when
// the communicator stream is requested before an HCCL communicator exists or
// when Paddle was built without PADDLE_WITH_ASCEND_CL.
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto in = ctx.Input<framework::LoDTensor>("X");
    auto out = ctx.Output<framework::LoDTensor>("Out");
    auto place = ctx.GetPlace();

    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
    int64_t numel = in->numel();

    // hcom_all_reduce takes non-const buffers, so the constness of the input
    // data pointer has to be cast away.
    void* sendbuff = static_cast<void*>(const_cast<T*>(in->data<T>()));
    out->Resize(in->dims());
    void* recvbuff = static_cast<void*>(out->mutable_data<T>(place));

    std::string tag = ctx.Attr<std::string>("tag");
    int ring_id = ctx.Attr<int>("ring_id");
    // NOTE(review): the group is pinned to the HCCL world group, so ring_id
    // does not select the communication group here. Confirm whether
    // std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id) should be used
    // instead once per-ring groups are supported.
    std::string group = "hccl_world_group";

    aclrtStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      // The communicator only exists after it has been created through
      // HCCLCommContext; fail with a clear error instead of dereferencing
      // a null pointer.
      auto comm = paddle::platform::HCCLCommContext::Instance().Get();
      PADDLE_ENFORCE_NOT_NULL(
          comm, platform::errors::PreconditionNotMet(
                    "The HCCL communicator has not been initialized."));
      stream = comm->stream();
    }

    hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
    switch (red_type) {
      case kRedSum:
        hccl_red_type = HCCL_REP_OP_SUM;
        break;
      case kRedMax:
        hccl_red_type = HCCL_REP_OP_MAX;
        break;
      case kRedMin:
        hccl_red_type = HCCL_REP_OP_MIN;
        break;
      case kRedProd:
        hccl_red_type = HCCL_REP_OP_PROD;
        break;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Invalid reduce type: %d", red_type));
    }

    VLOG(3) << "begin hccl allreduce, parameter is: "
            << "input num: " << numel << ", dtype: " << dtype
            << ", hccl_red_type: " << hccl_red_type
            << ", ring_id: " << ring_id << ", group is: " << group
            << ", tag is " << tag << ", sendbuff: " << sendbuff
            << ", recvbuff: " << recvbuff;

    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
        tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type,
        group.c_str(), reinterpret_cast<void*>(stream)));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
template <ReduceType red_type, typename T>
class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
public:
@ -114,7 +201,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
ncclDataType_t dtype = platform::ToHCCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
out->Resize(in->dims());
@ -170,6 +257,11 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
#pragma message("hccl CAllReduceOpMaker need tag attr")
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")

@ -0,0 +1,31 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
// Forward declarations of platform types named in this translation unit.
namespace paddle {
namespace platform {
// NOTE(review): ASCENDPlace is declared here but not used in this file —
// confirm whether the declaration is still needed.
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Register the NPU kernels of the c_allreduce_prod op: the kRedProd
// instantiation of CAllReduceOpASCENDKernel for float, int, int8_t and
// float16 element types.
REGISTER_OP_NPU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)

@ -0,0 +1,31 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
// Forward declarations of platform types named in this translation unit.
namespace paddle {
namespace platform {
// NOTE(review): ASCENDPlace is declared here but not used in this file —
// confirm whether the declaration is still needed.
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Register the NPU kernels of the c_allreduce_sum op: the kRedSum
// instantiation of CAllReduceOpASCENDKernel for float, int, int8_t and
// float16 element types.
REGISTER_OP_NPU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)

@ -42,6 +42,11 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
#pragma message("tag")
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")

@ -0,0 +1,94 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
// Broadcast kernel for Ascend NPU, built on HCCL's hcom_broadcast.
//
// The broadcast is performed in place on the buffer of input "X"; afterwards
// the data is copied into output "Out" when the two are different tensors,
// and "Out" receives X's dims and LoD.
//
// Uses the attributes "root", "ring_id", "tag" and "use_calc_stream".
//
// Throws PreconditionNotMet when no HCCL communicator has been created yet or
// when Paddle was built without PADDLE_WITH_ASCEND_CL.
template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto x = ctx.Input<framework::LoDTensor>("X");
    auto out = ctx.Output<framework::LoDTensor>("Out");
    auto place = ctx.GetPlace();

    // hcom_broadcast takes a non-const buffer, so the constness of the input
    // data pointer has to be cast away.
    void* ptr = static_cast<void*>(const_cast<T*>(x->data<T>()));
    // Keep the full 64-bit element count instead of truncating it to int.
    int64_t numel = x->numel();
    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());

    // The communicator is needed for the rank below, so it must exist
    // regardless of which stream is used.
    auto comm = paddle::platform::HCCLCommContext::Instance().Get();
    PADDLE_ENFORCE_NOT_NULL(
        comm, platform::errors::PreconditionNotMet(
                  "The HCCL communicator has not been initialized."));

    aclrtStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    int root = ctx.Attr<int>("root");
    int ring_id = ctx.Attr<int>("ring_id");
    std::string group =
        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
    std::string tag = ctx.Attr<std::string>("tag");

    VLOG(3) << "begin hccl broadcast, parameter is: "
            << "root " << root << ", group is " << group << ", tag is "
            << tag;

    // The root and the receivers issue exactly the same call; only the log
    // message differs.
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(
        tag.c_str(), ptr, numel, dtype, static_cast<uint32_t>(root),
        group.c_str(), reinterpret_cast<void*>(stream)));
    if (root == static_cast<int>(comm->rank())) {
      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
              << x->numel();
    } else {
      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
              << framework::product(out->dims());
    }

    // NOTE(review): this copy is issued on the device context taken from the
    // pool, while the broadcast may have been queued on the communicator
    // stream — confirm that the two are ordered correctly.
    if (out != x) {
      framework::TensorCopy(
          *static_cast<const framework::Tensor*>(x), place,
          *platform::DeviceContextPool::Instance().Get(place),
          static_cast<framework::Tensor*>(out));
    }
    out->Resize(x->dims());
    out->set_lod(x->lod());
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Register the NPU kernels of the c_broadcast op: CBroadcastOpASCENDKernel
// instantiated for float, int, int8_t and float16 element types.
REGISTER_OP_NPU_KERNEL(c_broadcast,
ops::CBroadcastOpASCENDKernel<float>,
ops::CBroadcastOpASCENDKernel<int>,
ops::CBroadcastOpASCENDKernel<int8_t>,
ops::CBroadcastOpASCENDKernel<plat::float16>);

@ -0,0 +1,79 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
// Kernel-less operator that creates the HCCL communicator of this process.
//
// Attributes read at run time:
//   rank_table_file - string passed to CreateHCCLComm as the config file.
//   rank_id         - non-negative int, passed as the rank.
//   device_id       - non-negative int, passed as the device id.
//
// Throws InvalidArgument when rank_id or device_id is negative; without the
// check such a value would silently wrap around when converted to uint32_t.
class CCommInitOpNPU : public framework::OperatorBase {
 public:
  CCommInitOpNPU(const std::string& type,
                 const framework::VariableNameMap& inputs,
                 const framework::VariableNameMap& outputs,
                 const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    const std::string rank_table_file = Attr<std::string>("rank_table_file");
    const int rank_id = Attr<int>("rank_id");
    const int device_id = Attr<int>("device_id");

    PADDLE_ENFORCE_GE(rank_id, 0,
                      platform::errors::InvalidArgument(
                          "The rank_id must be non-negative, but received %d.",
                          rank_id));
    PADDLE_ENFORCE_GE(
        device_id, 0,
        platform::errors::InvalidArgument(
            "The device_id must be non-negative, but received %d.",
            device_id));

    VLOG(3) << "begin init hccl, parameter is: "
            << "rank_table_file " << rank_table_file << " rank_id "
            << rank_id << " device_id " << device_id;

    platform::HCCLCommContext::Instance().CreateHCCLComm(
        rank_table_file, static_cast<uint32_t>(rank_id),
        static_cast<uint32_t>(device_id));
  }
};
// Proto maker for the NPU communicator-initialisation operator: declares the
// three attributes it reads (rank_table_file, rank_id, device_id) and its
// documentation string.
class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // Attributes are declared in the same order as before.
    AddAttr<std::string>("rank_table_file",
                         "(string) path to rank_table_file");
    AddAttr<int>("rank_id", "(int) world rank id of the process");
    AddAttr<int>("device_id", "(int) device id of the process/thread");

    AddComment(R"DOC(
CCommInit operator on NPU
Initialize collective communication context within this trainer
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpNPU,
ops::CCommInitOpNPUMaker);
#endif

@ -0,0 +1,76 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
// Kernel-less operator that asks the HCCL communicator context to create a
// named communication group from the attributes "group_name", "nranks" and
// "rank_ids".
class CCreateGroupOpNPU : public framework::OperatorBase {
 public:
  CCreateGroupOpNPU(const std::string& type,
                    const framework::VariableNameMap& inputs,
                    const framework::VariableNameMap& outputs,
                    const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    const std::string name = Attr<std::string>("group_name");
    const uint32_t member_count = static_cast<uint32_t>(Attr<int>("nranks"));
    const std::vector<int> signed_ids = Attr<std::vector<int>>("rank_ids");

    // The attribute stores ints while CreateHCCLGroup expects uint32_t ids.
    const std::vector<uint32_t> member_ids(signed_ids.begin(),
                                           signed_ids.end());

    auto& hccl_ctx = paddle::platform::HCCLCommContext::Instance();
    hccl_ctx.CreateHCCLGroup(name, member_count, member_ids);
  }
};
// Proto maker for the NPU group-creation operator: declares the attributes
// group_name, nranks and rank_ids together with the documentation string.
class CCreateGroupOpNPUMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // Attributes are declared in the same order as before.
    AddAttr<std::string>(
        "group_name", "(string) name of the collective communication group");
    AddAttr<int>("nranks", "(int) number of the group");
    AddAttr<std::vector<int>>(
        "rank_ids", "(list of int) The world rank id of the group members");

    AddComment(R"DOC(
CCreateGroup operator on NPU
Create collective communication group on NPU
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_create_group, ops::CCreateGroupOpNPU,
ops::CCreateGroupOpNPUMaker);
#endif

@ -0,0 +1,192 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_broadcast);
USE_OP(c_allreduce_sum);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_NO_KERNEL_OP(c_create_group);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
// Initialises HCCL for the test process.
//
// Reads RANK_TABLE_FILE, RANK_ID and DEVICE_ID from the environment, runs the
// c_comm_init_hccl op with them, then runs c_create_group to create the group
// HCOM_GROUP_PREFIX + "0" containing ranks {0, 1}.
//
// Records a fatal gtest failure and returns early when one of the environment
// variables is missing, instead of dereferencing a null pointer.
//
// scope: scope the two ops are run in.
// ctx:   device context supplying the place; waited on after each op.
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
  const char* rank_table_env = getenv("RANK_TABLE_FILE");
  const char* rank_id_env = getenv("RANK_ID");
  const char* device_id_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_table_env != nullptr)
      << "environment variable RANK_TABLE_FILE must be set";
  ASSERT_TRUE(rank_id_env != nullptr)
      << "environment variable RANK_ID must be set";
  ASSERT_TRUE(device_id_env != nullptr)
      << "environment variable DEVICE_ID must be set";

  std::string rank_table_file = rank_table_env;
  int rank_id = atoi(rank_id_env);
  int device_id = atoi(device_id_env);
  printf("rank_table_file: %s, rank_id = %d, device_id = %d\n",
         rank_table_file.c_str(), rank_id, device_id);

  // Step 1: create the HCCL communicator.
  f::AttributeMap attrs;
  attrs["rank_table_file"] = rank_table_file;
  attrs["rank_id"] = rank_id;
  attrs["device_id"] = device_id;
  auto comm_init_op =
      f::OpRegistry::CreateOp("c_comm_init_hccl", {}, {}, attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  // Step 2: create the two-rank group used by the collective ops under test.
  f::AttributeMap create_attrs;
  create_attrs["group_name"] = HCOM_GROUP_PREFIX + std::to_string(0);
  create_attrs["nranks"] = 2;
  std::vector<int> rank_ids{0, 1};
  create_attrs["rank_ids"] = rank_ids;
  auto create_group_op =
      f::OpRegistry::CreateOp("c_create_group", {}, {}, create_attrs);
  create_group_op->Run(*scope, place);
  ctx.Wait();
}
// Runs c_broadcast (root 0, ring 0) on a 2x2 float tensor and checks the
// result.
//
// Every rank fills "X" with 1.0 + RANK_ID, so after broadcasting from rank 0
// each element of "Out" is expected to be 1.0.
//
// Records a fatal gtest failure and returns early when RANK_ID is not set.
//
// scope: scope holding the "X" and "Out" variables.
// ctx:   device context used for tensor copies and synchronisation.
void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;

  const char* rank_id_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_id_env != nullptr)
      << "environment variable RANK_ID must be set";
  int rank_id = atoi(rank_id_env);
  std::cout << "rank_id:" << rank_id << std::endl;

  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num = 2;
  std::vector<float> init;
  for (int64_t i = 0; i < num * num; ++i) {
    init.push_back(1.0 + rank_id);
    std::cout << init[0];
  }
  std::cout << std::endl;

  TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num, num});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num, num});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["root"] = 0;
  attrs["ring_id"] = 0;
  auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
                                    {{"Out", {"Out"}}}, attrs);
  op->Run(*scope, place);

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
}
// Runs c_allreduce_sum (ring 0) on a 1x4 float tensor and checks the result.
//
// Every rank contributes a tensor filled with 1.0 and each output element is
// expected to be 2.0, i.e. the test assumes exactly two participating ranks.
//
// Records a fatal gtest failure and returns early when RANK_ID is not set.
//
// scope: scope holding the "X" and "Out" variables.
// ctx:   device context used for tensor copies and synchronisation.
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;

  const char* rank_id_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_id_env != nullptr)
      << "environment variable RANK_ID must be set";
  int rank_id = atoi(rank_id_env);
  std::cout << "rank_id:" << rank_id << std::endl;

  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  std::vector<float> init;
  int num1 = 1;
  int num2 = 4;
  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(1.0);
    std::cout << init[0];
  }
  std::cout << std::endl;

  TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
                                    {{"Out", {"Out"}}}, attrs);
  op->Run(*scope, place);

  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}
// Entry point of the HCCL collective-op test. Selects the NPU from
// FLAGS_selected_npus, initialises HCCL through Prepare() and runs the
// all-reduce check. Fails (rather than crashing) when FLAGS_selected_npus is
// not set, and stops early if Prepare() reported a fatal failure.
TEST(c_broadcast, NPU) {
  f::Scope scope;
  const char* npu_id = getenv("FLAGS_selected_npus");
  ASSERT_TRUE(npu_id != nullptr)
      << "environment variable FLAGS_selected_npus must be set";
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));

  Prepare(&scope, ctx);
  if (::testing::Test::HasFatalFailure()) {
    return;
  }

  // NOTE(review): the broadcast check is disabled; only all-reduce runs.
  // TestHCCLBroadcastOp(&scope, ctx);
  TestHCCLAllReduceOp(&scope, ctx);
}

@ -128,7 +128,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)
cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)

@ -14,20 +14,21 @@
#pragma once
#if defined(PADDLE_WITH_NCCL)
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "boost/variant.hpp"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/hccl.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@ -47,6 +48,8 @@ namespace platform {
//
// The NCCLComm instance is created and reserved in the NCCLCommContext
// singleton with a global user specified group id.
class CUDADeviceContext;
class NCCLComm {
public:
virtual int ring_id() const = 0;
@ -120,8 +123,162 @@ class NCCLCommContext {
NCCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
};
#endif
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring contains HCCL communicators associated to a global
// HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user specified group id.
class NPUDeviceContext;
// Abstract, read-only view of one HCCL communicator. Concrete
// implementations supply the values returned by the accessors below.
class HCCLComm {
public:
// Rank table file associated with this communicator.
virtual std::string rank_table_file() const = 0;
// Rank of this communicator.
virtual uint32_t rank() const = 0;
// Device id of this communicator.
virtual uint32_t device_id() const = 0;
// ACL runtime stream associated with this communicator.
virtual aclrtStream stream() const = 0;
// NPU device context associated with this communicator.
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context. It owns a single HCCLComm
// (see comm_) and exposes it through Get().
class HCCLCommContext {
public:
// Returns the process-wide instance (function-local static).
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
// Creates an HCCL communicator from a config file, a rank and a device id.
// The definition is not in this header.
HCCLComm* CreateHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
// Creates a named HCCL group with the given number of ranks and rank ids.
// The definition is not in this header.
void CreateHCCLGroup(const std::string& group_name, uint32_t nranks, const std::vector<uint32_t>& rank_ids);
// Returns the communicator held by this context. Takes no ring id or place;
// the result is nullptr while comm_ is empty, so callers must check it.
HCCLComm* Get() const {
return comm_.get();
}
private:
std::once_flag once_flag_;
// NOTE(review): named after a map although this class stores a single
// communicator — presumably copied from the NCCL/BKCL contexts.
std::mutex comm_map_mutex_;
// The one communicator owned by this context.
std::unique_ptr<HCCLComm> comm_;
HCCLComm* AssignHCCLComm(const std::string& config_file, uint32_t rank, uint32_t device_id);
HCCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need
// a communication ring contains BKCL communicators associated to a global
// BKCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The BKCLComm instance is created and reserved in the BKCLCommContext
// singleton with a global user specified group id.
// Abstract, read-only view of one BKCL communicator. Concrete
// implementations supply the values returned by the accessors below.
class BKCLComm {
public:
// Id of the communication ring this communicator belongs to.
virtual int ring_id() const = 0;
// Number of ranks of this communicator.
virtual int nranks() const = 0;
// Rank of this communicator.
virtual int rank() const = 0;
// Device id of this communicator.
virtual int device_id() const = 0;
// Underlying BKCL context handle.
virtual BKCLContext_t comm() const = 0;
// XPU stream associated with this communicator.
virtual XPUStream stream() const = 0;
// XPU device context associated with this communicator.
virtual XPUDeviceContext* dev_context() const = 0;
virtual ~BKCLComm() = default;
};
// A singleton BKCL communicator context reserves communication ring ids.
// Communicators are stored in comm_map_, keyed first by ring id and then by
// device id, and are looked up through the Get() overloads.
class BKCLCommContext {
public:
// Returns the process-wide instance (function-local static).
static BKCLCommContext& Instance() {
static BKCLCommContext comm_ctx;
return comm_ctx;
}
// Creates a communicator from a BKCL unique id. The definition is not in
// this header.
BKCLComm* CreateBKCLComm(BKCLUniqueId* bkcl_id, int nranks, int rank,
int dev_id, int ring_id = 0);
// Creates communicators for the given device ids. The definition is not in
// this header.
void CreateAllBKCLComms(const std::vector<int>& dev_ids, int ring_id = 0);
// a latter comm with the same dev_id and the same ring_id
// will override the former
BKCLComm* AssignBKCLComm(BKCLContext_t comm, int nranks, int rank, int dev_id,
int ring_id = 0);
// retrieve a communicator by the ring id in multiprocessing mode
// Fails with InvalidArgument when the ring has no entry, or when the ring
// does not hold exactly one communicator.
BKCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
// Fails with InvalidArgument when the ring has no entry, or when the ring
// has no communicator for dev_id.
BKCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id), 0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id, ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
// The place is read as an XPUPlace and its device field is used as the
// device id.
BKCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, BOOST_GET_CONST(XPUPlace, place).device);
}
private:
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-BKCLComm
std::map<int, std::map<int, std::unique_ptr<BKCLComm>>> comm_map_;
void ReleaseBKCLComms();
BKCLCommContext() = default;
DISABLE_COPY_AND_ASSIGN(BKCLCommContext);
};
#endif
} // namespace platform
} // namespace paddle

@ -0,0 +1,111 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
// Concrete HCCLComm that simply stores the values supplied through its
// setters: the rank table file path, the rank, the device id, and the owning
// NPU device context whose stream is exposed via stream().
class HCCLCommImpl : public HCCLComm {
 public:
  void set_rank_table_file(const std::string& rank_table_file) {
    rank_table_file_ = rank_table_file;
  }
  std::string rank_table_file() const override { return rank_table_file_; }

  void set_rank(uint32_t rank) { rank_ = rank; }
  uint32_t rank() const override { return rank_; }

  void set_device_id(uint32_t device_id) { device_id_ = device_id; }
  uint32_t device_id() const override { return device_id_; }

  // Requires set_dev_ctx() to have been called with a non-null context;
  // dev_ctx_ is dereferenced unconditionally.
  aclrtStream stream() const override { return dev_ctx_->stream(); }

  // Takes ownership of the device context.
  void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
    dev_ctx_ = std::move(dev_ctx);
  }
  NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); }

 private:
  std::string rank_table_file_;
  // Zero-initialised so the getters never return an indeterminate value when
  // they are called before the corresponding setter.
  uint32_t rank_{0};
  uint32_t device_id_{0};
  std::unique_ptr<NPUDeviceContext> dev_ctx_;
};
// Creates the HCCL communicator for this process.
//
// Steps: validate the rank table path, build and register the communicator
// wrapper via AssignHCCLComm, initialise HCOM with the rank table and the
// decimal string form of `rank`, then bind the wrapper's stream.
//
// Returns the wrapper registered by AssignHCCLComm (owned by the context).
// Raises InvalidArgument when `rank_table_file` is empty and External when
// hcom_init does not report HCCL_SUCCESS.
HCCLComm* HCCLCommContext::CreateHCCLComm(const std::string& rank_table_file,
                                          uint32_t rank, uint32_t device_id) {
  // `rank` and `device_id` are unsigned, so only the path needs validating.
  PADDLE_ENFORCE_EQ(rank_table_file.empty(), false,
                    platform::errors::InvalidArgument(
                        "The rank table file should not be empty."));

  auto* comm_wrapper = AssignHCCLComm(rank_table_file, rank, device_id);

  const std::string rank_str = std::to_string(rank);
  const auto init_ret = platform::dynload::hcom_init(rank_table_file.c_str(),
                                                     rank_str.c_str());
  PADDLE_ENFORCE_EQ(
      init_ret, HCCL_SUCCESS,
      platform::errors::External(
          "hcom_init failed for rank %d with rank table file %s, "
          "error code is %d.",
          rank, rank_table_file, static_cast<int>(init_ret)));

  // NOTE(review): the stream handle is passed for BOTH arguments, exactly as
  // before. hcom_bind_model presumably expects (model, stream) -- confirm
  // against hcom.h. Because of that uncertainty a failure here is only
  // reported, not made fatal, to avoid breaking flows that tolerated it.
  const auto bind_ret = platform::dynload::hcom_bind_model(
      comm_wrapper->stream(), comm_wrapper->stream());
  if (bind_ret != HCCL_SUCCESS) {
    LOG(WARNING) << "hcom_bind_model failed for rank " << rank
                 << ", error code is " << static_cast<int>(bind_ret);
  }

  VLOG(1) << "hccl communicator of rank " << rank << " has been created";
  return comm_wrapper;
}
// Builds an HCCLCommImpl for `device_id`, fills in its rank information and
// device context, and stores it in comm_ (replacing whatever comm_ held).
// Returns a non-owning pointer to the stored communicator.
HCCLComm* HCCLCommContext::AssignHCCLComm(const std::string& rank_table_file,
                                          uint32_t rank, uint32_t device_id) {
  // The device context is created before the communicator object, as the
  // stream it provides is logged right away.
  auto npu_ctx = std::unique_ptr<NPUDeviceContext>(
      new NPUDeviceContext(NPUPlace(device_id)));

  VLOG(3) << "device_id" << device_id;
  VLOG(3) << "dev_ctx->stream()" << npu_ctx->stream();

  auto* impl = new HCCLCommImpl;
  impl->set_rank_table_file(rank_table_file);
  impl->set_rank(rank);
  impl->set_device_id(device_id);
  impl->set_dev_ctx(std::move(npu_ctx));

  // comm_ takes ownership; the raw pointer stays valid while comm_ holds it.
  comm_.reset(impl);
  return impl;
}
void HCCLCommContext::CreateHCCLGroup(const std::string& group_name, uint32_t nranks,
const std::vector<uint32_t>& rank_ids) {
/*
PADDLE_ENFORCE_NOT_NULL(group_name,
platform::errors::InvalidArgument(
"The group name should not be null."));
PADDLE_ENFORCE_GT(nranks, 0,
platform::errors::InvalidArgument(
"Expected nranks > 0. But received nranks is %d.", nranks));
PADDLE_ENFORCE_NOT_NULL(rank_ids,
platform::errors::InvalidArgument(
"The rank ids should not be null."));
*/
platform::dynload::hcom_create_group(group_name.c_str(), nranks, (unsigned int*)rank_ids.data());
VLOG(1) << "hccl group with name " << group_name << " has been created";
}
} // namespace platform
} // namespace paddle
#endif

@ -9,7 +9,7 @@ endif()
# There is no macOS version of NCCL.
# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
if (NOT APPLE AND NOT WIN32)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
@ -32,6 +32,8 @@ endif(CUPTI_FOUND)
if(WITH_ROCM_PLATFORM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)

@ -0,0 +1,127 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file base.h
* @brief HCOM data type definition
*
*/
#ifndef HCCL_BASE_H_
#define HCCL_BASE_H_
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*
 * Short integer aliases used by the HCOM declarations. The names suggest
 * 8/16/32/64-bit widths; the actual widths follow the platform's
 * char/short/int/long long.
 */
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;
typedef signed long long s64;
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;
/**
* @brief HCOM functions return value definition
*
* HCCL_SUCCESS (0) is the only success code; every other enumerator names a
* failure category.
*/
typedef enum tagHcclResult {
HCCL_SUCCESS = 0, /**< success */
HCCL_E_PARA = 1, /**< parameter error */
HCCL_E_PTR = 2, /**< empty pointer */
HCCL_E_MEMORY = 3, /**< memory error */
HCCL_E_INTERNAL = 4, /**< internal error */
HCCL_E_NOT_SUPPORT = 5, /**< not support feature */
HCCL_E_NOT_FOUND = 6, /**< not found specific resource */
HCCL_E_UNAVAIL = 7, /**< resource unavailable */
HCCL_E_SYSCALL = 8, /**< call system interface error */
HCCL_E_TIMEOUT = 9, /**< timeout */
HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */
HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */
HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */
HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */
HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */
HCCL_E_RUNTIME = 15, /**< call runtime api fail */
HCCL_E_DRV = 16, /**< call driver api fail */
HCCL_E_PROFILING = 17, /**< call profiling api fail */
HCCL_E_CCE = 18, /**< call cce api fail */
HCCL_E_NETWORK = 19, /**< call network api fail */
HCCL_E_RESERVED /**< reserved; implicitly 20, one past the last error code */
} hcclResult_t;
/* Opaque handle to a communicator (an untyped pointer). */
typedef void *hcclComm_t;
/**
* @brief HCCL reduction operation
*/
typedef enum tagHcclRedOp {
HCCL_REP_OP_SUM = 0, /**< sum */
HCCL_REP_OP_PROD = 1, /**< prod */
HCCL_REP_OP_MAX = 2, /**< max */
HCCL_REP_OP_MIN = 3, /**< min */
HCCL_REP_OP_RESERVED /**< reserved; implicitly 4 */
} hcclRedOp_t;
/**
* @brief HCCL data type
*/
typedef enum tagHcclDataType {
HCCL_DATA_TYPE_INT8 = 0, /**< int8 */
HCCL_DATA_TYPE_INT16 = 1, /**< int16 */
HCCL_DATA_TYPE_INT32 = 2, /**< int32 */
HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */
HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */
HCCL_DATA_TYPE_INT64 = 5, /**< int64 */
HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
HCCL_DATA_TYPE_RESERVED /**< reserved; implicitly 7 */
} hcclDataType_t;
// NOTE(review): a namespace-scope const object defined in a header has
// internal linkage in C++ but external linkage in C; confirm this header is
// only ever compiled as C++ or included from a single C translation unit.
const u32 HCCL_MAX_SEGMENT_NUM = 8; // The max number of gradient segments.
/**
* @brief the feature of the model
*
* The pointer members are plain raw pointers; this struct does not express
* who owns the memory they refer to.
*/
struct model_feature {
const char *model_name; /**< The model name */
u32 gradient_num; /**< The number of gradients */
float *gradient_size; /**< The size of each gradient */
float *gradient_time; /**< The BP computation time of each gradient */
};
/**
* @brief Whether gradient splitting is forced, and by which criterion.
*
* Enumerators take the implicit values 0, 1, 2 in declaration order.
*/
enum GradSplitForceMode {
FORCE_NONE, /**< no force */
FORCE_SIZE, /**< force split gradient by size */
FORCE_RESERVED /**< reserved */
};
/**
* @brief stream handle (opaque, untyped pointer).
*/
typedef void *rtStream_t;
/**
* @brief model handle (opaque, untyped pointer).
*
* Both handle types alias void*, so the compiler cannot tell a stream handle
* from a model handle when one is passed in place of the other.
*/
typedef void *rtModel_t;
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCCL_BASE_H_

@ -21,6 +21,10 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
@ -36,6 +40,11 @@ DEFINE_string(nccl_dir, "",
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
// Directory searched for the HCCL shared library; empty means the default
// dynamic-loader search is used.
DEFINE_string(hccl_dir, "",
              "Specify path for loading hccl library, such as libhccl.so. "
              "For instance, "
              "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. "
              "If default, dlopen will search hccl from LD_LIBRARY_PATH");

DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
@ -383,6 +392,26 @@ void* GetNCCLDsoHandle() {
warning_msg);
#endif
}
void* GetHCCLDsoHandle() {
std::string warning_msg(
"You may need to install 'hccl2' from Huawei official website: "
"before install PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)

@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle();
// Each getter returns an untyped handle to the corresponding dynamically
// loaded library.
void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
// Handle for the HCCL library used by the dynload/hccl.h wrappers.
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
// Looks a library up by the name supplied by the caller.
void* GetOpDsoHandle(const std::string& dso_name);

@ -0,0 +1,38 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hccl.h"
namespace paddle {
namespace platform {
namespace dynload {
// Definitions matching the `extern` declarations in dynload/hccl.h.
// hccl_dso_flag guards the one-time library load performed by the wrappers;
// hccl_dso_handle receives the loaded library handle (it has static storage
// duration, so it starts out null).
std::once_flag hccl_dso_flag;
void *hccl_dso_handle;
// Defines one wrapper object per routine, named like the routine itself.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
// NOTE(review): HCCL_VERSION_CODE is not defined anywhere in the visible
// sources; an undefined identifier evaluates to 0 in #if, so the two groups
// below are skipped unless it is defined elsewhere. They must stay in sync
// with the identically guarded declarations in hccl.h.
#if HCCL_VERSION_CODE >= 2212
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle

@ -0,0 +1,84 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// #include <hccl/hccl.h>
// #include <hccl/hccl_types.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/hcom.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hccl_dso_flag;
extern void* hccl_dso_handle;
// Declares, for the routine `__name`, a functor type DynLoad__<name> and an
// extern object of that type which shadows the routine's name inside this
// namespace. Calling the object:
//   1. loads the HCCL library once (std::call_once on hccl_dso_flag) via
//      GetHCCLDsoHandle() and stores the handle in hccl_dso_handle;
//   2. resolves the symbol with dlsym once per routine (function-local
//      static), using the routine name as the symbol string;
//   3. casts the symbol to the routine's real function-pointer type and
//      forwards the arguments (taken by value).
// NOTE(review): the dlsym result is not checked; if the symbol is missing the
// call goes through a null function pointer. Consider enforcing non-null.
#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using HCCL_func = decltype(&::__name); \
std::call_once(hccl_dso_flag, []() { \
hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(hccl_dso_handle, #__name); \
return reinterpret_cast<HCCL_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// X-macro listing every hcom_* routine that gets a dynamic-load wrapper.
// `__macro` is applied once per routine; each entry already ends with ';',
// so the invocation below needs no trailing semicolon. dynload/hccl.cc
// applies the same list to define the wrapper objects.
#define HCCL_RAND_ROUTINE_EACH(__macro) \
__macro(hcom_init); \
__macro(hcom_destroy); \
__macro(hcom_bind_model); \
__macro(hcom_unbind_model); \
__macro(hcom_send); \
__macro(hcom_receive); \
__macro(hcom_broadcast); \
__macro(hcom_all_gather); \
__macro(hcom_all_reduce); \
__macro(hcom_reduce_scatter); \
__macro(hcom_create_group); \
__macro(hcom_destroy_group); \
__macro(hcom_get_rank_id); \
__macro(hcom_get_local_rank_id); \
__macro(hcom_get_local_rank_size); \
__macro(hcom_get_split_strategy); \
__macro(hcom_set_split_strategy_by_size); \
__macro(hcom_set_split_strategy_by_index); \
__macro(hcom_get_group_rank_from_world_rank); \
__macro(hcom_get_world_rank_from_group_rank);
// Declare the wrapper type and extern object for every routine listed above.
HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
// Version-gated wrappers.
// NOTE(review): HCCL_VERSION_CODE is not defined in the visible sources; an
// undefined identifier evaluates to 0 in #if, so both groups are skipped
// unless it is defined elsewhere. HCCLBroadcast/HCCLSend/HCCLRecv are also
// not among the hcom_* routines listed above -- verify they are declared
// before enabling these blocks.
#if HCCL_VERSION_CODE >= 2212
#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast);
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(HCCLSend); \
__macro(HCCLRecv);
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save