[NPU] add npu kernel for communication op (#31437)
* add allreduce and broadcast without test * add c_broadcast_test case * build c_comm_init and c_create_group operators * make the whole thing compile * add broadcast and init op test case but run failed * make unit test compile * fix broadcast test bug and change into hcom for ccl * change c_comm_init and c_create_group ops accordingly * make tests compile * transfer code to 27 * compiled successfully in 28, but run failed * test broadcast in 28, but failed * make hcom primitives work * change hccl data type for base.h * fix broadcast bug * make attributes work * fix group name bug * add allreduce but test failed * allreduce bug for qiuliang * allreduce finished * add allgather and reducescatter * merge all op code * add allgather test * finish run all ccl op test exclude send/recv * all all op and test exclude send/recv * send_v2_npu.cc recv_v2_npiu.cc compiled * fix ccl core dump bug and test allgather, reducescatter, broadcast op * fix allreduce bug just for test * hcom send&recv test pass, without hcom_destroy * for qiuliang test * Ascend Send&Recv Test Pass * all op (ex send/recv) ok * fix bug * merge all ccl op * style merge to PaddlePaddle * merge style * new merge style * merge style 2 * insert an empty at the end * disable ctest for hcom to pass ci Co-authored-by: void-main <voidmain1313113@gmail.com> Co-authored-by: f2hkop <f2huestc@outlook.com>revert-31562-mean
parent
388c69f27d
commit
15823bb0df
@ -0,0 +1,86 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_allgather_op.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
auto in = ctx.Input<framework::Tensor>("X");
|
||||
auto out = ctx.Output<framework::Tensor>("Out");
|
||||
hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
|
||||
|
||||
int ring_id = ctx.Attr<int>("ring_id");
|
||||
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
|
||||
std::string tag = ctx.Attr<std::string>("tag");
|
||||
auto place = ctx.GetPlace();
|
||||
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
|
||||
int nranks = comm->nranks();
|
||||
|
||||
framework::DDim out_dims = in->dims();
|
||||
out_dims[0] *= nranks;
|
||||
out->mutable_data<T>(out_dims, place);
|
||||
|
||||
int64_t send_numel = in->numel();
|
||||
void *send_buff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
|
||||
void *recv_buff = reinterpret_cast<void*>(out->data<T>());
|
||||
|
||||
aclrtStream stream = nullptr;
|
||||
if (ctx.Attr<bool>("use_calc_stream")) {
|
||||
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
|
||||
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
|
||||
} else {
|
||||
stream = comm->stream();
|
||||
}
|
||||
|
||||
VLOG(3) << "begin hccl allgather, parameter is: "
|
||||
<< ", group is " << group
|
||||
<< ", ring_id is " << ring_id
|
||||
<< ", nranks is " << nranks
|
||||
<< ", tag is " << tag;
|
||||
|
||||
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
|
||||
tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
|
||||
group.c_str(), (void*)stream));
|
||||
|
||||
#else
|
||||
PADDLE_THROW(platform::errors::PreconditionNotMet(
|
||||
"PaddlePaddle should compile with NPU."));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
namespace plat = paddle::platform;
|
||||
|
||||
REGISTER_OP_NPU_KERNEL(c_allgather,
|
||||
ops::CAllGatherOpASCENDKernel<int8_t>,
|
||||
ops::CAllGatherOpASCENDKernel<int>,
|
||||
ops::CAllGatherOpASCENDKernel<float>,
|
||||
ops::CAllGatherOpASCENDKernel<plat::float16>);
|
@ -0,0 +1,149 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allgather_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(c_allgather);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(c_allgather, NPU);
|
||||
|
||||
DECLARE_string(selected_npus);
|
||||
|
||||
template<typename T>
|
||||
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
|
||||
std::string debugstring = "";
|
||||
for (auto ele : data) {
|
||||
debugstring += std::to_string(ele) + std::string(",");
|
||||
}
|
||||
VLOG(2) << preStr << ":" << std::endl <<debugstring;
|
||||
}
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
|
||||
VLOG(2) << "rank_id = " << rank_id
|
||||
<< "; device_id = " << device_id
|
||||
<< "; rank_id = " << rank_id
|
||||
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
|
||||
|
||||
std::vector<int> rank_ids{0, 1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op =
|
||||
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
|
||||
// init
|
||||
auto x = scope->Var("X");
|
||||
auto tensor_x = x->GetMutable<f::LoDTensor>();
|
||||
|
||||
std::vector<float> init;
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
|
||||
int num1 = 1;
|
||||
int num2 = 4;
|
||||
|
||||
for (int64_t i = 0; i < num1 * num2; ++i) {
|
||||
init.push_back(1.0 + rank_id);
|
||||
}
|
||||
PrintDebugInfo("input data", init);
|
||||
|
||||
TensorFromVector(init, ctx, tensor_x);
|
||||
tensor_x->Resize({num1, num2});
|
||||
ctx.Wait();
|
||||
|
||||
auto place = ctx.GetPlace();
|
||||
auto out = scope->Var("Out");
|
||||
auto tensor_out = out->GetMutable<f::LoDTensor>();
|
||||
tensor_out->Resize({num1, num2});
|
||||
tensor_out->mutable_data<float>(place); // allocate
|
||||
ctx.Wait();
|
||||
|
||||
// run
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("tagx");
|
||||
attrs["ring_id"]=0;
|
||||
attrs["nranks"]=2;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
|
||||
{{"Out", {"Out"}}}, attrs);
|
||||
|
||||
op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
|
||||
std::vector<float> out_vec;
|
||||
TensorToVector(*tensor_out, ctx, &out_vec);
|
||||
ctx.Wait();
|
||||
|
||||
PrintDebugInfo("output data", out_vec);
|
||||
|
||||
EXPECT_EQ(out_vec.size(), init.size() * 2);
|
||||
for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
|
||||
EXPECT_EQ(out_vec[i], 1.0);
|
||||
}
|
||||
for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
|
||||
EXPECT_EQ(out_vec[i], 2.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST(c_allgather, NPU) {
|
||||
f::Scope scope;
|
||||
|
||||
// only support one device, if more than one device, use first default
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
|
||||
|
||||
Prepare(&scope, ctx);
|
||||
TestHCCLAllGatherOp(&scope, ctx);
|
||||
}
|
@ -0,0 +1,144 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allgather_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(c_allreduce_max);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
|
||||
|
||||
DECLARE_string(selected_npus);
|
||||
|
||||
template<typename T>
|
||||
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
|
||||
std::string debugstring = "";
|
||||
for (auto ele : data) {
|
||||
debugstring += std::to_string(ele) + std::string(",");
|
||||
}
|
||||
VLOG(2) << preStr << ":" << std::endl <<debugstring;
|
||||
}
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
|
||||
VLOG(2) << "rank_id = " << rank_id
|
||||
<< "; device_id = " << device_id
|
||||
<< "; rank_id = " << rank_id
|
||||
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
|
||||
|
||||
std::vector<int> rank_ids{0, 1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op =
|
||||
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
|
||||
// init
|
||||
auto x = scope->Var("X");
|
||||
auto tensor_x = x->GetMutable<f::LoDTensor>();
|
||||
|
||||
std::vector<float> init;
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
|
||||
int num1 = 100;
|
||||
int num2 = 100;
|
||||
|
||||
for (int64_t i = 0; i < num1 * num2; ++i) {
|
||||
init.push_back(1.0 + rank_id * 3);
|
||||
}
|
||||
PrintDebugInfo("input data", init);
|
||||
|
||||
TensorFromVector(init, ctx, tensor_x);
|
||||
tensor_x->Resize({num1, num2});
|
||||
ctx.Wait();
|
||||
|
||||
auto place = ctx.GetPlace();
|
||||
auto out = scope->Var("Out");
|
||||
auto tensor_out = out->GetMutable<f::LoDTensor>();
|
||||
tensor_out->Resize({num1, num2});
|
||||
tensor_out->mutable_data<float>(place); // allocate
|
||||
ctx.Wait();
|
||||
|
||||
// run
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("tagx");
|
||||
attrs["ring_id"]=0;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
|
||||
{{"Out", {"Out"}}}, attrs);
|
||||
|
||||
op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
|
||||
std::vector<float> out_vec;
|
||||
TensorToVector(*tensor_out, ctx, &out_vec);
|
||||
ctx.Wait();
|
||||
|
||||
PrintDebugInfo("output data", out_vec);
|
||||
|
||||
EXPECT_EQ(out_vec.size(), init.size());
|
||||
for (uint32_t i = 0; i < out_vec.size(); i++) {
|
||||
EXPECT_EQ(out_vec[i], 4.0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(c_allreduce_max, NPU) {
|
||||
f::Scope scope;
|
||||
|
||||
// only support one device, if more than one device, use first default
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
|
||||
|
||||
Prepare(&scope, ctx);
|
||||
TestHCCLAllReduceOp(&scope, ctx);
|
||||
}
|
@ -0,0 +1,143 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(c_allreduce_sum);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
|
||||
|
||||
DECLARE_string(selected_npus);
|
||||
|
||||
template<typename T>
|
||||
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
|
||||
std::string debugstring = "";
|
||||
for (auto ele : data) {
|
||||
debugstring += std::to_string(ele) + std::string(",");
|
||||
}
|
||||
VLOG(2) << preStr << ":" << std::endl <<debugstring;
|
||||
}
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
|
||||
VLOG(2) << "rank_id = " << rank_id
|
||||
<< "; device_id = " << device_id
|
||||
<< "; rank_id = " << rank_id
|
||||
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
|
||||
|
||||
std::vector<int> rank_ids{0, 1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op =
|
||||
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
|
||||
// init
|
||||
auto x = scope->Var("X");
|
||||
auto tensor_x = x->GetMutable<f::LoDTensor>();
|
||||
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int num1 = 3;
|
||||
int num2 = 128;
|
||||
|
||||
std::vector<float> init;
|
||||
for (int64_t i = 0; i < num1 * num2; ++i) {
|
||||
init.push_back(1.0 + rank_id);
|
||||
}
|
||||
PrintDebugInfo("input data", init);
|
||||
|
||||
auto place = ctx.GetPlace();
|
||||
|
||||
TensorFromVector(init, ctx, tensor_x);
|
||||
tensor_x->Resize({num1, num2});
|
||||
ctx.Wait();
|
||||
|
||||
auto out = scope->Var("Out");
|
||||
auto tensor_out = out->GetMutable<f::LoDTensor>();
|
||||
tensor_out->Resize({num1, num2});
|
||||
tensor_out->mutable_data<float>(place); // allocate
|
||||
ctx.Wait();
|
||||
|
||||
// run
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("tagx");
|
||||
attrs["ring_id"]=0;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("c_allreduce_sum",
|
||||
{{"X", {"X"}}},
|
||||
{{"Out", {"Out"}}},
|
||||
attrs);
|
||||
|
||||
op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
|
||||
std::vector<float> out_vec;
|
||||
TensorToVector(*tensor_out, ctx, &out_vec);
|
||||
ctx.Wait();
|
||||
|
||||
PrintDebugInfo("output data", out_vec);
|
||||
|
||||
EXPECT_EQ(out_vec.size(), init.size());
|
||||
for (uint32_t i = 0; i < out_vec.size(); i++) {
|
||||
EXPECT_EQ(out_vec[i], 3.0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(c_allreduce_sum, NPU) {
|
||||
f::Scope scope;
|
||||
|
||||
// only support one device, if more than one device, use first default
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
|
||||
|
||||
Prepare(&scope, ctx);
|
||||
TestHCCLAllReduceOp(&scope, ctx);
|
||||
}
|
@ -0,0 +1,89 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
auto in = ctx.Input<framework::Tensor>("X");
|
||||
auto out = ctx.Output<framework::Tensor>("Out");
|
||||
|
||||
int ring_id = ctx.Attr<int>("ring_id");
|
||||
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
|
||||
std::string tag = ctx.Attr<std::string>("tag");
|
||||
auto place = ctx.GetPlace();
|
||||
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
|
||||
int nranks = comm->nranks();
|
||||
|
||||
auto out_dims = in->dims();
|
||||
PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
|
||||
platform::errors::InvalidArgument(
|
||||
"The input tensor X's "
|
||||
"dim[0] (%d) should be divisible by nranks(%d)",
|
||||
out_dims[0], nranks));
|
||||
|
||||
out_dims[0] = out_dims[0] / nranks;
|
||||
out->mutable_data<T>(out_dims, place);
|
||||
|
||||
int64_t recv_numel = in->numel() / nranks;
|
||||
|
||||
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
|
||||
void* outputPtr = reinterpret_cast<void*>(out->data<T>());
|
||||
hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
|
||||
|
||||
aclrtStream stream = nullptr;
|
||||
if (ctx.Attr<bool>("use_calc_stream")) {
|
||||
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
|
||||
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
|
||||
} else {
|
||||
stream = comm->stream();
|
||||
}
|
||||
VLOG(3) << "begin hccl reduce scatter, parameter is: "
|
||||
<< "recv_numel: " << recv_numel
|
||||
<< "dtype: " << dtype
|
||||
<< "hccl_red_type: " << HCCL_REP_OP_SUM
|
||||
<< ", group is: " << group
|
||||
<< ", tag is " << tag;
|
||||
|
||||
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter(
|
||||
tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype, HCCL_REP_OP_SUM, group.c_str(), (void*)stream));
|
||||
#else
|
||||
PADDLE_THROW(platform::errors::PreconditionNotMet(
|
||||
"PaddlePaddle should compile with NPU."));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
namespace plat = paddle::platform;
|
||||
|
||||
REGISTER_OP_NPU_KERNEL(c_reducescatter,
|
||||
ops::CReduceScatterOpAscendKernel<int8_t>,
|
||||
ops::CReduceScatterOpAscendKernel<int>,
|
||||
ops::CReduceScatterOpAscendKernel<float>,
|
||||
ops::CReduceScatterOpAscendKernel<plat::float16>);
|
@ -0,0 +1,144 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_allgather_op.h"
|
||||
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(c_reducescatter);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
|
||||
|
||||
DECLARE_string(selected_npus);
|
||||
|
||||
template<typename T>
|
||||
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
|
||||
std::string debugstring = "";
|
||||
for (auto ele : data) {
|
||||
debugstring += std::to_string(ele) + std::string(",");
|
||||
}
|
||||
VLOG(2) << preStr << ":" << std::endl <<debugstring;
|
||||
}
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
|
||||
VLOG(2) << "rank_id = " << rank_id
|
||||
<< "; device_id = " << device_id
|
||||
<< "; rank_id = " << rank_id
|
||||
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
|
||||
|
||||
std::vector<int> rank_ids{0, 1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op =
|
||||
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
|
||||
// init
|
||||
auto x = scope->Var("X");
|
||||
auto tensor_x = x->GetMutable<f::LoDTensor>();
|
||||
|
||||
std::vector<float> init;
|
||||
int num1 = 4;
|
||||
int num2 = 1;
|
||||
|
||||
for (int64_t i = 0; i < num1 * num2; ++i) {
|
||||
init.push_back(1.0);
|
||||
}
|
||||
PrintDebugInfo("input data", init);
|
||||
|
||||
TensorFromVector(init, ctx, tensor_x);
|
||||
tensor_x->Resize({num1, num2});
|
||||
|
||||
ctx.Wait();
|
||||
|
||||
auto place = ctx.GetPlace();
|
||||
auto out = scope->Var("Out");
|
||||
auto tensor_out = out->GetMutable<f::LoDTensor>();
|
||||
tensor_out->Resize({num1, num2});
|
||||
tensor_out->mutable_data<float>(place); // allocate
|
||||
|
||||
ctx.Wait();
|
||||
|
||||
// run
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("tagx");
|
||||
attrs["ring_id"]=0;
|
||||
attrs["nranks"]=2;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}},
|
||||
{{"Out", {"Out"}}}, attrs);
|
||||
|
||||
op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
|
||||
std::vector<float> out_vec;
|
||||
TensorToVector(*tensor_out, ctx, &out_vec);
|
||||
ctx.Wait();
|
||||
|
||||
PrintDebugInfo("output data", out_vec);
|
||||
EXPECT_EQ(out_vec.size(), init.size() / 2);
|
||||
for (uint32_t i = 0; i < out_vec.size(); i++) {
|
||||
EXPECT_EQ(out_vec[i], 2.0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(c_reducescatter, NPU) {
|
||||
f::Scope scope;
|
||||
|
||||
// only support one device, if more than one device, use first default
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
|
||||
|
||||
Prepare(&scope, ctx);
|
||||
TestHCCLReduceScatterOp(&scope, ctx);
|
||||
}
|
@ -0,0 +1,73 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/collective/recv_v2_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
auto out = ctx.Output<framework::LoDTensor>("Out");
|
||||
int numel = out->numel();
|
||||
hcclDataType_t dtype = platform::ToHCCLDataType(out->type());
|
||||
|
||||
int ring_id = ctx.Attr<int>("ring_id");
|
||||
auto place = ctx.GetPlace();
|
||||
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
|
||||
|
||||
aclrtStream stream = nullptr;
|
||||
if (ctx.Attr<bool>("use_calc_stream")) {
|
||||
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
|
||||
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
|
||||
} else {
|
||||
stream = comm->stream();
|
||||
}
|
||||
std::string tag = ctx.Attr<std::string>("tag");
|
||||
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
|
||||
int srcRank = ctx.Attr<int>("peer");
|
||||
int srTag = ctx.Attr<int>("srTag");
|
||||
VLOG(3) << "recv_v2_npu attr get";
|
||||
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive(
|
||||
tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), (u64)numel, dtype, srcRank,
|
||||
srTag, group.c_str(), stream));
|
||||
VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. receiving ";
|
||||
out->Resize(out->dims());
|
||||
out->set_lod(out->lod());
|
||||
#else
|
||||
PADDLE_THROW(platform::errors::PreconditionNotMet(
|
||||
"PaddlePaddle should compile with NPU."));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
namespace plat = paddle::platform;
|
||||
|
||||
REGISTER_OP_NPU_KERNEL(recv_v2,
|
||||
ops::CRecvOpASCENDKernel<int>,
|
||||
ops::CRecvOpASCENDKernel<int8_t>,
|
||||
ops::CRecvOpASCENDKernel<float>,
|
||||
ops::CRecvOpASCENDKernel<plat::float16>);
|
@ -0,0 +1,122 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/recv_v2_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(recv_v2);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(recv_v2, NPU);
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
std::string rank_table_file = getenv("RANK_TABLE_FILE");
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
int src_rank = atoi(getenv("SRC_RANK"));
|
||||
int dest_rank = atoi(getenv("DEST_RANK"));
|
||||
VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
|
||||
|
||||
std::vector<int> rank_ids = {0,1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
VLOG(3) << "CreateOp c_comm_init_hcom";
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
|
||||
|
||||
int num = atoi(getenv("DATA_SIZE"));
|
||||
EXPECT_GT(num, 0);
|
||||
EXPECT_LT(num, 1 << 15);
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
VLOG(3) << "rank_id:" << rank_id<<std::endl;
|
||||
|
||||
ctx.Wait();
|
||||
auto place = ctx.GetPlace();
|
||||
auto out = scope->Var("Out");
|
||||
auto tensor_out = out->GetMutable<f::LoDTensor>();
|
||||
tensor_out->Resize({num, num});
|
||||
tensor_out->mutable_data<float>(place); // allocate
|
||||
|
||||
ctx.Wait();
|
||||
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("srtest");
|
||||
attrs["peer"]=atoi(getenv("SRC_RANK"));
|
||||
attrs["ring_id"]=0;
|
||||
attrs["srTag"]=0;
|
||||
std::vector<int> out_shape;
|
||||
out_shape.push_back(num);
|
||||
out_shape.push_back(num);
|
||||
attrs["out_shape"]=out_shape;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
|
||||
VLOG(3) << "CreateOp recv_v2";
|
||||
|
||||
op->Run(*scope, place);
|
||||
VLOG(3) << "Run op recv_v2";
|
||||
std::vector<float> out_vec;
|
||||
TensorToVector(*tensor_out, ctx, &out_vec);
|
||||
ctx.Wait();
|
||||
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
|
||||
EXPECT_EQ(out_vec == init, true);
|
||||
}
|
||||
|
||||
|
||||
TEST(recv_v2, NPU){
|
||||
f::Scope scope;
|
||||
char * npu_id=getenv("FLAGS_selected_npus");
|
||||
VLOG(3) << "Select npu:" << npu_id;
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
|
||||
VLOG(3) << "Place over";
|
||||
Prepare(&scope, ctx);
|
||||
VLOG(3) << "Prepare over";
|
||||
TestHcomRecvOp(&scope, ctx);
|
||||
VLOG(3) << "Test over";
|
||||
}
|
@ -0,0 +1,74 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/collective/send_v2_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
class CSendOpASCENDKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
auto x = ctx.Input<framework::LoDTensor>("X");
|
||||
int numel = x->numel();
|
||||
hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
|
||||
|
||||
auto place = ctx.GetPlace();
|
||||
int ring_id = ctx.Attr<int>("ring_id");
|
||||
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
|
||||
|
||||
aclrtStream stream = nullptr;
|
||||
if (ctx.Attr<bool>("use_calc_stream")) {
|
||||
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
|
||||
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
|
||||
} else {
|
||||
stream = comm->stream();
|
||||
}
|
||||
std::string tag = ctx.Attr<std::string>("tag");
|
||||
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
|
||||
int destRank = ctx.Attr<int>("peer");
|
||||
int srTag = ctx.Attr<int>("srTag");
|
||||
|
||||
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send(
|
||||
tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), (u64)numel, dtype, destRank,
|
||||
srTag, group.c_str(), stream));
|
||||
|
||||
VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent "
|
||||
<< x->numel();
|
||||
|
||||
#else
|
||||
PADDLE_THROW(platform::errors::PreconditionNotMet(
|
||||
"PaddlePaddle should compile with NPU."));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
namespace plat = paddle::platform;
|
||||
|
||||
REGISTER_OP_NPU_KERNEL(send_v2,
|
||||
ops::CSendOpASCENDKernel<int>,
|
||||
ops::CSendOpASCENDKernel<int8_t>,
|
||||
ops::CSendOpASCENDKernel<float>,
|
||||
ops::CSendOpASCENDKernel<plat::float16>);
|
@ -0,0 +1,109 @@
|
||||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#include "paddle/fluid/framework/operator.h"
|
||||
#include "paddle/fluid/operators/dropout_op.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/program_desc.h"
|
||||
#include "paddle/fluid/operators/math/math_function.h"
|
||||
|
||||
#include "paddle/fluid/operators/collective/send_v2_op.h"
|
||||
|
||||
#if defined(PADDLE_WITH_ASCEND_CL)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/hccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace f = paddle::framework;
|
||||
namespace p = paddle::platform;
|
||||
namespace m = paddle::operators::math;
|
||||
|
||||
USE_OP(send_v2);
|
||||
USE_NO_KERNEL_OP(c_comm_init_hcom);
|
||||
USE_OP_DEVICE_KERNEL(send_v2, NPU);
|
||||
|
||||
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
|
||||
std::string rank_table_file = getenv("RANK_TABLE_FILE");
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
int device_id = atoi(getenv("DEVICE_ID"));
|
||||
int src_rank = atoi(getenv("SRC_RANK"));
|
||||
int dest_rank = atoi(getenv("DEST_RANK"));
|
||||
VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
|
||||
|
||||
std::vector<int> rank_ids = {0, 1};
|
||||
f::AttributeMap comm_init_attrs;
|
||||
comm_init_attrs["ring_id"] = 0;
|
||||
comm_init_attrs["nranks"] = 2;
|
||||
comm_init_attrs["rank"] = rank_id;
|
||||
comm_init_attrs["device_id"] = device_id;
|
||||
comm_init_attrs["rank_ids"] = rank_ids;
|
||||
auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
|
||||
auto place = ctx.GetPlace();
|
||||
comm_init_op->Run(*scope, place);
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
|
||||
std::cout<< "BEGIN TEST:"<< __FUNCTION__ <<std::endl;
|
||||
auto x = scope->Var("X");
|
||||
auto tensor_x = x->GetMutable<f::LoDTensor>();
|
||||
int num = atoi(getenv("DATA_SIZE"));;
|
||||
EXPECT_GT(num, 0);
|
||||
EXPECT_LT(num, 1 << 15);
|
||||
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
|
||||
int rank_id = atoi(getenv("RANK_ID"));
|
||||
VLOG(3)<<"rank id:"<<rank_id;
|
||||
TensorFromVector(init, ctx, tensor_x);
|
||||
tensor_x->Resize({num, num});
|
||||
ctx.Wait();
|
||||
auto place = ctx.GetPlace();
|
||||
ctx.Wait();
|
||||
|
||||
f::AttributeMap attrs;
|
||||
attrs["tag"]=std::string("srtest");
|
||||
attrs["peer"]=atoi(getenv("DEST_RANK"));
|
||||
attrs["ring_id"]=0;
|
||||
attrs["srTag"]=0;
|
||||
|
||||
auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
|
||||
|
||||
op->Run(*scope, place);
|
||||
VLOG(3)<<"send run over";
|
||||
ctx.Wait();
|
||||
}
|
||||
|
||||
TEST(send_v2, NPU){
|
||||
f::Scope scope;
|
||||
char * npu_id=getenv("FLAGS_selected_npus");
|
||||
VLOG(3) << "Select npu:" << npu_id;
|
||||
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
|
||||
VLOG(3) << "Place over";
|
||||
Prepare(&scope, ctx);
|
||||
VLOG(3) << "Prepare over";
|
||||
TestHcomSendOp(&scope, ctx);
|
||||
VLOG(3) << "Test over";
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue