【kunlun】dygraph supports multi xpu card training (#30671)
parent 3a3ff75c52
commit b1026f64af
@@ -0,0 +1,172 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/imperative/bkcl_context.h"

#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/platform/bkcl_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"

#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/string_helper.h"

namespace paddle {
namespace imperative {

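// Sum-all-reduce `src` across all ranks of `comm` on the given XPU `stream`,
// writing the result into `dst` (which is resized to match `src`). Only
// tensors placed on XPU are supported.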
static void AllReduce(const framework::Tensor &src, framework::Tensor *dst,
                      const XPUStream stream, const platform::BKCLComm *comm) {
  const auto &place = src.place();
  PADDLE_ENFORCE_EQ(
      platform::is_xpu_place(place), true,
      platform::errors::Unimplemented(
          "BKCL AllReduce in dynamic graph mode only supports XPU places."));

  const void *src_ptr = src.data<void>();
  dst->Resize(src.dims());
  auto *dst_ptr = dst->mutable_data(src.place(), src.type());
  auto bkcl_dtype = platform::ToBKCLDataType(src.type());

  PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(),
                                    bkcl_dtype, BKCL_ADD, stream),
                    BKCL_SUCCESS, platform::errors::PreconditionNotMet(
                                      "BKCL all reduce failed."));
}
/*
Baidu Kunlun Communication Library (BKCL) is designed for communication among
multiple Baidu Kunlun cards, playing the same role as the NVIDIA Collective
Communications Library (NCCL) does for multiple NVIDIA GPU cards.
Please refer to bkcl.h in the xpu.tar.gz package linked in
cmake/external/xpu.cmake.
*/
void BKCLParallelContext::BcastBKCLId(
    std::vector<BKCLUniqueId> &bkcl_ids,  // NOLINT
    int root) {
  if (strategy_.local_rank_ == root) {
    std::vector<std::string> other_trainers;
    for (auto &ep : strategy_.trainer_endpoints_) {
      if (ep != strategy_.current_endpoint_) {
        other_trainers.push_back(ep);
      }
    }
    platform::SendBroadCastCommID(other_trainers, &bkcl_ids);
  } else {
    platform::RecvBroadCastCommID(strategy_.current_endpoint_, &bkcl_ids);
  }
}

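// Init() sets up one BKCL communicator per ring: rank 0 generates a
// BKCLUniqueId for each ring and broadcasts them to the other trainer
// endpoints over TCP, then every rank creates its BKCLComm for each ring,
// bound to the local XPU device.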
void BKCLParallelContext::Init() {
  std::vector<BKCLUniqueId> bkcl_ids;
  bkcl_ids.resize(strategy_.nrings_);

  if (strategy_.local_rank_ == 0) {
    // generate the unique bkcl id on the root worker
    for (size_t i = 0; i < bkcl_ids.size(); ++i) {
      auto ret = bkcl_get_unique_id(&bkcl_ids[i]);
      PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret,
                        platform::errors::PreconditionNotMet(
                            "BKCL get unique id failed [%d]", ret));
    }
  }
  BcastBKCLId(bkcl_ids, 0);

  int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
  for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
    VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
            << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
            << " ring id: " << ring_id;
    // assign the bkcl_comm to the XPUDeviceContext of this ring_id
    platform::BKCLCommContext::Instance().CreateBKCLComm(
        &bkcl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, xpu_id,
        ring_id);
  }
}

void BKCLParallelContext::AllReduceByStream(const framework::Variable &src,
                                            framework::Variable *dst,
                                            int ring_id, bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      platform::is_xpu_place(place_), true,
      platform::errors::Unimplemented(
          "BKCL AllReduce in dynamic graph mode only supports XPU places."));
  auto place = place_;

  auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
      platform::DeviceContextPool::Instance().Get(place));
  platform::BKCLComm *comm =
      platform::BKCLCommContext::Instance().Get(ring_id, place);
  XPUStream stream =
      use_calc_stream ? dev_ctx->x_context()->xpu_stream : comm->stream();

  if (src.IsType<framework::LoDTensor>()) {
    if (!dst->IsType<framework::LoDTensor>()) {
      dst->Clear();
    }
    AllReduce(src.Get<framework::LoDTensor>(),
              dst->GetMutable<framework::LoDTensor>(), stream, comm);
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "XPU unsupported variable type %s for imperative allreduce, only "
        "LoDTensor is supported.",
        platform::demangle(framework::ToTypeName(src.Type()))));
  }
}

paddle::platform::DeviceContext *BKCLParallelContext::GetDeviceContext(
    int ring_id) {
  return static_cast<platform::DeviceContext *>(
      platform::BKCLCommContext::Instance()
          .Get(ring_id, place_)
          ->dev_context());
}

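// WaitCompute() blocks until the compute (XPU) device context of this rank
// has finished, so that a subsequent collective sees up-to-date tensors.
// WaitComm() blocks until the communication context of the given ring has
// finished, so that computation does not consume the result of an unfinished
// collective.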
void BKCLParallelContext::WaitCompute(int ring_id) {
  PADDLE_ENFORCE_GE(ring_id, 0,
                    platform::errors::OutOfRange(
                        "Ring id expected >= 0, but got %d", ring_id));
  PADDLE_ENFORCE_LT(
      ring_id, strategy_.nrings_,
      platform::errors::OutOfRange("Ring id expected < nrings, "
                                   "but got ring id = %d, nrings = %d",
                                   ring_id, strategy_.nrings_));
  // TODO(wangxi16): [Performance optimize] Maybe we need to move Wait and
  // bkcl_allreduce to a communication thread, since bkcl_allreduce is
  // blocking for now.
  auto compute_dev_ctx = static_cast<platform::XPUDeviceContext *>(
      platform::DeviceContextPool::Instance().Get(place_));
  compute_dev_ctx->Wait();
}

void BKCLParallelContext::WaitComm(int ring_id) {
  PADDLE_ENFORCE_GE(ring_id, 0,
                    platform::errors::OutOfRange(
                        "Ring id expected >= 0, but got %d", ring_id));
  PADDLE_ENFORCE_LT(
      ring_id, strategy_.nrings_,
      platform::errors::OutOfRange("Ring id expected < nrings, "
                                   "but got ring id = %d, nrings = %d",
                                   ring_id, strategy_.nrings_));
  auto comm_dev_ctx =
      platform::BKCLCommContext::Instance().Get(ring_id, place_)->dev_context();
  comm_dev_ctx->Wait();
}

}  // namespace imperative
}  // namespace paddle
#endif
@@ -0,0 +1,53 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#if defined(PADDLE_WITH_XPU_BKCL)
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/imperative/parallel_context.h"
#include "xpu/bkcl.h"

namespace paddle {
namespace imperative {

class BKCLParallelContext : public ParallelContext {
 public:
  explicit BKCLParallelContext(const ParallelStrategy& strategy,
                               const platform::Place& place)
      : ParallelContext(strategy, place) {}

  ~BKCLParallelContext() override = default;

  void BcastBKCLId(std::vector<BKCLUniqueId>& bkcl_ids, int root);  // NOLINT

  void Init() override;

  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override;

  paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;

  void WaitCompute(int ring_id) override;

  void WaitComm(int ring_id) override;
};
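// A minimal usage sketch (illustrative only; the strategy fields mirror those
// set in the accompanying unit test, and `grad_var` stands in for a gradient
// Variable produced by the trainer):
//
//   imperative::ParallelStrategy strategy;
//   strategy.nranks_ = 2;
//   strategy.local_rank_ = rank;
//   strategy.trainer_endpoints_ = {"127.0.0.1:9866", "127.0.0.1:9867"};
//   strategy.current_endpoint_ = strategy.trainer_endpoints_[rank];
//   strategy.nrings_ = 1;
//
//   imperative::BKCLParallelContext ctx(strategy, platform::XPUPlace(rank));
//   ctx.Init();  // rendezvous: exchange BKCLUniqueIds and create comms
//   ctx.AllReduceByStream(grad_var, &grad_var, /*ring_id=*/0,
//                         /*use_calc_stream=*/false);
//   ctx.WaitComm(0);  // make sure the all-reduce finished before reading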
}  // namespace imperative
}  // namespace paddle

#endif
@@ -0,0 +1,66 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/imperative/bkcl_context.h"

#include "gtest/gtest.h"

namespace imperative = paddle::imperative;
namespace platform = paddle::platform;

int nrings = 2;
imperative::ParallelStrategy GetStrategy(int local_rank) {
  std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
  imperative::ParallelStrategy strategy;
  strategy.trainer_endpoints_ = eps;
  strategy.current_endpoint_ = eps[local_rank];
  strategy.nranks_ = 2;
  strategy.local_rank_ = local_rank;
  strategy.nrings_ = nrings;
  return strategy;
}

#if defined(PADDLE_WITH_XPU_BKCL)
void BcastBKCLId(int local_rank, std::vector<BKCLUniqueId>* bkcl_ids) {
  auto strategy = GetStrategy(local_rank);
  platform::XPUPlace xpu(local_rank);
  imperative::BKCLParallelContext ctx(strategy, xpu);
  ctx.BcastBKCLId(*bkcl_ids, 0);
}

TEST(BcastBKCLId, Run) {
  std::vector<BKCLUniqueId> bkcl_ids;
  bkcl_ids.resize(nrings);
  for (int i = 0; i < nrings; ++i) {
    bkcl_get_unique_id(&bkcl_ids[i]);
  }

  // Rank 0 sends the generated ids from a background thread ...
  std::thread t(BcastBKCLId, 0, &bkcl_ids);

  std::vector<BKCLUniqueId> recv_bkcl_ids;
  recv_bkcl_ids.resize(nrings);
  for (int i = 0; i < nrings; ++i) {
    bkcl_get_unique_id(&recv_bkcl_ids[i]);
  }
  // ... while rank 1 receives them on the main thread.
  BcastBKCLId(1, &recv_bkcl_ids);

  t.join();
  for (int i = 0; i < nrings; ++i) {
    EXPECT_EQ(
        0, std::memcmp(&bkcl_ids[i], &recv_bkcl_ids[i], BKCL_UNIQUE_ID_BYTES));
  }
}
#endif
@@ -0,0 +1,96 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"

#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#endif

namespace ops = paddle::operators;
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

template <typename T>
class BKCLBroadcastOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), true,
                      platform::errors::PreconditionNotMet(
                          "The place of ExecutionContext should be XPUPlace."));

#if defined(PADDLE_WITH_XPU_BKCL)
    int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device;
    int root_dev_id = ctx.Attr<int>("root");

    auto in = ctx.Input<framework::Tensor>("X");
    auto out = ctx.Output<framework::Tensor>("Out");
    PADDLE_ENFORCE_EQ(
        out->IsInitialized(), true,
        platform::errors::PreconditionNotMet(
            "Currently, the output of the broadcast op must be initialized, "
            "because this op can only be an in-place operation."));
    void* send_recv_buffer = out->mutable_data<T>(ctx.GetPlace());
    PADDLE_ENFORCE_EQ(
        send_recv_buffer, in->data<void>(),
        platform::errors::PreconditionNotMet("Currently, the broadcast op can "
                                             "only be an in-place operation."));

    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
    auto comm = dev_ctx.bkcl_context();
    auto stream = dev_ctx.x_context()->xpu_stream;

    // TODO(wangxi16): bkcl_broadcast only supports the float type, so other
    // types have to be reinterpreted as float elements before broadcasting.
    // Broadcast copies raw bytes and is type-agnostic, so this does not
    // affect correctness.
    // Once bkcl_broadcast supports other types, change this to:
    //   BKCLDataType data_type = platform::ToBKCLDataType(in->type());
    BKCLDataType data_type = BKCL_FLOAT;
    size_t scale = sizeof(T) / sizeof(float);
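    // For example, with T = double, scale = 2, so in->numel() * 2 float
    // elements cover exactly the same bytes as in->numel() doubles; since
    // broadcast just copies bytes, the result is identical. This relies on
    // sizeof(T) being a multiple of sizeof(float), which holds for every type
    // registered below (float, double, int, int64_t).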
    auto ret = bkcl_broadcast(comm, send_recv_buffer, send_recv_buffer,
                              static_cast<size_t>(in->numel()) * scale,
                              data_type, root_dev_id, stream);
    PADDLE_ENFORCE_EQ(ret, BKCL_SUCCESS,
                      platform::errors::Unavailable("bkcl_broadcast failed"));

    VLOG(3) << "Bcast " << ctx.InputNames("X")[0] << ", (" << in->numel() << ")"
            << " from " << root_dev_id << " to " << dev_id;

    if (ctx.Attr<bool>("sync_mode")) {
      dev_ctx.Wait();
    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should be compiled with XPU support."));
#endif
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_XPU_KERNEL(broadcast, ops::BKCLBroadcastOpKernel<float>,
                       ops::BKCLBroadcastOpKernel<double>,
                       ops::BKCLBroadcastOpKernel<int>,
                       ops::BKCLBroadcastOpKernel<int64_t>);
Some files were not shown because too many files have changed in this diff.