[Kunlun] PR2: Support MultiDevicePass and BKCL in parallel executor (#29574)

liuyuhui 4 years ago committed by GitHub
parent 0b74428db8
commit 4427df37cf

@ -29,7 +29,7 @@ include(generic) # simplify cmake module
find_package(CUDA QUIET)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
@ -166,6 +166,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}
option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
@ -213,6 +214,13 @@ if (NOT WITH_GPU AND WITH_NCCL)
"Disable NCCL when compiling without GPU" FORCE)
endif()
if (NOT WITH_XPU AND WITH_XPU_BKCL)
MESSAGE(WARNING
"Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
set(WITH_XPU_BKCL OFF CACHE STRING
"Disable BKCL when compiling without XPU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)

@ -47,4 +47,18 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
if (WITH_XPU_BKCL)
MESSAGE(STATUS "Compile with XPU BKCL!")
ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
SET(XPU_BKCL_LIB_NAME "libbkcl.so")
SET(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}")
SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
else(WITH_XPU_BKCL)
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
endif(WITH_XPU_BKCL)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})

@ -43,6 +43,19 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#elif defined(PADDLE_WITH_XPU_BKCL)
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::BKCLCommunicator *ctxs)
: BKCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
@ -98,6 +111,9 @@ void AllReduceOpHandle::AllReduceImpl(
places.reserve(num_places);
int64_t numel = -1;
bool is_gpu_place = false;
#if defined(PADDLE_WITH_XPU_BKCL)
bool is_xpu_place = false;
#endif
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
@ -117,6 +133,9 @@ void AllReduceOpHandle::AllReduceImpl(
in_var_handles[i]->name(), numel));
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
#if defined(PADDLE_WITH_XPU_BKCL)
is_xpu_place = platform::is_xpu_place(lod_tensor.place());
#endif
}
PADDLE_ENFORCE_EQ(
numel, static_cast<int64_t>(lod_tensor.numel()),
@ -128,6 +147,12 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::PreconditionNotMet(
"The dtype of tensors of the same variable in different local "
"scopes should be equal."));
#if defined(PADDLE_WITH_XPU_BKCL)
PADDLE_ENFORCE_EQ(is_xpu_place, platform::is_xpu_place(lod_tensor.place()),
platform::errors::PreconditionNotMet(
"The place type of tensors of the same variable "
"in different local scopes should be equal."));
#endif
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
platform::errors::PreconditionNotMet(
"The place type of tensors of the same variable "
@ -179,6 +204,25 @@ void AllReduceOpHandle::AllReduceFunc(
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else if (is_xpu_place(places[0])) {
#if defined(PADDLE_WITH_XPU_BKCL)
PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
platform::errors::InvalidArgument(
"The bkcl context should not be NULL."));
BKCLDataType bkcl_dtype = platform::ToBKCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &p = places[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
BKCLAllReduce(p, buffer, buffer, numel, bkcl_dtype, BKCL_ADD);
});
}
BKCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *local_exec_scopes_[0]
@ -205,6 +249,27 @@ void AllReduceOpHandle::AllReduceFunc(
VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
}
#if defined(PADDLE_WITH_XPU_BKCL)
void AllReduceOpHandle::BKCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls) {
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
all_reduce_calls[0]();
} else {
PADDLE_ENFORCE_EQ(
bkcl_group_start(), BKCL_SUCCESS,
platform::errors::PreconditionNotMet("bkcl_group_start failed"));
for (auto &call : all_reduce_calls) {
call();
}
PADDLE_ENFORCE_EQ(
bkcl_group_end(), BKCL_SUCCESS,
platform::errors::PreconditionNotMet("bkcl_group_end failed"));
}
});
}
#endif
#if defined(PADDLE_WITH_NCCL)
void AllReduceOpHandle::NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls) {
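The new BKCLAllReduceFunc mirrors the NCCL path: a single call runs directly, while multiple per-device calls are fenced between bkcl_group_start and bkcl_group_end so the library can schedule them as one group. Below is a minimal, dependency-free sketch of that batching pattern; group_start, group_end and launch_all_reduce are hypothetical stand-ins for the real BKCL calls, which need the Kunlun runtime.

#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for bkcl_group_start / bkcl_group_end / bkcl_all_reduce.
static void group_start() { std::cout << "group_start\n"; }
static void group_end() { std::cout << "group_end\n"; }
static void launch_all_reduce(int device_id) {
  std::cout << "all_reduce on device " << device_id << "\n";
}

// Run a single call directly, or fence a batch of per-device calls so the
// collective library can schedule them as one group.
static void RunGrouped(const std::vector<std::function<void()>>& calls) {
  if (calls.size() == 1UL) {
    calls[0]();
    return;
  }
  group_start();
  for (auto& call : calls) call();
  group_end();
}

int main() {
  std::vector<std::function<void()>> calls;
  for (int dev = 0; dev < 4; ++dev) {
    calls.emplace_back([dev] { launch_all_reduce(dev); });
  }
  RunGrouped(calls);
  return 0;
}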

@ -34,6 +34,9 @@ class NCCLCommunicator;
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/framework/details/bkcl_op_handle.h"
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
namespace paddle {
@ -46,6 +49,12 @@ class AllReduceOpHandle : public NCCLOpHandleBase {
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLCommunicator *ctxs);
#elif defined(PADDLE_WITH_XPU_BKCL)
class AllReduceOpHandle : public BKCLOpHandleBase {
public:
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::BKCLCommunicator *ctxs);
#else
class AllReduceOpHandle : public OpHandleBase {
public:
@ -65,8 +74,8 @@ class AllReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
#ifndef PADDLE_WITH_NCCL
// NCCLOpHandleBase already have these attributes.
#if !(PADDLE_WITH_NCCL || PADDLE_WITH_XPU_BKCL)
// NCCLOpHandleBase and BKCLOpHandleBase already have these attributes.
// Will polish it by class inheritance framework.
std::vector<platform::Place> places_;
#endif
@ -78,6 +87,11 @@ class AllReduceOpHandle : public OpHandleBase {
void SyncNCCLAllReduce();
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
void BKCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls);
#endif
void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles);
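Depending on the build flags, AllReduceOpHandle now derives from NCCLOpHandleBase, BKCLOpHandleBase, or plain OpHandleBase. A compressed sketch of this preprocessor-selected base class pattern is shown below; the types here are placeholders rather than the real Paddle classes, and the macros stand for whatever the build defines.

#include <iostream>

// Placeholder base classes standing in for the Paddle op-handle hierarchy.
struct OpHandleBase { const char* backend() const { return "cpu"; } };
struct NCCLOpHandleBase : OpHandleBase { const char* backend() const { return "nccl"; } };
struct BKCLOpHandleBase : OpHandleBase { const char* backend() const { return "bkcl"; } };

// The base class is chosen at compile time; exactly one branch is active.
#if defined(PADDLE_WITH_NCCL)
using AllReduceBase = NCCLOpHandleBase;
#elif defined(PADDLE_WITH_XPU_BKCL)
using AllReduceBase = BKCLOpHandleBase;
#else
using AllReduceBase = OpHandleBase;
#endif

struct AllReduceHandle : AllReduceBase {};

int main() {
  AllReduceHandle handle;
  std::cout << "all-reduce backend: " << handle.backend() << "\n";
  return 0;
}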

@ -0,0 +1,131 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "xpu/bkcl.h"
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/bkcl_helper.h"
DECLARE_bool(sync_bkcl_allreduce);
namespace paddle {
namespace framework {
namespace details {
class BKCLOpHandleBase : public OpHandleBase {
public:
BKCLOpHandleBase(ir::Node* node, const std::vector<platform::Place>& places,
const platform::BKCLCommunicator* bkcl_ctxs)
: OpHandleBase(node), places_(places), bkcl_ctxs_(bkcl_ctxs) {
if (bkcl_ctxs == nullptr) {
return;
}
// init device context
auto default_bkcl_ctxs = bkcl_ctxs_->DefaultFlatCtx();
for (auto& p : places_) {
this->SetDeviceContext(p, default_bkcl_ctxs->DevCtx(p));
}
}
virtual ~BKCLOpHandleBase() {}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
PADDLE_ENFORCE_NE(use_hierarchical_allreduce, true,
platform::errors::Unimplemented(
"xpu doesn't support hierarchical_allreduce"));
run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce;
VLOG(10) << "SetRunEnv "
<< " run_order:" << run_order
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce;
if (bkcl_ctxs_ == nullptr) {
return;
}
if (!use_hierarchical_allreduce_) {
auto ctxs = bkcl_ctxs_->GetFlatCtx(run_order);
for (auto& p : places_) {
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
return;
}
}
void FlatBKCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, BKCLDataType datatype,
BKCLOp op) {
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
auto comm = bkcl_ctx.comm_;
VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place;
PADDLE_ENFORCE_EQ(
bkcl_all_reduce(comm, sendbuff, recvbuff, count, datatype, op, NULL),
BKCL_SUCCESS,
platform::errors::PreconditionNotMet("bkcl_all_reduce failed"));
}
void BKCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, BKCLDataType datatype,
BKCLOp op) {
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
platform::errors::Unimplemented(
"xpu doesn't support hierarchical all reduce"));
if (!use_hierarchical_allreduce_) {
FlatBKCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return;
}
}
protected:
std::vector<platform::Place> places_;
const platform::BKCLCommunicator* bkcl_ctxs_{nullptr};
// When multiple trainers call collective functions, they must run them in the
// same order, or the program will hang. So we use allreduce_deps_pass to set
// this run_order_.
int run_order_{0};
// Use 2d allreduce or not.
bool use_hierarchical_allreduce_{false};
};
} // namespace details
} // namespace framework
} // namespace paddle
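BKCLOpHandleBase encodes two contracts: run_order must be non-negative (all trainers have to issue collectives in the same order, which allreduce_deps_pass arranges), and hierarchical all-reduce is rejected because only the flat path exists for Kunlun. A condensed, stand-alone sketch of that contract follows; the class and member names are illustrative, not the real Paddle types.

#include <iostream>
#include <stdexcept>

// Illustrative stand-in for the contract enforced by BKCLOpHandleBase.
class XpuCollectiveHandle {
 public:
  void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
    if (run_order < 0)
      throw std::invalid_argument("run_order must be >= 0");
    if (use_hierarchical_allreduce)
      throw std::runtime_error("XPU does not support hierarchical allreduce");
    run_order_ = run_order;
  }
  void AllReduce(int device_id) const {
    // Only the flat (single-level) all-reduce path exists for BKCL.
    std::cout << "flat all-reduce on device " << device_id
              << ", run_order " << run_order_ << "\n";
  }

 private:
  // All trainers must issue collectives in the same order or the program
  // hangs; allreduce_deps_pass assigns run_order_ to guarantee that.
  int run_order_{0};
};

int main() {
  XpuCollectiveHandle handle;
  handle.SetRunEnv(/*run_order=*/0, /*use_hierarchical_allreduce=*/false);
  handle.AllReduce(/*device_id=*/0);
  return 0;
}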

@ -80,7 +80,7 @@ void BroadcastOpHandle::BroadcastOneVar(
&VariableVisitor::GetMutableTensor(out_var));
});
}
} else {
} else if (platform::is_gpu_place(in_tensor.place())) {
#if defined(PADDLE_WITH_NCCL)
VarHandle *out_handle = nullptr;
int root_id =
@ -141,6 +141,72 @@ void BroadcastOpHandle::BroadcastOneVar(
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif
} else {
#if defined(PADDLE_WITH_XPU_BKCL)
VarHandle *out_handle = nullptr;
int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToBKCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
for (auto out_var_handle : out_var_handles) {
Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
int dst_id =
BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
void *send_recv_buffer = nullptr;
if (root_id == dst_id) {
send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
out_handle = out_var_handle;
} else {
send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
.Resize(in_tensor.dims())
.mutable_data(out_var_handle->place());
}
broadcast_calls.emplace_back([send_recv_buffer, numel, type, root_id,
&bkcl_ctx] {
PADDLE_ENFORCE_EQ(
bkcl_broadcast(bkcl_ctx.comm(), send_recv_buffer, send_recv_buffer,
numel, static_cast<BKCLDataType>(type), root_id,
nullptr),
BKCL_SUCCESS,
platform::errors::Unavailable("bkcl_broadcast failed"));
});
}
WaitInputVarGenerated();
this->RunAndRecordEvent([&] {
{
PADDLE_ENFORCE_EQ(
bkcl_group_start(), BKCL_SUCCESS,
platform::errors::Unavailable("bkcl_group_start failed"));
for (auto &call : broadcast_calls) {
call();
}
PADDLE_ENFORCE_EQ(
bkcl_group_end(), BKCL_SUCCESS,
platform::errors::Unavailable("bkcl_group_end failed"));
}
if (!out_handle->IsTheSameVar(in_var_handle)) {
auto out_var = var_scopes.at(in_var_handle.scope_idx())
->FindVar(out_var_handles[0]->name());
paddle::framework::TensorCopy(
in_tensor, in_var_handle.place(),
*(dev_ctxes_.at(in_var_handle.place())),
&VariableVisitor::GetMutableTensor(out_var));
}
});
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
}
}
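The BKCL broadcast branch has the same shape as the NCCL one: the root device reuses the input tensor's buffer as both send and receive buffer, every other device gets an output buffer resized to the input's dims, and the per-device bkcl_broadcast calls are fenced by bkcl_group_start/bkcl_group_end. A plain C++ sketch of the buffer-selection part follows, with device buffers modeled as std::vector<float> and the grouped broadcast replaced by a copy.

#include <iostream>
#include <vector>

int main() {
  const int root_id = 1;        // device that owns the source tensor
  const int num_devices = 3;
  std::vector<float> in_tensor = {1.f, 2.f, 3.f, 4.f};  // data living on root_id

  // One destination buffer per device. On the root the send/recv buffer is
  // the input itself; elsewhere it is an output resized to the input's dims
  // before the grouped broadcast is issued.
  std::vector<std::vector<float>> outputs(num_devices);
  for (int dst_id = 0; dst_id < num_devices; ++dst_id) {
    if (dst_id == root_id) continue;  // root broadcasts in place, no allocation
    outputs[dst_id].resize(in_tensor.size());
  }

  // Stand-in for the fenced bkcl_broadcast calls: copy from root to the rest.
  for (int dst_id = 0; dst_id < num_devices; ++dst_id) {
    if (dst_id == root_id) continue;
    outputs[dst_id] = in_tensor;
  }

  std::cout << "device 0 received " << outputs[0].size() << " elements\n";
  return 0;
}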

@ -34,12 +34,19 @@ class Node;
} // namespace ir
} // namespace framework
namespace platform {
#if defined(PADDLE_WITH_NCCL)
struct NCCLContextMap;
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
struct BKCLContextMap;
#endif
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
namespace paddle {
@ -63,11 +70,26 @@ struct BroadcastOpHandle : public OpHandleBase {
}
}
}
#else
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::BKCLContextMap *bkcl_ctxs)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
bkcl_ctxs_(bkcl_ctxs) {
if (bkcl_ctxs_) {
for (auto &p_ctx : bkcl_ctxs_->contexts_) {
this->SetDeviceContext(platform::XPUPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}
#endif
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string Name() const override;
@ -86,6 +108,8 @@ struct BroadcastOpHandle : public OpHandleBase {
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_NCCL)
const platform::NCCLContextMap *nccl_ctxs_;
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLContextMap *bkcl_ctxs_;
#endif
void InitOutputValue(const VarHandle &in_var_handle,

@ -18,10 +18,12 @@ namespace paddle {
namespace framework {
namespace details {
using DeviceType = paddle::platform::DeviceType;
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitCtxOnDevice(p::kCPU);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
@ -29,7 +31,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitCtxOnDevice(p::kCPU);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
@ -38,7 +40,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitCtxOnDevice(p::kCUDA);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
@ -46,12 +48,22 @@ TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitCtxOnDevice(p::kCUDA);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
TEST(BroadcastTester, TestXPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnDevice(p::kXPU);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle

@ -33,7 +33,7 @@ struct VarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
using DeviceType = paddle::platform::DeviceType;
// test data amount
const f::DDim kDims = {20, 20};
@ -47,11 +47,15 @@ struct TestBroadcastOpHandle {
std::vector<VarHandleBase*> vars_;
std::vector<std::unique_ptr<ir::Node>> nodes_;
std::vector<p::Place> place_list_;
bool use_gpu_;
DeviceType use_device_;
#if defined(PADDLE_WITH_NCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
std::unique_ptr<platform::BKCLContextMap> bkcl_ctxs_;
#endif
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
@ -60,12 +64,36 @@ struct TestBroadcastOpHandle {
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
if (bkcl_ctxs_) {
bkcl_ctxs_->WaitAll();
}
#endif
}
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
void InitCtxOnDevice(DeviceType use_device) {
use_device_ = use_device;
if (use_device_ == p::kXPU) {
#if defined(PADDLE_WITH_XPU_BKCL)
int count = p::GetXPUDeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-xpu Broadcast, because the XPU "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::XPUPlace(i);
place_list_.push_back(p);
ctxs_.emplace_back(new p::XPUDeviceContext(p));
}
bkcl_ctxs_.reset(new platform::BKCLContextMap(place_list_));
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
} else if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
@ -91,6 +119,9 @@ struct TestBroadcastOpHandle {
place_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#if defined(PADDLE_WITH_XPU_BKCL)
bkcl_ctxs_.reset(nullptr);
#endif
#if defined(PADDLE_WITH_NCCL)
nccl_ctxs_.reset(nullptr);
#endif
@ -111,22 +142,25 @@ struct TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
if (use_gpu_) {
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
} else if (use_device_ == p::kXPU) {
#if defined(PADDLE_WITH_XPU_BKCL)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
place_list_, bkcl_ctxs_.get());
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
} else {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_);
#endif
}
op_handle_->SetLocalExecScopes(scope_map);
@ -149,7 +183,7 @@ struct TestBroadcastOpHandle {
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < place_list_.size(); ++j) {
if (!use_gpu_) {
if (use_device_ != p::kCUDA) {
op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
}
nodes_.emplace_back(
@ -275,7 +309,7 @@ struct TestBroadcastOpHandle {
f::LoD lod{{0, 10, 20}};
auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();
@ -290,7 +324,7 @@ struct TestBroadcastOpHandle {
int height = static_cast<int>(kDims[0] * 2);
auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();
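The test helper's InitCtxOnGpu(bool) becomes InitCtxOnDevice(DeviceType), which builds CPU, CUDA or XPU device contexts and exits early when fewer than two devices are available for a multi-device broadcast. A small sketch of that selection logic is below; DeviceType and GetDeviceCount here are illustrative stand-ins, not the paddle::platform APIs.

#include <cstdlib>
#include <iostream>
#include <vector>

enum class DeviceType { kCPU, kCUDA, kXPU };

// Hypothetical device-count query standing in for GetCUDADeviceCount /
// GetXPUDeviceCount; assume no accelerators are present on this machine.
static int GetDeviceCount(DeviceType) { return 0; }

static std::vector<int> InitCtxOnDevice(DeviceType use_device) {
  std::vector<int> device_ids;
  if (use_device == DeviceType::kCPU) {
    device_ids.push_back(0);  // a CPU place is always available
    return device_ids;
  }
  const int count = GetDeviceCount(use_device);
  if (count <= 1) {
    // Multi-device broadcast cannot be tested; the real test exits cleanly.
    std::cout << "need at least 2 devices, found " << count << "\n";
    std::exit(0);
  }
  for (int i = 0; i < count; ++i) device_ids.push_back(i);
  return device_ids;
}

int main() {
  auto ids = InitCtxOnDevice(DeviceType::kCPU);
  std::cout << "initialized " << ids.size() << " place(s)\n";
  return 0;
}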

@ -313,10 +313,13 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_NCCL)
const bool use_cuda,
DeviceType use_device,
platform::NCCLCommunicator *nccl_ctxs) const {
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
DeviceType use_device,
platform::BKCLCommunicator *bkcl_ctxs) const {
#else
const bool use_cuda) const {
DeviceType use_device) const {
#endif
VLOG(1) << "apply all passes";
// Create a default one if not finalized by user.
@ -336,9 +339,16 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx =
(use_device == p::kCUDA) ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
// TODO: add more checks
platform::BKCLCommunicator *bkcl_ctx =
(use_device == p::kXPU) ? bkcl_ctxs : nullptr;
pass->Erase(kBKCLCtxs);
pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
#endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kNRanks);
@ -349,12 +359,24 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
#if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx =
(use_device == p::kCUDA) ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
platform::BKCLCommunicator *nctx =
(use_device == p::kXPU) ? bkcl_ctxs : nullptr;
pass->Erase(kBKCLCtxs);
pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
platform::errors::Unimplemented(
"xpu doesn't support hierarchical_allreduce"));
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
#endif
} else if (pass->Type() == "coalesce_grad_tensor_pass") {
pass->Erase(kNRanks);
@ -364,35 +386,47 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
<< enable_sequential_execution_;
} else if (pass->Type() == "all_reduce_deps_pass") {
#if defined(PADDLE_WITH_NCCL)
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx =
(use_device == p::kCUDA) ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
platform::BKCLCommunicator *nctx =
(use_device == p::kXPU) ? bkcl_ctxs : nullptr;
pass->Erase(kBKCLCtxs);
pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
platform::errors::Unimplemented(
"xpu doesn't support hierarchical_allreduce"));
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
#endif
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
if (use_device != p::kCUDA) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
"GPU, skipped.";
continue;
}
} else if (pass->Type() == "fusion_group_pass") {
pass->Set<bool>("use_gpu", new bool(use_cuda));
if (!use_cuda) {
pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
if (use_device != p::kCUDA) {
LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped.";
continue;
}
} else if (pass->Type() == "fuse_bn_act_pass") {
if (!use_cuda) {
if (use_device != p::kCUDA) {
LOG(WARNING) << "fuse_bn_act_pass is only supported on "
"GPU, skipped.";
continue;
}
} else if (pass->Type() == "fuse_bn_add_act_pass") {
if (!use_cuda) {
if (use_device != p::kCUDA) {
LOG(WARNING) << "fuse_bn_add_act_pass is only supported on "
"GPU, skipped.";
continue;
@ -401,7 +435,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
} else if (pass->Type() == "backward_optimizer_op_deps_pass") {
if (!use_cuda) {
if (use_device != p::kCUDA) {
VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
"GPU, skipped.";
continue;
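The core of the BuildStrategy change is replacing the bool use_cuda argument with a device type, so communicator contexts are only attached for the matching backend and GPU-only passes are skipped with a warning on CPU and XPU. A self-contained sketch of that pass-skipping dispatch is below, using an illustrative DeviceType enum and pass names taken from the hunk above rather than the real pass registry.

#include <iostream>
#include <string>
#include <vector>

// Illustrative device enum mirroring the switch away from `bool use_cuda`.
enum class DeviceType { kCPU, kCUDA, kXPU };

int main() {
  const DeviceType use_device = DeviceType::kXPU;
  const std::vector<std::string> gpu_only_passes = {
      "fuse_relu_depthwise_conv_pass", "fusion_group_pass",
      "fuse_bn_act_pass", "fuse_bn_add_act_pass",
      "backward_optimizer_op_deps_pass"};

  for (const auto& pass : gpu_only_passes) {
    if (use_device != DeviceType::kCUDA) {
      // Same behaviour as the diff: warn and skip instead of failing.
      std::cout << pass << " is only supported on GPU, skipped.\n";
      continue;
    }
    std::cout << "applying " << pass << "\n";
  }
  return 0;
}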

@ -41,11 +41,15 @@ class NCCLCommunicator;
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
namespace paddle {
namespace framework {
namespace details {
using DeviceType = paddle::platform::DeviceType;
namespace p = paddle::platform;
struct BuildStrategy {
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
@ -147,6 +151,7 @@ struct BuildStrategy {
// NCCL config
size_t nccl_comm_num_{1};
size_t bkcl_comm_num_{1};
// The picture is here:
// https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
bool use_hierarchical_allreduce_{false};
@ -181,10 +186,13 @@ struct BuildStrategy {
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_NCCL)
const bool use_cuda,
DeviceType use_device,
platform::NCCLCommunicator *nccl_ctxs) const;
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
DeviceType use_device,
platform::BKCLCommunicator *bkcl_ctxs) const;
#else
const bool use_cuda) const;
DeviceType use_device) const;
#endif
// If set true, ParallelExecutor would build the main_program into multiple

@ -14,22 +14,19 @@
#pragma once
#include <cstddef> // for size_t
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
using DeviceType = paddle::platform::DeviceType;
namespace p = paddle::platform;
struct ExecutionStrategy {
enum ExecutorType { kDefault = 0, kExperimental = 1 };
enum UseDevice {
kCPU = 0,
kCUDA = 1,
kXPU = 2,
};
// num_threads indicates the size of thread pool.
size_t num_threads_{0};
UseDevice use_device_{kCUDA};
DeviceType use_device_ = p::kCUDA;
// Note that allow_op_delay is invalid now.
bool allow_op_delay_{false};
// num_iteration_per_drop_scope indicates how many

@ -37,6 +37,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
const platform::NCCLCommunicator *ctxs)
: AllReduceOpHandle(node, local_scopes, places, ctxs),
num_of_all_reduce_(num_of_all_reduce) {}
#elif defined(PADDLE_WITH_XPU_BKCL)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::BKCLCommunicator *ctxs)
: AllReduceOpHandle(node, local_scopes, places, ctxs),
num_of_all_reduce_(num_of_all_reduce) {}
#else
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
@ -73,9 +80,14 @@ void FusedAllReduceOpHandle::RunImpl() {
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient ops don't have a CUDAKernel, so their gradients are in
// CPUPlace; in this case, the all reduce should not be fused.
#if defined(PADDLE_WITH_XPU_BKCL)
// TODO(liuyuhui): XPU doesn't support fused all reduce for now
if (InputIsInDifferentPlace(in_var_handles) || true) {
#else
if (InputIsInDifferentPlace(in_var_handles)) {
#endif
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;

@ -36,6 +36,8 @@ class NCCLCommunicator;
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
namespace paddle {
@ -49,6 +51,13 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle {
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::NCCLCommunicator *ctxs);
#elif defined(PADDLE_WITH_XPU_BKCL)
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::BKCLCommunicator *ctxs);
#else
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,

@ -52,11 +52,18 @@ struct FusedBroadcastOpHandle : public BroadcastOpHandle {
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctx)
: BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
#else
FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
const std::vector<platform::Place>& places)
: BroadcastOpHandle(node, local_scopes, places) {}
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places,
const platform::BKCLContextMap *bkcl_ctx)
: BroadcastOpHandle(node, local_scopes, places, bkcl_ctx) {}
#endif
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places)
: BroadcastOpHandle(node, local_scopes, places) {}
std::string Name() const override;
protected:

@ -32,7 +32,7 @@ namespace framework {
namespace details {
struct VarHandle;
using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
using DeviceType = paddle::platform::DeviceType;
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_;
@ -56,7 +56,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
// create op handle node
nodes_.emplace_back(
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
if (use_gpu_) {
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
@ -64,14 +64,17 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
} else if (use_device_ == p::kXPU) {
#if defined(PADDLE_WITH_XPU_BKCL)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
nodes_.back().get(), local_scopes_, place_list_, bkcl_ctxs_.get());
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with XPU."));
#endif
} else {
op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
local_scopes_, place_list_);
#endif
}
op_handle_->SetLocalExecScopes(scope_map);
@ -109,7 +112,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
}
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();
@ -133,7 +136,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
rows, height, val_scalar));
}
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();
@ -150,7 +153,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
TEST(FusedBroadcastTester, CPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitCtxOnDevice(p::kCPU);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
@ -158,7 +161,7 @@ TEST(FusedBroadcastTester, CPULodTensor) {
TEST(FusedBroadcastTester, CPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitCtxOnDevice(p::kCPU);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
@ -167,7 +170,7 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
TEST(FusedBroadcastTester, GPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitCtxOnDevice(p::kCUDA);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
@ -175,12 +178,22 @@ TEST(FusedBroadcastTester, GPULodTensor) {
TEST(FusedBroadcastTester, GPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitCtxOnDevice(p::kCUDA);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
TEST(FusedBroadcastTester, XPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnDevice(p::kXPU);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle

@ -27,7 +27,7 @@ struct DummyVarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
using DeviceType = paddle::platform::DeviceType;
// test data amount
const f::DDim kDims = {20, 20};
@ -173,7 +173,7 @@ struct TestGatherOpHandle {
out_selected_rows->mutable_value()->ShareDataWith(
in_selected_rows->value());
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();

@ -55,6 +55,7 @@ constexpr char kPlaces[] = "places";
constexpr char kGlobalScope[] = "global_scope";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kBKCLCtxs[] = "bkcl_ctxs";
constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce";
// aux variables to represent dependency. Useful to resolve data hazard.

@ -82,21 +82,74 @@ void OpHandleBase::InitCUDA() {
}
}
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
}
void OpHandleBase::InitXPU() {
#ifdef PADDLE_WITH_XPU
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
// TODO(liuyuhui): XPU doesn't support sync events yet; add this later.
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"%s should have only one dev_ctx.", Name()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
PADDLE_ENFORCE_EQ(
xpu_set_device(dev_id), XPU_SUCCESS,
platform::errors::PreconditionNotMet("xpu_set_device failed"));
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE_EQ(
platform::is_same_place(place, out_var_handle->place()), true,
platform::errors::InvalidArgument(
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name()));
}
}
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."));
#endif
}
void OpHandleBase::Run(ExecutionStrategy::UseDevice use_device) {
void OpHandleBase::Run(DeviceType use_device) {
#ifdef PADDLE_WITH_CUDA
if (events_.empty() && use_device == ExecutionStrategy::UseDevice::kCUDA &&
dev_ctxes_.size() > 0) {
if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
InitCUDA();
}
#else
PADDLE_ENFORCE_NE(use_device, ExecutionStrategy::UseDevice::kCUDA,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
PADDLE_ENFORCE_NE(
use_device, p::kCUDA,
platform::errors::InvalidArgument(
"Argument use_device should not be kCUDA when Paddle is not "
"compiled with CUDA."));
#endif
if (use_device == p::kXPU && dev_ctxes_.size() > 0) {
#ifdef PADDLE_WITH_XPU
InitXPU();
#else
PADDLE_ENFORCE_NE(
use_device, p::kXPU,
platform::errors::InvalidArgument(
"Argument use_device should not be kXPU when Paddle is not "
"compiled with XPU."));
#endif
}
// skip running current op, used with inplace_addto_op_pass
if (skip_running_) {
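OpHandleBase::Run now dispatches on the device type: CUDA event setup or XPU device setup only happens when that backend was compiled in, otherwise asking for the device is an error. A compressed, compilable sketch of that dispatch is below; PADDLE_WITH_CUDA / PADDLE_WITH_XPU stand for whatever the build defines, and the Init calls are placeholders for InitCUDA / InitXPU.

#include <iostream>
#include <stdexcept>

enum class DeviceType { kCPU, kCUDA, kXPU };

void Run(DeviceType use_device) {
#ifdef PADDLE_WITH_CUDA
  if (use_device == DeviceType::kCUDA) std::cout << "InitCUDA()\n";
#else
  if (use_device == DeviceType::kCUDA)
    throw std::invalid_argument(
        "use_device should not be kCUDA: not compiled with CUDA");
#endif
#ifdef PADDLE_WITH_XPU
  if (use_device == DeviceType::kXPU) std::cout << "InitXPU()\n";
#else
  if (use_device == DeviceType::kXPU)
    throw std::invalid_argument(
        "use_device should not be kXPU: not compiled with XPU");
#endif
  std::cout << "RunImpl()\n";  // the actual op body runs after device setup
}

int main() {
  Run(DeviceType::kCPU);  // the CPU path needs no device-specific init
  return 0;
}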

@ -43,7 +43,8 @@ class Node;
} // namespace ir
namespace details {
using DeviceType = paddle::platform::DeviceType;
namespace p = paddle::platform;
// Wraps ir::Node and provide helper utilities.
// It's responsible for populating necessary fields of ir::Node.
class OpHandleBase {
@ -72,7 +73,7 @@ class OpHandleBase {
virtual std::string Name() const = 0;
void Run(ExecutionStrategy::UseDevice use_device);
void Run(DeviceType use_device);
virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
@ -145,6 +146,7 @@ class OpHandleBase {
virtual void RunImpl() = 0;
virtual void InitCUDA();
virtual void InitXPU();
ir::Node *node_;
std::vector<VarHandleBase *> inputs_;

@ -212,10 +212,64 @@ void ReduceOpHandle::RunImpl() {
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else if (paddle::platform::is_xpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_XPU_BKCL)
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
out_var_handle->place(), pre_in.type());
auto out_p = out_var_handle->place();
int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
void *buffer = const_cast<void *>(lod_tensor.data<void>());
void *recvbuffer = nullptr;
if (root_id == dev_id) {
recvbuffer =
out_var->GetMutable<framework::LoDTensor>()->mutable_data(
out_var_handle->place());
}
int type = platform::ToBKCLDataType(lod_tensor.type());
size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id,
&bkcl_ctx] {
PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer,
numel, static_cast<BKCLDataType>(type),
BKCL_ADD, root_id, nullptr),
BKCL_SUCCESS, platform::errors::Unavailable(
"bkcl_all_reduce failed"));
});
}
WaitInputVarGenerated();
this->RunAndRecordEvent([&] {
PADDLE_ENFORCE_EQ(
bkcl_group_start(), BKCL_SUCCESS,
platform::errors::Unavailable("bkcl_group_start failed"));
for (auto &call : all_reduce_calls) {
call();
}
PADDLE_ENFORCE_EQ(
bkcl_group_end(), BKCL_SUCCESS,
platform::errors::Unavailable("bkcl_group_end failed"));
});
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with XPU."));
#endif
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
"The place of tensor should be CPUPlace, CUDAPlace or XPUPlace, but "
"got %s.",
lod_tensors[0]->place()));
}
}
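The XPU reduce branch keeps the NCCL convention that only the root device supplies a receive buffer; every other device passes a null receive pointer and only contributes its send buffer to bkcl_reduce. A stand-alone sketch of that convention follows, with the grouped bkcl_reduce calls replaced by a plain accumulation onto the root.

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const int root_id = 0;
  const int num_devices = 4;
  // Each device holds one gradient value to be summed onto the root.
  std::vector<float> send_buffers(num_devices, 1.0f);

  float root_result = 0.0f;
  for (int dev = 0; dev < num_devices; ++dev) {
    // Only the root owns a receive buffer; the others pass nullptr.
    float* recvbuffer = (dev == root_id) ? &root_result : nullptr;
    // Stand-in for bkcl_reduce(..., BKCL_ADD, root_id, ...).
    if (recvbuffer != nullptr) {
      *recvbuffer = std::accumulate(send_buffers.begin(), send_buffers.end(), 0.0f);
    }
  }
  std::cout << "reduced value on root: " << root_result << "\n";
  return 0;
}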

@ -41,6 +41,8 @@ struct NCCLContextMap;
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
namespace paddle {
@ -93,6 +95,22 @@ struct ReduceOpHandle : public OpHandleBase {
}
}
}
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLContextMap *bkcl_ctxs_;
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::BKCLContextMap *bkcl_ctxs)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
bkcl_ctxs_(bkcl_ctxs) {
if (bkcl_ctxs_) {
for (auto &p_ctx : bkcl_ctxs_->contexts_) {
this->SetDeviceContext(platform::XPUPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}
#else
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)

@ -25,7 +25,7 @@ namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
using DeviceType = paddle::platform::DeviceType;
// test data amount
const f::DDim kDims = {20, 20};
@ -198,7 +198,7 @@ struct TestReduceOpHandle {
out_selected_rows->mutable_value()->ShareDataWith(
in_selected_rows->value());
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();
@ -263,7 +263,7 @@ struct TestReduceOpHandle {
out_lodtensor->ShareDataWith(in_lodtensor);
UseDevice use_device = UseDevice::kCPU;
DeviceType use_device = p::kCPU;
op_handle_->Run(use_device);
WaitAll();

@ -30,6 +30,7 @@ DECLARE_double(eager_delete_tensor_gb);
namespace paddle {
namespace framework {
namespace p = paddle::platform;
static std::vector<platform::Place> CreatePlaces(size_t num, bool use_cuda) {
std::vector<platform::Place> result;
@ -88,8 +89,7 @@ class ReferenceCountPassTestHelper {
FLAGS_eager_delete_tensor_gb = -1;
details::ExecutionStrategy exec_strategy;
exec_strategy.use_device_ =
use_cuda ? (ExecutionStrategy::kCUDA) : (ExecutionStrategy::kCPU);
exec_strategy.use_device_ = use_cuda ? p::kCUDA : p::kCPU;
executor_.reset(new ParallelExecutor(CreatePlaces(1, use_cuda), {}, "",
&scope_, {}, exec_strategy,

@ -41,6 +41,9 @@ class FuseAllReduceOpPass : public ir::Pass {
#if defined(PADDLE_WITH_NCCL)
auto *multi_nccl_ctxs =
&Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
#elif defined(PADDLE_WITH_XPU_BKCL)
auto *multi_bkcl_ctxs =
&Get<platform::BKCLCommunicator>(details::kBKCLCtxs);
#endif
ir::Graph &result = *graph;
@ -92,6 +95,9 @@ class FuseAllReduceOpPass : public ir::Pass {
#if defined(PADDLE_WITH_NCCL)
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, multi_nccl_ctxs, &result);
#elif defined(PADDLE_WITH_XPU_BKCL)
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, multi_bkcl_ctxs, &result);
#else
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, &result);
@ -154,6 +160,8 @@ class FuseAllReduceOpPass : public ir::Pass {
const std::vector<ir::Node *> &all_reduce_ops,
#if defined(PADDLE_WITH_NCCL)
const platform::NCCLCommunicator *multi_nccl_ctxs,
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLCommunicator *multi_bkcl_ctxs,
#endif
ir::Graph *result) const {
std::vector<details::VarHandleBase *> inputs;
@ -182,6 +190,9 @@ class FuseAllReduceOpPass : public ir::Pass {
#if defined(PADDLE_WITH_NCCL)
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, multi_nccl_ctxs, result);
#elif defined(PADDLE_WITH_XPU_BKCL)
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, multi_bkcl_ctxs, result);
#else
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, result);
@ -197,12 +208,18 @@ class FuseAllReduceOpPass : public ir::Pass {
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_NCCL)
const platform::NCCLCommunicator *multi_nccl_ctxs,
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLCommunicator *multi_bkcl_ctxs,
#endif
ir::Graph *result) const {
#if defined(PADDLE_WITH_NCCL)
auto *op_handle = new details::FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
#elif defined(PADDLE_WITH_XPU_BKCL)
auto *op_handle = new details::FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce, multi_bkcl_ctxs);
#else
auto *op_handle = new details::FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
@ -221,6 +238,10 @@ class FuseAllReduceOpPass : public ir::Pass {
if (!multi_nccl_ctxs) {
SetCommunicationContext(places, op_handle);
}
#elif defined(PADDLE_WITH_XPU_BKCL)
if (!multi_bkcl_ctxs) {
SetCommunicationContext(places, op_handle);
}
#else
SetCommunicationContext(places, op_handle);
#endif

Some files were not shown because too many files have changed in this diff.
