@@ -111,8 +111,8 @@ class ParallelExecutorPrivate {
     std::vector<ncclUniqueId *> flat_nccl_ids;
     if (nranks_ == 1) {
       // FIXME(gongwb): need not to create ncclid when nranks==1
-      nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                              bst.trainer_id_);
+      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
       return;
     }
 
@@ -132,16 +132,16 @@ class ParallelExecutorPrivate {
 
       flat_nccl_ids.push_back(nccl_id);
 
-      nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                              bst.trainer_id_);
+      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
       VLOG(1) << "init bst nccl context complete!";
       return;
     }
 
     // num_trainers ==1 && places > 1
     if (bst.num_trainers_ == 1) {
-      nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                              bst.trainer_id_);
+      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
       return;
     }
 
@@ -153,8 +153,8 @@ class ParallelExecutorPrivate {
       flat_nccl_ids.push_back(nccl_id);
     }
 
-    nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                            bst.trainer_id_);
+    nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
+                             bst.trainer_id_);
 
     if (bst.use_hierarchical_allreduce_) {
       std::vector<ncclUniqueId *> inter_nccl_ids;
@@ -175,12 +175,30 @@ class ParallelExecutorPrivate {
         exter_nccl_ids.push_back(nccl_id);
       }
 
-      nccl_ctxs_.InitHierarchicalCtxs(places_, inter_nccl_ids, exter_nccl_ids,
-                                      bst.num_trainers_, bst.trainer_id_,
-                                      bst.hierarchical_allreduce_inter_nranks_,
-                                      bst.hierarchical_allreduce_exter_nranks_);
+      nccl_ctxs_->InitHierarchicalCtxs(
+          places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_,
+          bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_,
+          bst.hierarchical_allreduce_exter_nranks_);
     }
   }
+
+  void InitOrGetNCCLCommunicator(framework::Scope *scope,
+                                 const BuildStrategy &bst) {
+    const std::string var_name = "NCCLCommunicator";
+    auto var = scope->FindVar(var_name);
+    if (var != nullptr) {
+      PADDLE_ENFORCE(var->IsInitialized(),
+                     "if %s exists, it must be initialized", var_name);
+      VLOG(1) << "find " << var_name
+              << " in scope, so use it and does not recreate!";
+      nccl_ctxs_ = var->GetMutable<platform::NCCLCommunicator>();
+      return;
+    }
+
+    VLOG(1) << "not find " << var_name << " in scope, so recreate it!";
+    nccl_ctxs_ = scope->Var(var_name)->GetMutable<platform::NCCLCommunicator>();
+    InitNCCLCtxs(scope, bst);
+  }
 #endif
 
   BuildStrategy build_strategy_;
@@ -190,7 +208,7 @@ class ParallelExecutorPrivate {
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  platform::MultiNCCLContextMap nccl_ctxs_;
+  platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
   bool use_cuda_;
@@ -281,27 +299,6 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
   return executor && executor->NeedCreateLocalExeScope();
 }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-/*
- * When nccl inits nccl comm using ncclCommInitAll, it meets error when
- * allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
- * create a new nccl comm for sync_batch_norm_op. And these codes should be
- * polished with a unified nccl management.
- */
-platform::NCCLContextMap *ParallelExecutor::GetNCCLContextForSyncbatchNomrOp(
-    framework::Scope *scope) {
-  auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
-  if (nccl_id_var != nullptr) {
-    return member_->nccl_ctxs_.DefaultFlatCtx();
-  }
-
-  if (dev_nccl_ctxs_.get() == nullptr) {
-    dev_nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
-  }
-  return dev_nccl_ctxs_.get();
-}
-#endif
-
 ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const std::vector<std::string> &bcast_vars,
                                    const std::string &loss_var_name,
@@ -375,7 +372,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   if (member_->use_cuda_) {
     // Bcast Parameters to all GPUs
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    member_->InitNCCLCtxs(scope, build_strategy);
+    member_->InitOrGetNCCLCommunicator(scope, build_strategy);
 
     // Initialize device context's nccl comm, will be used by normal
     // Operators like sync_batch_norm, and collective ops.
@@ -384,7 +381,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     // NOTE: NCCL group-calls and non-group-calls can not use the same
     // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
     // same communicators.
-    auto *nccl_ctxs = GetNCCLContextForSyncbatchNomrOp(scope);
+    auto *nccl_ctxs =
+        member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_);
     for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
       platform::DeviceContextPool &pool =
           platform::DeviceContextPool::Instance();
@@ -421,18 +419,18 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     VLOG(3) << "use local async mode";
     graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
                                  {member_->local_scopes_[0]}, 1,
-                                 member_->use_cuda_, &member_->nccl_ctxs_);
+                                 member_->use_cuda_, member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] =
           build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
                                {member_->local_scopes_[i]}, 1,
-                               member_->use_cuda_, &member_->nccl_ctxs_);
+                               member_->use_cuda_, member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
                                  member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_, &member_->nccl_ctxs_);
+                                 member_->use_cuda_, member_->nccl_ctxs_);
   }
 #else
   if (build_strategy.async_mode_) {
@@ -565,7 +563,7 @@ void ParallelExecutor::BCastParamsToDevices(
       PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
                         "variables' buffer size to bcast NOT equal to places");
       {
-        auto *nccl_ctxs = member_->nccl_ctxs_.DefaultFlatCtx();
+        auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx();
         platform::NCCLGroupGuard guard;
         for (size_t i = 0; i < member_->places_.size(); ++i) {
           auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]);