@@ -181,13 +181,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
-ParallelExecutor::ParallelExecutor(
-    const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope,
-    const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    ir::Graph *graph)
+ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+                                   const std::vector<std::string> &bcast_vars,
+                                   const std::string &loss_var_name,
+                                   Scope *scope,
+                                   const std::vector<Scope *> &local_scopes,
+                                   const ExecutionStrategy &exec_strategy,
+                                   const BuildStrategy &build_strategy,
+                                   ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   }
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_trainers would be greater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor holds multiple
+      // devices.
+      return true;
+    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
+
   // Startup Program has been run. All local scopes have correct parameters.
 
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
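
For readers of this hunk, here is a standalone sketch of the broadcast decision it introduces, reduced to plain values. It is only an illustration under assumed names: NeedBroadcast, num_trainers, num_local_scopes and user_passed_scopes stand in for the lambda, build_strategy.num_trainers_, member_->local_scopes_.size() and !local_scopes.empty(); none of them are PaddlePaddle APIs.

#include <cstddef>
#include <iostream>

// Hypothetical stand-in for the need_broadcast lambda added in the patch.
bool NeedBroadcast(std::size_t num_trainers, std::size_t num_local_scopes,
                   bool user_passed_scopes) {
  if (num_trainers > 1) {
    // Multi-trainer (e.g. NCCL distributed) run: every trainer must start
    // from the parameters broadcast by trainer 0.
    return true;
  }
  if (num_local_scopes != 1 && !user_passed_scopes) {
    // Single trainer, but the executor created one scope per device itself,
    // so device 0's parameters are copied to the other devices.
    return true;
  }
  return false;
}

int main() {
  std::cout << NeedBroadcast(2, 1, false) << "\n";  // 1: distributed training
  std::cout << NeedBroadcast(1, 4, false) << "\n";  // 1: one trainer, many devices
  std::cout << NeedBroadcast(1, 1, false) << "\n";  // 0: nothing to broadcast
}
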
@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
 }
 
 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
   // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
       auto place = member_->places_[i];
       void *buffer;
 
-      if (i == 0) {
+      if (i == 0 && trainer_id == 0) {
         buffer = const_cast<void *>(main_tensor.data<void>());
       } else {
         auto local_scope = member_->local_scopes_[i];
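
The final hunk changes which replica may reuse the parameter tensor's existing storage as the broadcast source: with the new trainer_id argument, only device 0 of trainer 0 does, and every other device (including device 0 of the other trainers) receives into its own allocation. The sketch below illustrates that selection rule only; FakeTensor and PickBcastBuffer are invented placeholders, not the framework's types.

#include <cstddef>
#include <vector>

// Invented placeholder for a parameter tensor; not framework::LoDTensor.
struct FakeTensor {
  std::vector<float> storage;
  void *data() { return storage.data(); }
};

// Returns the buffer handed to the broadcast for one device replica.
void *PickBcastBuffer(std::size_t device_idx, int trainer_id,
                      FakeTensor &main_tensor, FakeTensor &local_tensor) {
  if (device_idx == 0 && trainer_id == 0) {
    // The globally rank-0 copy is the broadcast root; its existing storage
    // becomes the send buffer.
    return main_tensor.data();
  }
  // Every other replica receives into its own freshly sized allocation.
  local_tensor.storage.resize(main_tensor.storage.size());
  return local_tensor.data();
}

int main() {
  FakeTensor main_tensor{std::vector<float>(16, 1.0f)};
  FakeTensor replica;
  void *src = PickBcastBuffer(0, 0, main_tensor, replica);  // reuses main storage
  void *dst = PickBcastBuffer(1, 0, main_tensor, replica);  // separate allocation
  return src != dst ? 0 : 1;
}
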