/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/parallel_executor.h"
|
|
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <vector>
|
|
|
|
#include "paddle/fluid/framework/ir/graph.h"
|
|
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
#include "paddle/fluid/platform/nccl_helper.h"
|
|
#endif
|
|
|
|
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
|
|
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
|
|
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
|
|
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
|
|
#include "paddle/fluid/platform/profiler.h"
|
|
|
|
namespace paddle {
namespace framework {

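// Build the executable graph for ParallelExecutor by running a fixed sequence
// of IR passes over the ProgramDesc:
//   1. graph_viz_pass (optional)           - dumps the original graph when a
//                                            debug_graphviz_path_ is set.
//   2. multi_device_pass                   - rewrites the graph into a
//                                            multi-device SSA graph.
//   3. multi_device_print_pass (optional)  - dumps the graph with device info.
//   4. multi_device_check_pass             - validates the resulting graph.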
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &param_names,
    const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
    const BuildStrategy &strategy) {
#endif
  // Convert the program to graph.
  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));

  // Apply a graph viz pass to record a graph.
  if (!strategy.debug_graphviz_path_.empty()) {
    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
    const std::string graph_path = string::Sprintf(
        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
    graph = viz_pass->Apply(std::move(graph));
  }

  // Convert graph to run on multi-devices.
  auto multi_device_pass =
      ir::PassRegistry::Instance().Get("multi_device_pass");
  multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
                                                                     &places);
  multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
                                                    &loss_var_name);
  multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
      "params", &param_names);
  multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
                                                             &local_scopes);
  multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);

#ifdef PADDLE_WITH_CUDA
  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
  multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
  graph = multi_device_pass->Apply(std::move(graph));

  // Apply a graph print pass to record a graph with device info.
  if (!strategy.debug_graphviz_path_.empty()) {
    auto multi_device_print_pass =
        ir::PassRegistry::Instance().Get("multi_device_print_pass");
    multi_device_print_pass->SetNotOwned<const std::string>(
        "debug_graphviz_path", &strategy.debug_graphviz_path_);
    multi_device_print_pass->Set<details::GraphvizSSAGraphPrinter>(
        "graph_printer", new details::GraphvizSSAGraphPrinter);
    graph = multi_device_print_pass->Apply(std::move(graph));
  }

  // Verify that the graph is correct for multi-device executor.
  auto multi_device_check_pass =
      ir::PassRegistry::Instance().Get("multi_device_check_pass");
  graph = multi_device_check_pass->Apply(std::move(graph));
  return graph;
}

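// Private state of ParallelExecutor (pimpl idiom): the places to run on, one
// local scope per place, the wrapped SSA graph executor, and (with CUDA) the
// NCCL context map shared by all devices.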
class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
      : places_(places) {}

  std::vector<platform::Place> places_;
  std::vector<Scope *> local_scopes_;
  Scope *global_scope_;
  std::unique_ptr<details::SSAGraphExecutor> executor_;

#ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
  bool own_local_scope_;
  bool use_cuda_;
  bool use_all_reduce_;
};

std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
  return member_->local_scopes_;
}

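// Construction happens in three steps (mirroring the comments below):
//   Step 1. Create (or adopt) one local scope per place; for multi-device
//           runs created here, broadcast `bcast_vars` from device 0. GPU runs
//           first set up the NCCL context map.
//   Step 2. Record name/type/persistable info for every variable in block 0.
//   Step 3. Apply the parallel-executor passes to build the SSA graph, then
//           wrap a ThreadedSSAGraphExecutor inside a
//           ScopeBufferedSSAGraphExecutor that manages the local scopes.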
ParallelExecutor::ParallelExecutor(
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
    const std::unordered_set<std::string> &bcast_vars,
    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes,
    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
    size_t num_trainers, size_t trainer_id)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
  member_->use_cuda_ = exec_strategy.use_cuda_;
  member_->use_all_reduce_ =
      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;

  if (!member_->use_all_reduce_) {
    PADDLE_ENFORCE(places.size() > 1,
                   "If you set build_strategy.reduce with 'Reduce', "
                   "the number of places must be greater than 1.");
  }

  // Step 1. Bcast the params to devs.
  // Create local scopes.
  if (local_scopes.empty()) {
    member_->own_local_scope_ = true;
    member_->local_scopes_.emplace_back(member_->global_scope_);
    for (size_t i = 1; i < member_->places_.size(); ++i) {
      member_->local_scopes_.emplace_back(&scope->NewScope());
    }
  } else {
    member_->own_local_scope_ = false;
    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
    }
  }

  if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
    auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
    ncclUniqueId *nccl_id = nullptr;
    if (nccl_id_var != nullptr) {
      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
    }
    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
        member_->places_, nccl_id, num_trainers, trainer_id));
#else
    PADDLE_THROW("Not compiled with CUDA");
#endif
  }

  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
    BCastParamsToDevices(bcast_vars);
  }
  // Startup Program has been run. All local scopes have correct parameters.

  // Step 2. Create vars in each scope.
  std::vector<details::VariableInfo> var_infos;
  for (auto *var : main_program.Block(0).AllVars()) {
    var_infos.emplace_back();
    var_infos.back().name_ = var->Name();
    var_infos.back().type_ = var->GetType();
    var_infos.back().persistable_ = var->Persistable();
  }

// Step 3. Convert main_program to SSA form and a dependency graph. Also,
// insert ncclOps.
#ifdef PADDLE_WITH_CUDA
  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, build_strategy,
      member_->nccl_ctxs_.get());
#else
  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, build_strategy);
#endif

  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places, std::move(graph)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));
}

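// Broadcast the given variables to every device scope. For GPU tensors this
// issues a grouped ncclBcast across all places; for CPU tensors it copies the
// data in all-reduce (or CUDA) mode and for @LR_DECAY_COUNTER@, and otherwise
// lets the local scopes share the source buffer.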
void ParallelExecutor::BCastParamsToDevices(
    const std::unordered_set<std::string> &vars) const {
  // During the initializing bcast, all vars are bcast from device(0);
  // otherwise they are bcast from the specified device.
  bool initializing = member_->executor_ == nullptr;
  for (auto &var : vars) {
    int var_dev_id = -1;
    if (member_->executor_) {
      auto &sharded_var_device =
          member_->executor_->Graph().Get<details::ShardedVarDevice>(
              details::kShardedVarDevice);
      if (sharded_var_device.find(var) != sharded_var_device.end()) {
        var_dev_id = sharded_var_device.at(var);
      }
    }

    if (!initializing && var_dev_id == -1) continue;

    framework::Variable *main_var = nullptr;
    if (initializing) {
      main_var = member_->local_scopes_[0]->FindVar(var);
    } else {
      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
    }

    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
      continue;
    }

    auto &main_tensor = main_var->Get<LoDTensor>();
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
      std::vector<void *> buffers;
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;

        if ((initializing && i == 0) ||
            (!initializing && static_cast<int>(i) == var_dev_id)) {
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];
          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
        buffers.push_back(buffer);
      }

      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
                        "The number of buffers to bcast is not equal to the "
                        "number of places.");
      {
        platform::NCCLGroupGuard guard;
        for (size_t i = 0; i < member_->places_.size(); ++i) {
          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
          if (initializing) {
            platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
                                         nccl_ctx.comm_, nccl_ctx.stream());
          } else {
            if (var_dev_id >= 0) {
              platform::dynload::ncclBcast(buffers[i], numel, data_type,
                                           var_dev_id, nccl_ctx.comm_,
                                           nccl_ctx.stream());
            }
          }
        }
        member_->nccl_ctxs_->WaitAll();
      }

#else
      PADDLE_THROW("Not compiled with CUDA");
#endif
    } else {
      platform::CPUPlace cpu;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        if ((initializing && i == 0) ||
            (!initializing && static_cast<int>(i) == var_dev_id))
          continue;

        auto local_scope = member_->local_scopes_[i];
        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();

        // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
        if (member_->use_all_reduce_ || member_->use_cuda_ ||
            var == "@LR_DECAY_COUNTER@") {
          t->Resize(dims);
          t->mutable_data(cpu, main_tensor.type());
          paddle::framework::TensorCopy(main_tensor, cpu, t);
        } else {
          t->ShareDataWith(main_tensor);
        }
      }
    }
  }
}

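// Run one iteration: execute the SSA graph, then store the fetched tensors as
// a FeedFetchList in `fetched_var_name` inside the global scope so callers can
// read the results.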
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
  platform::RecordBlock b(0);
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
}

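// Feed already-split input: tensors[i] is the feed map for device i, and each
// tensor is shared (not copied) into that device's local scope.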
void ParallelExecutor::FeedTensorsIntoLocalScopes(
    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());

  for (size_t i = 0; i < tensors.size(); ++i) {
    auto &map = tensors[i];
    auto *scope = member_->local_scopes_[i];
    for (auto &pair : map) {
      auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
      trg->ShareDataWith(pair.second);
      trg->set_lod(pair.second.lod());
    }
  }
}

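// Feed a whole batch: each tensor is split across the places with
// SplitLoDTensor, and slice j is shared into the local scope of device j.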
void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
    const std::unordered_map<std::string, LoDTensor> &tensors) {
  for (auto pair : tensors) {
    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
    PADDLE_ENFORCE_EQ(
        member_->places_.size(), lod_tensors.size(),
        "The number of samples in the current batch is less than the number "
        "of devices; this is currently not allowed. (%d vs %d)",
        member_->places_.size(), lod_tensors.size());
    for (size_t j = 0; j < member_->places_.size(); ++j) {
      // TODO(panxy0718): Do I need to delete this var?
      auto t =
          member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
      t->ShareDataWith(lod_tensors[j]);
      t->set_lod(lod_tensors[j].lod());
    }
  }
}

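// Only scopes created by this ParallelExecutor are deleted; local_scopes_[0]
// is the caller-owned global scope, so deletion starts at index 1.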
ParallelExecutor::~ParallelExecutor() {
  if (member_->own_local_scope_) {
    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
    }
  }
}

}  // namespace framework
}  // namespace paddle

USE_PASS(graph_viz_pass);
USE_PASS(multi_device_pass);
USE_PASS(multi_device_check_pass);
USE_PASS(multi_device_print_pass);