@@ -63,6 +63,8 @@ static bool gProfileStarted = false;
 std::once_flag p2p_init_flag;
 #endif
 
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+
 class ParallelExecutorPrivate {
  public:
   ParallelExecutorPrivate(const std::vector<platform::Place> &places,
@@ -93,6 +95,8 @@ class ParallelExecutorPrivate {
     }
   }
 
+  bool IsUseCUDA(UseDevice use_device);
+
   void SetHasFeed(size_t dev_idx, bool has_feed = true);
 
   bool AllowPartialFeed() const;
@@ -286,7 +290,7 @@ class ParallelExecutorPrivate {
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
-  bool use_cuda_;
+  UseDevice use_device_;
   bool use_all_reduce_;
   size_t nranks_;
 
@@ -296,6 +300,10 @@ class ParallelExecutorPrivate {
   details::ParallelSSAGraphExecutor *inference_executor_{nullptr};
 };
 
+bool ParallelExecutorPrivate::IsUseCUDA(UseDevice use_device) {
+  return use_device == UseDevice::kCUDA;
+}
+
 void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) {
   if (inference_executor_) {
     inference_executor_->SetHasFeed(dev_idx, has_feed);
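
Reviewer note on the two hunks above: the patch generalizes the boolean `use_cuda_` flag into a `UseDevice` value and routes every "is this CUDA?" question through the new `IsUseCUDA` helper. A minimal sketch of the assumed shape of the enum; its real definition lives in `ExecutionStrategy` and is not part of this diff, and `kXPU` is inferred from the XPU branches added later in the patch:

    // Sketch only; the actual enum is declared in ExecutionStrategy.
    enum class UseDevice { kCPU, kCUDA, kXPU };

    // The helper keeps call sites terse once the bool member is gone:
    bool IsUseCUDA(UseDevice d) { return d == UseDevice::kCUDA; }
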
@@ -340,7 +348,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
     auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
     addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
     addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-    addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    addto_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply inplace_addto_op_pass";
     graph = addto_pass->Apply(graph);
     VLOG(10) << "inplace_addto_op_pass Applied";
@@ -351,7 +359,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
         ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
     inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
     inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-    inplace_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply buffer_shared_inplace_pass";
     graph = inplace_pass->Apply(graph);
     VLOG(10) << "buffer_shared_inplace_pass Applied";
@@ -366,7 +374,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
                                             &mem_opt_var_infos_);
     cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars,
                                             &last_live_ops_of_vars);
-    cross_op_memory_reuse_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    cross_op_memory_reuse_pass->Set(ir::kUseCuda,
+                                    new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass";
     graph = cross_op_memory_reuse_pass->Apply(graph);
     VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied";
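
The change from `SetNotOwned(ir::kUseCuda, &use_cuda_)` to `Set(ir::kUseCuda, new bool(...))` in the three memory-optimization hunks above follows directly from the member change: there is no longer a long-lived `bool` field for the pass to borrow. As the names suggest, `SetNotOwned` stores a borrowed pointer whose lifetime the caller must guarantee, while `Set` hands the pass ownership of the allocation, so the freshly computed value is safe to pass as a temporary heap object:

    // Borrowed vs. owned pass attributes (simplified illustration):
    pass->SetNotOwned(ir::kUseCuda, &long_lived_member);  // caller keeps ownership
    pass->Set(ir::kUseCuda, new bool(is_cuda));           // pass owns and frees it
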
@@ -386,8 +395,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       continue;
     }
     std::unique_ptr<GarbageCollector> gc;
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
@@ -396,9 +405,22 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
             BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
       }
       VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-    } else {
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use CUDA device since it's not compiled with CUDA,"
+          "Please recompile or reinstall Paddle with GPU support."));
+#endif
+    } else if (platform::is_xpu_place(place)) {
+#if defined(PADDLE_WITH_XPU)
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size));
+      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use XPU device since it's not compiled with XPU,"
+          "Please recompile or reinstall Paddle with XPU support."));
 #endif
-    if (platform::is_cpu_place(place)) {
+    } else if (platform::is_cpu_place(place)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
       VLOG(10) << "Created GarbageCollector at " << place;
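
Taken together, the garbage-collector hunks restructure the dispatch so that every place type gets a symmetric branch: the `#ifdef` guards now sit inside the `if`/`else if` chain rather than around it, and running on a device the binary was not compiled for raises a runtime `PADDLE_THROW` instead of silently falling through. Condensed shape of the new control flow (error strings elided):

    if (platform::is_gpu_place(place)) {
      // CUDA collector when built with CUDA, PADDLE_THROW otherwise
    } else if (platform::is_xpu_place(place)) {
      // XPU collector when built with XPU, PADDLE_THROW otherwise
    } else if (platform::is_cpu_place(place)) {
      // CPUGarbageCollector, available in every build
    } else {
      // PreconditionNotMet: unsupported place
    }
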
@@ -406,10 +428,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "Unsupported place for garbage collection"));
     }
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
 
     gcs_.emplace(place, std::move(gc));
   }
@@ -510,13 +528,10 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
-  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
-                 platform::errors::Unavailable(
-                     "XPU is not supported in ParallelExecutor"));
   InitP2P(places);
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
-  member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->use_device_ = exec_strategy.use_device_;
   member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
                              BuildStrategy::ReduceStrategy::kAllReduce;
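
Deleting the `PADDLE_ENFORCE(... !is_xpu_place(places[0]) ...)` check is the user-visible half of this patch: the constructor now accepts XPU places, and the device choice travels in `exec_strategy.use_device_` instead of the old `use_cuda_` boolean. A hypothetical caller after this change (field names taken from the diff, surrounding arguments abridged):

    ExecutionStrategy exec_strategy;
    exec_strategy.use_device_ = UseDevice::kXPU;  // previously rejected up front
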
@@ -529,7 +544,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     member_->use_all_reduce_ = true;
   }
 #if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
-  if (member_->use_cuda_) {
+  if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
         platform::errors::Unavailable("Windows can support Single GPU only."));
@@ -537,7 +552,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 #endif
 
 #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
-  if (member_->use_cuda_) {
+  if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
         platform::errors::PermissionDenied(
@@ -548,10 +563,19 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 #endif
 
+  std::string device_name;
+  if (member_->use_device_ == UseDevice::kCPU) {
+    device_name = "CPU";
+  } else if (member_->use_device_ == UseDevice::kCUDA) {
+    device_name = "CUDA";
+  } else {
+    device_name = "XPU";
+  }
+
   VLOG(1) << string::Sprintf(
       "The Program will be executed on %s using ParallelExecutor, %lu "
       "cards are used, so %lu programs are executed in parallel.",
-      (member_->use_cuda_ ? "CUDA" : "CPU"), places.size(), places.size());
+      device_name, places.size(), places.size());
 
   // Step 1. Bcast the bcast_vars to devs.
   // Create local scopes
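
The `device_name` chain above replaces the old two-way ternary in the log line; note that the final `else` deliberately labels anything that is neither CPU nor CUDA as "XPU". If more device kinds are added later, a switch-based helper would be the natural refactor (hypothetical suggestion, not part of this patch):

    static const char *DeviceName(UseDevice d) {
      switch (d) {
        case UseDevice::kCPU:  return "CPU";
        case UseDevice::kCUDA: return "CUDA";
        default:               return "XPU";  // same fallback as the patch
      }
    }
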
@@ -575,7 +599,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 
   std::vector<ir::Graph *> graphs;
   if (member_->build_strategy_.async_mode_) {
-    PADDLE_ENFORCE_EQ(member_->use_cuda_, false,
+    PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false,
                       platform::errors::Unavailable(
                           "gpu mode does not support async_mode_ now!"));
     graphs.push_back(graph);
@@ -598,7 +622,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
             << "you can force it off by env FLAGS_enable_parallel_graph=0";
   }
 
-  if (member_->use_cuda_ && member_->nranks_ > 1) {
+  if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_NCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
 
@@ -647,36 +671,39 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_,
-        member_->nccl_ctxs_);
+        {member_->local_scopes_[0]}, 1,
+        member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_,
-          member_->nccl_ctxs_);
+          {member_->local_scopes_[i]}, 1,
+          member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_);
+        member_->nranks_, member_->IsUseCUDA(member_->use_device_),
+        member_->nccl_ctxs_);
   }
 #else
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_);
+        {member_->local_scopes_[0]}, 1,
+        member_->IsUseCUDA(member_->use_device_));
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_);
+          {member_->local_scopes_[i]}, 1,
+          member_->IsUseCUDA(member_->use_device_));
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_);
+        member_->nranks_, member_->IsUseCUDA(member_->use_device_));
   }
 #endif
@@ -874,7 +901,8 @@ void ParallelExecutor::BCastParamsToDevices(
       // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
      if (member_->build_strategy_.async_mode_) {
         share_memory();
-      } else if (member_->use_all_reduce_ || member_->use_cuda_ ||
+      } else if (member_->use_all_reduce_ ||
+                 member_->IsUseCUDA(member_->use_device_) ||
                  var == "@LR_DECAY_COUNTER@") {
         copy_memory();
       } else {
@@ -1105,7 +1133,7 @@ bool ParallelExecutor::EnableParallelGraphExecution(
     }
   }
 
-  if (!member_->use_all_reduce_ || !member_->use_cuda_) {
+  if (!member_->use_all_reduce_ || !member_->IsUseCUDA(member_->use_device_)) {
     if (build_strategy.enable_sequential_execution_ ||
         exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) {
       enable_parallel_graph = false;