|
|
|
@ -20,7 +20,7 @@ limitations under the License. */
|
|
|
|
|
|
|
|
|
|
#include "paddle/fluid/framework/ir/graph.h"
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
|
|
|
|
#include "paddle/fluid/platform/nccl_helper.h"
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -54,7 +54,7 @@ class ParallelExecutorPrivate {
|
|
|
|
|
Scope *global_scope_; // not owned
|
|
|
|
|
std::unique_ptr<details::SSAGraphExecutor> executor_;
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
|
|
|
|
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
|
|
|
|
|
#endif
|
|
|
|
|
bool own_local_scope_;
|
|
|
|
@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
|
|
|
|
|
|
|
|
|
|
if (member_->use_cuda_) {
|
|
|
|
|
// Bcast Parameters to all GPUs
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
|
|
|
|
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
|
|
|
|
|
ncclUniqueId *nccl_id = nullptr;
|
|
|
|
|
if (nccl_id_var != nullptr) {
|
|
|
|
@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor(
|
|
|
|
|
|
|
|
|
|
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
|
|
|
|
|
// ncclOp
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
|
|
|
|
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
|
|
|
|
|
main_program, member_->places_, loss_var_name, params,
|
|
|
|
|
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
|
|
|
|
@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices(
|
|
|
|
|
}
|
|
|
|
|
auto &dims = main_tensor.dims();
|
|
|
|
|
if (paddle::platform::is_gpu_place(main_tensor.place())) {
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
|
|
|
|
std::vector<void *> buffers;
|
|
|
|
|
size_t numel = main_tensor.numel();
|
|
|
|
|
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
|
|
|
|
|