add WITH_NCCL option for cmake. (#22384)

Adds a WITH_NCCL option to CMake to explicitly control whether the NCCL-related code is compiled. WITH_NCCL defaults to ON, but it is forced OFF when WITH_GPU is OFF.

Adds the PADDLE_WITH_NCCL preprocessor definition.

Single-machine, single-GPU builds can now be compiled without NCCL. Multi-GPU builds need NCCL enabled (the default); if NCCL is disabled, only a single GPU can be used.
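For example (illustrative configure invocations; only WITH_GPU and WITH_NCCL are touched by this patch): `cmake .. -DWITH_GPU=ON -DWITH_NCCL=OFF` builds a single-GPU Paddle with the NCCL code paths compiled out, plain `cmake .. -DWITH_GPU=ON` keeps the default WITH_NCCL=ON for multi-GPU use, and `cmake .. -DWITH_GPU=OFF` forces WITH_NCCL off automatically.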

Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
Author: Wilber (committed via GitHub)
commit 7bc4b09500 (parent c8b90d8f9a)

@@ -89,6 +89,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -121,6 +122,27 @@ if(WIN32)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
     endif()
+    if(WITH_NCCL)
+        MESSAGE(WARNING
+            "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
+        set(WITH_NCCL OFF CACHE STRING
+            "Disable NCCL when compiling for Windows" FORCE)
+    endif()
 endif()
+
+if (NOT WITH_GPU AND WITH_NCCL)
+    MESSAGE(WARNING
+        "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
+    set(WITH_NCCL OFF CACHE STRING
+        "Disable NCCL when compiling without GPU" FORCE)
+endif()
+
+if(WITH_NCCL)
+    add_definitions("-DPADDLE_WITH_NCCL")
+else()
+    if(WITH_GPU)
+        MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.")
+    endif()
+endif()
 
 if(WITH_BRPC_RDMA)
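The source changes below then swap every `defined(PADDLE_WITH_CUDA) && !defined(_WIN32)` guard for the new macro. A minimal sketch of the resulting pattern, assuming a hypothetical helper function that is not part of this patch:

    #if defined(PADDLE_WITH_NCCL)
    #include "paddle/fluid/platform/nccl_helper.h"  // NCCL-only header, kept behind the guard
    #endif

    // Hypothetical illustration: NCCL-dependent code compiles only when the
    // PADDLE_WITH_NCCL define (added by the CMake logic above) is set.
    bool NCCLAvailable() {
    #if defined(PADDLE_WITH_NCCL)
      return true;   // multi-card all-reduce/broadcast paths are compiled in
    #else
      return false;  // single-card build; NCCL code paths are compiled out
    #endif
    }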

@@ -28,7 +28,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
@@ -121,7 +121,7 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<platform::Place> &places,
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
@@ -161,7 +161,7 @@ void AllReduceOpHandle::AllReduceFunc(
   VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
     const std::vector<std::function<void()>> &all_reduce_calls) {
   this->RunAndRecordEvent([&] {

@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -29,7 +29,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class AllReduceOpHandle : public NCCLOpHandleBase {
  public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
@@ -54,13 +54,13 @@ class AllReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#ifndef PADDLE_WITH_NCCL
   // NCCLOpHandleBase already have these attributes.
   // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
 #endif
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void NCCLAllReduceFunc(
       const std::vector<std::function<void()>> &all_reduce_calls);

@@ -73,7 +73,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     VarHandle *out_handle = nullptr;
     int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
     std::vector<std::function<void()>> broadcast_calls;

@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -34,7 +34,7 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -70,7 +70,7 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif

@@ -44,7 +44,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   bool use_gpu_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -52,7 +52,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -62,7 +62,7 @@ struct TestBroadcastOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
      if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -86,7 +86,7 @@ struct TestBroadcastOpHandle {
         place_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -107,14 +107,14 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else

@@ -28,7 +28,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,

@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -30,7 +30,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,

@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -35,7 +35,7 @@ namespace details {
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,

@@ -45,14 +45,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not supported.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else

@@ -264,7 +264,7 @@ void ReduceOpHandle::RunImpl() {
       }
     });
   } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto pre_in = pre_in_var->Get<framework::LoDTensor>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(

@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -62,7 +62,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,

@@ -61,7 +61,7 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework

@@ -4,7 +4,9 @@ else()
   cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)
-cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+if(WITH_NCCL)
+  cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+endif()
 if(WITH_BOX_PS)
   cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
 else()

@@ -39,7 +39,7 @@ class AllReduceDepsPass : public ir::Pass {
     std::vector<details::OpHandleBase*> all_reduce_op_handles =
         GetSortedAllReduceOps(*graph);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto use_hierarchical_allreduce =
         Get<bool>(details::kUseHierarchicalAllReduce);
     for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {

@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
@@ -85,7 +85,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       for (auto &p_g : group_p_g) {
         group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       InsertFusedAllReduce(places, local_scopes, group_size,
                            group_all_reduce_ops, multi_nccl_ctxs, &result);
 #else
@@ -134,7 +134,7 @@ class FuseAllReduceOpPass : public ir::Pass {
                             const std::vector<Scope *> &local_scopes,
                             const size_t num_of_all_reduce,
                             const std::vector<ir::Node *> &all_reduce_ops,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                             const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                             ir::Graph *result) const {
@@ -161,7 +161,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       result->RemoveNode(op_handle.Node());
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                            local_scopes, multi_nccl_ctxs, result);
 #else
@@ -177,11 +177,11 @@ class FuseAllReduceOpPass : public ir::Pass {
                               const size_t num_of_all_reduce,
                               const std::vector<platform::Place> &places,
                               const std::vector<Scope *> &local_scopes,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                               const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                               ir::Graph *result) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *op_handle = new details::FusedAllReduceOpHandle(
         result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
         local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
@@ -199,7 +199,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       op_handle->AddOutput(out);
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (!multi_nccl_ctxs) {
       SetCommunicationContext(places, op_handle);
     }

@@ -156,7 +156,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
   places_ = Get<const std::vector<platform::Place>>(details::kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
   strategy_ = Get<const details::BuildStrategy>(kStrategy);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
   nccl_ctxs_ = nullptr;
   if (multi_nccl_ctxs_) {
@@ -298,7 +298,7 @@ std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
 bool MultiDevSSAGraphBuilderBase::UseGPU() const {
   bool use_gpu = false;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
   return use_gpu;
@@ -348,7 +348,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
     details::OpHandleBase *op_handle, const platform::Place &p) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (nccl_ctxs_ == nullptr) {
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
@@ -362,7 +362,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
 void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
                                                     const std::string &p_name,
                                                     size_t src_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -395,7 +395,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
     ir::Graph *result,
     const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -451,7 +451,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
   auto append_allreduce_op = [&](
       const std::vector<Scope *> &scopes,
       const std::vector<platform::Place> &places) -> details::OpHandleBase * {
-#if defined(PADDLE_WITH_DGC)
+#if defined(PADDLE_WITH_DGC) && defined(PADDLE_WITH_NCCL)
     if (is_encoded) {
       result->Get<GraphOps>(kGraphOps).emplace_back(
           new details::SparseAllReduceOpHandle(
@@ -464,7 +464,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
               result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
               scopes, places, multi_nccl_ctxs_));
     }
-#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#elif defined(PADDLE_WITH_NCCL)
     result->Get<GraphOps>(kGraphOps).emplace_back(
         new details::AllReduceOpHandle(
             result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -539,7 +539,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps(
 details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
     ir::Graph *result, const std::string &og, size_t dst_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));

@@ -94,7 +94,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
   void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                          size_t device_id) const;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
   mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
 #endif

@@ -109,7 +109,7 @@ class ParallelExecutorPrivate {
     }
   }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
     VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
             << ", num_trainers:" << bst.num_trainers_
@@ -473,7 +473,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
   if (member_->use_cuda_ && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
     // Initialize device context's nccl comm, will be used by normal
@@ -652,7 +652,7 @@ void ParallelExecutor::BCastParamsToDevices(
     }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       std::vector<void *> buffers;
       buffers.reserve(member_->places_.size());
       size_t numel = main_tensor.numel();

@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"

@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"

@@ -63,7 +63,7 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework

@@ -9,7 +9,9 @@ cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
 cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
 if(NOT WIN32)
-  cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+  if(WITH_NCCL)
+    cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+  endif()
   cc_library(data_loader SRCS data_loader.cc DEPS enforce)
 endif(NOT WIN32)

@@ -16,7 +16,7 @@
 namespace paddle {
 namespace imperative {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                      ncclUniqueId *nccl_id) {
   auto addr = paddle::string::Split(ep, ':');

(Some files are not shown because too many files changed in this diff.)
