[Prepare for MultiProcess xpu] unified gen nccl id, refine imperative reducer (#30455)

WangXi 4 years ago committed by GitHub
parent 549855ac20
commit 572c466d19

@@ -16,8 +16,24 @@
#include "paddle/fluid/imperative/all_reduce.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <string>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace imperative {
static const platform::Place &GetVarPlace(const framework::Variable &src) {
if (src.IsType<framework::LoDTensor>()) {
return src.Get<framework::LoDTensor>().place();

@@ -16,21 +16,6 @@
#ifdef PADDLE_WITH_NCCL
#include <cuda.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <string>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class Variable;

File diff suppressed because it is too large

@@ -13,73 +13,20 @@
// limitations under the License.
#pragma once
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/imperative/parallel_context.h"
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
// TODO(shenliang03): support multi stream communication
int nrings_{1};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() {}
virtual void Init() = 0;
virtual void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst, int ring_id = 0,
bool use_calc_stream = false) = 0;
#if defined(PADDLE_WITH_NCCL)
virtual paddle::platform::CUDADeviceContext* GetDeviceContext(
int ring_id) = 0;
#endif
inline int GetNRings() { return strategy_.nrings_; }
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
#if defined(PADDLE_WITH_NCCL)
class NCCLParallelContext : public ParallelContext {
public:
@@ -87,7 +34,7 @@ class NCCLParallelContext : public ParallelContext {
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
~NCCLParallelContext() override = default;
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, int root); // NOLINT
@@ -97,14 +44,18 @@ class NCCLParallelContext : public ParallelContext {
framework::Variable* dst, int ring_id,
bool use_calc_stream) override;
paddle::platform::CUDADeviceContext* GetDeviceContext(int ring_id) override;
paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
void WaitCompute(int ring_id) override;
void WaitComm(int ring_id) override;
protected:
void RecvNCCLID(const std::string& endpoint,
std::vector<ncclUniqueId>& nccl_ids); // NOLINT
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std::vector<std::shared_ptr<platform::CudaEventObject>> compute_events_;
void SendNCCLID(const std::string& endpoint,
const std::vector<ncclUniqueId>& nccl_ids);
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
};
#endif
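
The compute_events_/comm_events_ members above exist purely for cross-stream ordering. A minimal sketch of that pattern with plain CUDA runtime calls (the helper name is illustrative, not the commit's actual code):

#include <cuda_runtime.h>

// Sketch: queue a wait so `waiting_stream` does not run past work already
// submitted to `waited_stream`. WaitCompute uses this as
// compute_stream-->event-->comm_stream[ring_id]; WaitComm swaps the streams.
inline void StreamWaitSketch(cudaStream_t waiting_stream,
                             cudaStream_t waited_stream,
                             cudaEvent_t event) {
  cudaEventRecord(event, waited_stream);          // mark the current point
  cudaStreamWaitEvent(waiting_stream, event, 0);  // non-blocking device-side wait
}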

@@ -0,0 +1,75 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
int nrings_{1};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() = default;
virtual void Init() = 0;
virtual void AllReduceByStream(const framework::Variable& src,
framework::Variable* dst, int ring_id,
bool use_calc_stream) = 0;
virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0;
// comm_stream[ring_id] wait compute_stream.
// if CPU, should do nothing.
virtual void WaitCompute(int ring_id) = 0;
// compute_stream wait comm_stream[ring_id]
// if CPU, should do nothing.
virtual void WaitComm(int ring_id) = 0;
inline int GetNRings() const { return strategy_.nrings_; }
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
} // namespace imperative
} // namespace paddle
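
For a backend with no separate communication stream, the two waits above degenerate to no-ops, which is what the "if CPU, should do nothing" comments anticipate. A hypothetical subclass sketch (not part of this commit), assuming only the interface declared above plus DeviceContextPool from device_context.h:

// Hypothetical CPU-style context: no streams, so the waits do nothing.
class CPUParallelContextSketch : public ParallelContext {
 public:
  using ParallelContext::ParallelContext;
  void Init() override {}
  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override {
    // single-stream backend: reduce/copy synchronously; body omitted
  }
  platform::DeviceContext* GetDeviceContext(int ring_id) override {
    return platform::DeviceContextPool::Instance().Get(place_);
  }
  void WaitCompute(int ring_id) override {}  // nothing to order on CPU
  void WaitComm(int ring_id) override {}     // nothing to order on CPU
};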

File diff suppressed because it is too large

@@ -24,60 +24,27 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/imperative/variable_wrapper.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/cuda_resource_pool.h"
#endif
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
namespace imperative {
class ParallelContext;
class VarBase;
class VariableWrapper;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_NCCL)
template <typename T>
void ConcatTensorsForAllReduce(
const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& dense_tensors_,
framework::Variable* p_dense_contents) {
operators::math::ConcatFunctor<platform::CUDADeviceContext, T>
concat_functor_;
concat_functor_(context, dense_tensors_, 0,
p_dense_contents->GetMutable<framework::LoDTensor>());
}
template <typename T>
void SplitTensorsForAllReduce(const platform::CUDADeviceContext& context,
framework::Variable* p_dense_contents,
std::vector<framework::Tensor>* p_dense_tensors) {
auto* in = p_dense_contents->GetMutable<framework::LoDTensor>();
std::vector<framework::Tensor*> outs;
std::vector<const framework::Tensor*> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto& tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
// Sometimes direct copies will be faster
if (p_dense_tensors->size() < 10) {
operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
} else {
operators::math::SplitFunctor<platform::CUDADeviceContext, T>
split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
}
class Group {
public:
// Here, we use dense_contents_ & sparse_contents_ to
@@ -104,10 +71,10 @@ class Group {
framework::proto::VarType::Type dtype_;
// context is used to select the stream for concat
void ConcatTensors(const platform::CUDADeviceContext& context);
void ConcatTensors(const platform::DeviceContext& context);
// context is used to select the stream for split
void SplitTensors(const platform::CUDADeviceContext& context);
void SplitTensors(const platform::DeviceContext& context);
friend std::ostream& operator<<(std::ostream&, const Group&);
};
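
Since the two methods now take the base platform::DeviceContext, the place carried by the context decides the backend path. A plausible dispatch shape, with hypothetical helper names (the real bodies live in reducer.cc, whose diff is suppressed above):

void Group::ConcatTensors(const platform::DeviceContext& context) {
  if (platform::is_gpu_place(context.GetPlace())) {
#if defined(PADDLE_WITH_NCCL)
    // ConcatTensorsOnCUDA is a hypothetical name for the CUDA-typed path
    ConcatTensorsOnCUDA(
        static_cast<const platform::CUDADeviceContext&>(context), this);
#endif
  } else {
    // ConcatTensorsOnCPU is likewise hypothetical
    ConcatTensorsOnCPU(
        static_cast<const platform::CPUDeviceContext&>(context), this);
  }
}
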
@@ -155,8 +122,6 @@ class Reducer {
std::vector<std::vector<size_t>> RebuildGruops();
void CreateGroupEvents(int group_num);
inline bool NeedRebuildGroup() { return !has_rebuilt_group_; }
// Reducer Singleton
@@ -193,11 +158,6 @@ class Reducer {
std::shared_ptr<imperative::ParallelContext> parallel_ctx_;
std::vector<VariableLocator> variable_locators_;
// Following variables are to help sync stream
std::vector<std::shared_ptr<platform::CudaEventObject>> group_events_;
std::vector<std::shared_ptr<platform::CudaEventObject>> comm_events_;
cudaStream_t compute_stream_;
std::vector<cudaStream_t> comm_streams_;
int nrings_ = 1;
// Following variables are to help rebuild group

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread> // NOLINT
#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"

@@ -60,6 +60,109 @@ TEST(TestGroup, TestPrintGroupMessage) {
ASSERT_STREQ(stream2.str().c_str(), head.c_str());
}
template <typename T, typename Place>
void GroupConcatSplit(Place place, size_t size) {
platform::CPUPlace cpu_place;
Group group;
// [[0.0], [0.0, 1.0], [0.0, 1.0, 2.0] .. ]
std::vector<framework::Variable> vars;
vars.resize(size);
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
auto* tensor = vars[i].GetMutable<framework::LoDTensor>();
tensor->Resize({static_cast<int64_t>(len)});
auto* data = tensor->mutable_data<T>(place);
std::vector<T> value;
for (size_t j = 0; j < len; ++j) {
value.push_back(static_cast<T>(1.0 * j));
}
if (std::is_same<Place, platform::CUDAPlace>::value) {
paddle::memory::Copy(place, data, cpu_place, value.data(),
sizeof(T) * value.size(), 0);
} else {
paddle::memory::Copy(place, data, cpu_place, value.data(),
sizeof(T) * value.size());
}
framework::Tensor tmp;
tmp.ShareDataWith(*tensor).Resize({static_cast<int64_t>(len)});
group.dense_tensors_.push_back(std::move(tmp));
group.all_length_ += len;
group.dtype_ = tensor->type();
}
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
{ // concat
group.ConcatTensors(*dev_ctx);
auto* tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
framework::Tensor tmp;
framework::TensorCopySync(*tensor, cpu_place, &tmp);
auto* data = tmp.data<T>();
size_t offset = 0;
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
for (size_t j = 0; j < len; ++j) {
EXPECT_EQ(data[offset + j], static_cast<T>(1.0 * j));
// [[-0.0], [-0.0, -1.0], [-0.0, -1.0, -2.0] .. ]
data[offset + j] = -data[offset + j];
}
offset += len;
}
framework::TensorCopySync(tmp, place, tensor);
}
{ // split
group.SplitTensors(*dev_ctx);
for (size_t i = 0; i < size; ++i) {
auto len = i + 1;
auto& tensor = group.dense_tensors_[i];
framework::Tensor tmp;
framework::TensorCopySync(tensor, cpu_place, &tmp);
auto* data = tmp.data<T>();
for (size_t j = 0; j < len; ++j) {
EXPECT_EQ(data[j], static_cast<T>(-1.0 * j));
}
}
}
}
TEST(TestGroup, TestConcatSplit) {
platform::CUDAPlace cuda_place(0);
platform::CPUPlace cpu_place;
int size = 3;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<double>(cpu_place, size);
GroupConcatSplit<platform::float16>(cpu_place, size);
GroupConcatSplit<float>(cuda_place, size);
GroupConcatSplit<double>(cuda_place, size);
GroupConcatSplit<platform::float16>(cuda_place, size);
size = 15;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<double>(cpu_place, size);
GroupConcatSplit<platform::float16>(cpu_place, size);
GroupConcatSplit<float>(cuda_place, size);
GroupConcatSplit<double>(cuda_place, size);
GroupConcatSplit<platform::float16>(cuda_place, size);
}
TEST(TestGroup, TestConcatSplitException) {
platform::CUDAPinnedPlace place;
int size = 3;
ASSERT_ANY_THROW(GroupConcatSplit<float>(place, size));
}
#endif
} // namespace imperative

@@ -15,9 +15,8 @@ register_operators(EXCLUDES c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DE
if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
cc_library(gen_nccl_id_op_helper SRCS gen_nccl_id_op_helper.cc DEPS nccl_common)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} gen_nccl_id_op_helper)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
if(WITH_GLOO)

@@ -23,11 +23,32 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace paddle {
namespace operators {
static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
for (size_t i = 0; i < nccl_ids->size(); ++i) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
}
}
static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < nccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId));
}
}
class CGenNCCLIdOp : public framework::OperatorBase {
public:
CGenNCCLIdOp(const std::string& type,
@@ -45,14 +66,20 @@ class CGenNCCLIdOp : public framework::OperatorBase {
return Output("Out");
};
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(1);
if (rank == 0) {
GenNCCLID(&nccl_ids);
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
SendBroadCastNCCLID(endpoint_list, 1, func, local_scope);
platform::SendBroadCastCommID(endpoint_list, &nccl_ids);
} else {
std::string endpoint = Attr<std::string>("endpoint");
RecvBroadCastNCCLID(endpoint, 1, func, local_scope);
platform::RecvBroadCastCommID(endpoint, &nccl_ids);
}
CopyNCCLIDToVar(nccl_ids, func, scope);
scope.DeleteScope(&local_scope);
}
};

@@ -27,11 +27,32 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace paddle {
namespace operators {
static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
for (size_t i = 0; i < nccl_ids->size(); ++i) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i]));
}
}
static void CopyNCCLIDToVar(const std::vector<ncclUniqueId>& nccl_ids,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
for (size_t i = 0; i < nccl_ids.size(); ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId));
}
}
class GenNCCLIdOp : public framework::OperatorBase {
public:
GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
@@ -98,19 +119,25 @@ class GenNCCLIdOp : public framework::OperatorBase {
<< ", trainers:" << ss.str();
int server_fd = -1;
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(nccl_comm_num);
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatNCCLVarName;
// broadcast unique id
if (trainer_id == 0) {
GenNCCLID(&nccl_ids);
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
trainers.end());
SendBroadCastNCCLID(flat_endpoints, nccl_comm_num, func, scope);
platform::SendBroadCastCommID(flat_endpoints, &nccl_ids);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
server_fd = platform::CreateListenSocket(endpoint);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
}
CopyNCCLIDToVar(nccl_ids, func, scope);
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterNCCLVarName;
@@ -127,10 +154,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastNCCLID(inter_endpoints, nccl_comm_num, func, scope);
GenNCCLID(&nccl_ids);
platform::SendBroadCastCommID(inter_endpoints, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
}
/// 3. hierarchical exter ncclid
@@ -146,15 +176,18 @@ class GenNCCLIdOp : public framework::OperatorBase {
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastNCCLID(exter_endpoints, nccl_comm_num, func, scope);
GenNCCLID(&nccl_ids);
platform::SendBroadCastCommID(exter_endpoints, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastNCCLID(server_fd, endpoint, nccl_comm_num, func, scope);
platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids);
CopyNCCLIDToVar(nccl_ids, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
platform::CloseSocket(server_fd);
}
}
};

@@ -101,7 +101,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)

@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/gen_nccl_id_op_helper.h"
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
#include <netdb.h>
@@ -31,7 +32,7 @@ limitations under the License. */
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace operators {
namespace platform {
constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
@@ -257,26 +258,29 @@ static int ConnectAddr(const std::string& ep, const char* head) {
return sock;
}
static void RecvNCCLID(int conn, ncclUniqueId* nccl_id) {
template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) {
char buffer[1024] = {0};
static_assert(NCCL_UNIQUE_ID_BYTES <= 1024,
static_assert(sizeof(CommUniqueId) <= 1024,
"nccl id bytes must <= buffer size");
CHECK_SYS_CALL(SocketRecv(conn, buffer, NCCL_UNIQUE_ID_BYTES), "recv ncc id");
memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
CHECK_SYS_CALL(SocketRecv(conn, buffer, sizeof(CommUniqueId)),
"recv comm unique id");
memcpy(nccl_id, buffer, sizeof(CommUniqueId));
}
static void SendNCCLID(int conn, ncclUniqueId* nccl_id) {
template <typename CommUniqueId>
static void SendCommID(int conn, CommUniqueId* nccl_id) {
char buffer[1024] = {0};
memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
memcpy(buffer, nccl_id, sizeof(CommUniqueId));
CHECK_SYS_CALL(SocketSend(conn, buffer, NCCL_UNIQUE_ID_BYTES),
"send nccl id");
CHECK_SYS_CALL(SocketSend(conn, buffer, sizeof(CommUniqueId)),
"send comm unique id");
}
void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void SendBroadCastCommID(std::vector<std::string> servers,
std::vector<CommUniqueId>* nccl_ids) {
// connect with server
std::vector<int> connects;
for (auto server : servers) {
@@ -286,23 +290,13 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
}
VLOG(3) << "connecting completed...";
for (int i = 0; i < nccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(nccl_id));
for (size_t i = 0; i < nccl_ids->size(); ++i) {
int j = 0;
for (auto conn : connects) {
VLOG(3) << "sending nccl_id_var: " << var_name << " to " << servers[j]
<< " nccl_comm_no: " << i;
SendNCCLID(conn, nccl_id);
VLOG(3) << "sending comm_id to " << servers[j] << " nccl_comm_no: " << i;
SendCommID(conn, &(*nccl_ids)[i]);
++j;
}
VLOG(3) << "sending completed...";
}
// close client
@@ -311,34 +305,43 @@ void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
}
}
void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void RecvBroadCastCommID(std::string endpoint,
std::vector<CommUniqueId>* nccl_ids) {
int server = CreateListenSocket(endpoint);
RecvBroadCastNCCLID(server, endpoint, nccl_comm_num, func, scope);
RecvBroadCastCommID(server, endpoint, nccl_ids);
CloseSocket(server);
}
void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
template <typename CommUniqueId>
void RecvBroadCastCommID(int server_fd, std::string endpoint,
std::vector<CommUniqueId>* nccl_ids) {
int client = SocketAccept(server_fd, COMM_HEAD);
for (int i = 0; i < nccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto nccl_id = var->GetMutable<ncclUniqueId>();
VLOG(3) << "trainer: " << endpoint << " receiving nccl_id_var: " << var_name
<< " from trainer 0, nccl_comm_no: " << i;
RecvNCCLID(client, nccl_id);
for (size_t i = 0; i < nccl_ids->size(); ++i) {
VLOG(3) << "trainer: " << endpoint
<< " receiving comm_id from trainer 0, nccl_comm_no: " << i;
RecvCommID(client, &(*nccl_ids)[i]);
}
VLOG(3) << "receiving completed...";
CloseSocket(client);
}
} // namespace operators
/// template instantiation
#define INSTANT_TEMPLATE(Type) \
template void SendBroadCastCommID<Type>(std::vector<std::string> servers, \
std::vector<Type> * nccl_ids); \
template void RecvBroadCastCommID<Type>(std::string endpoint, \
std::vector<Type> * nccl_ids);
#ifdef PADDLE_WITH_NCCL
INSTANT_TEMPLATE(ncclUniqueId)
#endif
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(bkclUniqueId)
#endif
} // namespace platform
} // namespace paddle
#endif
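
With the explicit instantiations above, the template definitions can stay in this .cc while callers include only the header declarations. A caller-side sketch assuming PADDLE_WITH_NCCL, mirroring the CGenNCCLIdOp flow shown earlier (rank, other_endpoints, and my_endpoint are illustrative variables):

// Sketch: rank 0 generates the ids and pushes them out; every other rank
// listens on its own endpoint and receives them.
std::vector<ncclUniqueId> ids(1);
if (rank == 0) {
  for (auto& id : ids) {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(&id));
  }
  platform::SendBroadCastCommID(other_endpoints, &ids);  // peers' endpoints
} else {
  platform::RecvBroadCastCommID(my_endpoint, &ids);  // this rank's endpoint
}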

@@ -14,35 +14,31 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NCCL
#include <functional>
#include <string>
#include <vector>
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
namespace platform {
int CreateListenSocket(const std::string& ep);
void CloseSocket(int fd);
void SendBroadCastNCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
template <typename CommUniqueId>
void SendBroadCastCommID(std::vector<std::string> servers,
std::vector<CommUniqueId>* nccl_ids);
// server listen on endpoint, then recv nccl id
void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
template <typename CommUniqueId>
void RecvBroadCastCommID(std::string endpoint,
std::vector<CommUniqueId>* nccl_ids);
// recv nccl id from socket
void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
} // namespace operators
template <typename CommUniqueId>
void RecvBroadCastCommID(int server_fd, std::string endpoint,
std::vector<CommUniqueId>* nccl_ids);
} // namespace platform
} // namespace paddle
#endif

@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_NCCL
#pragma once
#ifdef PADDLE_WITH_NCCL
#include <stdio.h>
#include <memory>
#include <string>

@@ -14,10 +14,11 @@
import unittest
import os
import copy
from launch_function_helper import wait, _find_free_port
from multiprocessing import Pool, Process
from threading import Thread
os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10")
os.environ['GLOG_vmodule'] = str("gen_nccl_id_op*=10,gen_comm_id*=10")
import paddle
from paddle.fluid import core
@@ -29,8 +30,8 @@ def run_gen_ncc_id(attr):
nccl_comm_num = attr['nccl_comm_num']
use_hallreduce = attr['use_hierarchical_allreduce']
startup_program = paddle.static.default_startup_program()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
nccl_id_var = startup_program.global_block().create_var(
@@ -60,8 +61,9 @@ def run_gen_ncc_id(attr):
attrs=attr)
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
scope = paddle.static.Scope()
with paddle.static.scope_guard(scope):
exe.run(startup_program)
@@ -97,16 +99,19 @@ class TestGenNcclIdOp(unittest.TestCase):
procs = []
for i in range(nranks):
attr['trainer_id'] = i
p = Process(target=run_gen_ncc_id, args=(attr, ))
# NOTE. multiprocessing cannot be covered by coverage
p = Thread(target=run_gen_ncc_id, args=(copy.copy(attr), ))
p.start()
procs.append(p)
wait(procs, timeout=120)
for p in procs:
p.join()
def test_flat(self):
print(">>> test gen flat nccl id")
self.gen_nccl_id(2)
print("<<< end test gen flat nccl id")
print()
def test_hierarchical(self):
print(">>> test gen hierarchical nccl id")
