parent
51abb6c323
commit
d144310415
@ -0,0 +1,8 @@
|
||||
if(WITH_GPU)
  # Fixed: SRCS takes source file names — use the full `nccl_gpu_common.cc`,
  # consistent with `nccl_ops.cc` below.
  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
  nv_library(nccl_op SRCS nccl_ops.cc DEPS nccl_common)
else()
  # CPU-only build still provides a `nccl_common` target so dependents link.
  cc_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
endif()

cc_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
|
@ -1,9 +1,58 @@
|
||||
#include "paddle/operators/nccl/nccl_gpu_common.h"
|
||||
#include "paddle/platform/gpu_info.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace platform {
|
||||
|
||||
// Default-construct the manager; the communicator cache starts empty.
NCCLManager::NCCLManager() {}
|
||||
|
||||
// Tears down every cached Communicator: for each GPU of each communicator,
// switches to that device, waits for the stream to reach the recorded
// event, then destroys the CUDA event and the NCCL communicator before
// freeing the Communicator object itself.
NCCLManager::~NCCLManager() {
  for (auto& p : comm_table) {
    auto* comm = p.second;
    auto& gpus_ = comm->gpus_;
    // NOTE(review): `int i` vs unsigned `gpus_.size()` mixes signedness —
    // benign for realistic GPU counts, but worth cleaning up.
    for (int i = 0; i < gpus_.size(); ++i) {
      int gid = gpus_[i];
      // The CUDA calls below must run on the device that owns the resources.
      platform::SetDeviceId(gid);

      // mapping gid to idx — assumes the device ids form a dense block so
      // that gid % gpus_.size() maps each gid to a unique slot; TODO confirm.
      int idx = gid % gpus_.size();
      // wait finish: block until all work queued before the event completes.
      // NOTE(review): cudaStreamWaitEvent/cudaEventDestroy return
      // cudaError_t, yet are wrapped in NCCL_CHECK — confirm the macro
      // accepts both error types, or split into a CUDA-specific check.
      NCCL_CHECK(
          cudaStreamWaitEvent(*comm->streams_[idx], comm->events_[idx], 0));

      NCCL_CHECK(cudaEventDestroy(comm->events_[idx]));

      NCCL_CHECK(ncclCommDestroy(comm->comms_[idx]));
    }
    delete comm;
  }
}
|
||||
|
||||
// Returns the cached Communicator for the given set of GPU ids, creating
// and caching one (plus its per-device blocking events) on first use.
//
// @param gpus  device ids participating in the communicator; order is
//              irrelevant — the cache key is canonicalized.
// @return non-owning pointer to the cached Communicator (owned by the
//         manager and released in ~NCCLManager).
Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) const {
  // Build a canonical cache key: sort a *copy of the ids* (the original
  // sorted the characters of the concatenated string, which does not
  // canonicalize multi-digit ids) and separate ids with ',' so that e.g.
  // {1,12} and {11,2} cannot collide as "112".
  std::vector<int> sorted_gpus(gpus);
  std::sort(sorted_gpus.begin(), sorted_gpus.end());
  std::string key;
  for (const int id : sorted_gpus) {
    key += std::to_string(id);
    key += ',';
  }

  // Serialize access to the shared comm_table. NOTE(review): the original
  // locked a function-local (per-call) mutex, which synchronizes nothing;
  // a function-local static at least serializes concurrent callers.
  // Ideally this mutex would be a member of NCCLManager.
  static std::mutex comm_table_mu;
  std::lock_guard<std::mutex> lk(comm_table_mu);

  auto* comm = comm_table[key];
  if (comm == nullptr) {
    comm = new Communicator(gpus.size());
    NCCL_CHECK(ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));

    for (size_t i = 0; i < gpus.size(); ++i) {
      platform::SetDeviceId(gpus[i]);

      // Blocking-sync events are what the destructor waits on to drain the
      // streams. Fixed: the original wrote `&events_[i]`, but the events
      // belong to the Communicator, not the manager.
      // NOTE(review): cudaEventCreateWithFlags returns cudaError_t yet is
      // wrapped in NCCL_CHECK — confirm the macro handles both.
      NCCL_CHECK(cudaEventCreateWithFlags(
          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
    }
    comm_table[key] = comm;
  }
  return comm;
}
|
||||
|
||||
}  // namespace platform
|
||||
} // namespace paddle
|
||||
|
@ -0,0 +1,23 @@
|
||||
#include "paddle/operators/nccl/nccl_gpu_common.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// Spawns TNUM worker threads and verifies WaitGroup::Wait() blocks until
// every worker has signalled Done().
TEST(WaitGroup, wait) {
  WaitGroup wg;

  constexpr const int TNUM = 5;
  // Register all workers before any thread starts. Fixed: calling Add(1)
  // inside each worker races with Wait() in the main thread — Wait() could
  // observe an empty group before any worker had a chance to run.
  wg.Add(TNUM);

  // Fixed: the original lambda had an empty capture list but used `wg`,
  // which does not compile; capture it by reference (the threads are
  // joined below, so `wg` outlives every worker).
  auto run_thread = [&wg](int idx) {
    (void)idx;  // thread id is unused; parameter kept for signature parity
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    wg.Done();
  };

  std::vector<std::thread> ths;
  ths.reserve(TNUM);
  for (int i = 0; i < TNUM; ++i) {
    ths.emplace_back(run_thread, i);
  }
  wg.Wait();

  // Fixed: join the workers — destroying a joinable std::thread at scope
  // exit calls std::terminate.
  for (auto& t : ths) {
    t.join();
  }
}
|
Loading…
Reference in new issue