parent 51abb6c323
commit d144310415
@@ -0,0 +1,8 @@
if(WITH_GPU)
  nv_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator)
  nv_library(nccl_op SRCS nccl_ops.cc DEPS nccl_common)
else()
  cc_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator)
endif()

cc_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
@@ -1,9 +1,58 @@
#include "paddle/operators/nccl/nccl_gpu_common.h"

#include <algorithm>
#include <mutex>
#include <string>
#include <vector>

#include "paddle/platform/gpu_info.h"

namespace paddle {
namespace platform {

NCCLManager::NCCLManager() {}

NCCLManager::~NCCLManager() {
  for (auto& p : comm_table) {
    auto* comm = p.second;
    auto& gpus = comm->gpus_;
    for (size_t i = 0; i < gpus.size(); ++i) {
      int gid = gpus[i];
      platform::SetDeviceId(gid);

      // Map the device id to an index into the per-device arrays.
      int idx = gid % gpus.size();
      // Make the stream wait for the recorded event before teardown.
      NCCL_CHECK(
          cudaStreamWaitEvent(*comm->streams_[idx], comm->events_[idx], 0));

      NCCL_CHECK(cudaEventDestroy(comm->events_[idx]));
      NCCL_CHECK(ncclCommDestroy(comm->comms_[idx]));
    }
    delete comm;
  }
}
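A caveat worth flagging on the destructor above: cudaStreamWaitEvent only enqueues a wait on the stream and returns immediately, so the host can reach cudaEventDestroy while work is still pending on the device. A sketch of a host-side wait before teardown, assuming the same members as above (an alternative, not what the patch does):

// Hedged alternative for the loop body above: block the host until the
// event has completed before destroying anything. cudaEventSynchronize
// is a standard CUDA runtime call.
NCCL_CHECK(cudaEventSynchronize(comm->events_[idx]));
NCCL_CHECK(cudaEventDestroy(comm->events_[idx]));
NCCL_CHECK(ncclCommDestroy(comm->comms_[idx]));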
// Note: const is dropped from the signature because operator[] mutates
// comm_table; the declaration in the header must match.
Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {
  // Build a canonical key from the sorted device ids. Sorting the ids
  // themselves (rather than the characters of the concatenated string)
  // keeps multi-digit ids such as {1, 12} and {11, 2} from colliding.
  std::vector<int> sorted_gpus(gpus);
  std::sort(sorted_gpus.begin(), sorted_gpus.end());
  std::string key;
  for (auto& id : sorted_gpus) {
    key += std::to_string(id) + ",";
  }

  // The mutex must outlive the call: a non-static local mutex would be a
  // fresh object on every invocation and would guard nothing.
  static std::mutex mu;
  std::lock_guard<std::mutex> lk(mu);
  auto* comm = comm_table[key];
  if (comm == nullptr) {
    comm = new Communicator(gpus.size());
    NCCL_CHECK(ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));

    for (size_t i = 0; i < gpus.size(); ++i) {
      platform::SetDeviceId(gpus[i]);

      // Blocking-sync events let waiters yield the CPU instead of spinning.
      NCCL_CHECK(cudaEventCreateWithFlags(
          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
    }
    comm_table[key] = comm;
  }
  return comm;
}

}  // namespace platform
}  // namespace paddle
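For orientation, a hypothetical call site (the Instance() accessor and the device list are illustrative only; neither appears in this diff):

// Hypothetical usage sketch: NCCLManager::Instance() is assumed for
// illustration and is not defined in this change.
std::vector<int> gpus = {0, 1};
Communicator* comm = NCCLManager::Instance().GetCommunicator(gpus);
// comm->comms_[i] can then be handed to ncclAllReduce and friends.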
@@ -0,0 +1,23 @@
#include "paddle/operators/nccl/nccl_gpu_common.h"

#include <gtest/gtest.h>

#include <chrono>
#include <thread>
#include <vector>

TEST(WaitGroup, wait) {
  WaitGroup wg;
  // Capture wg by reference so every thread shares the same WaitGroup;
  // an empty capture list would not compile since the body uses it.
  auto run_thread = [&wg](int idx) {
    wg.Add(1);
    std::this_thread::sleep_for(std::chrono::seconds(1));
    wg.Done();
  };

  std::vector<std::thread> ths;
  constexpr int TNUM = 5;
  for (int i = 0; i < TNUM; ++i) {
    ths.emplace_back(std::thread(run_thread, i));
  }
  // Note: if Wait() runs before any thread has called Add(1), the zero
  // count lets it return immediately; calling wg.Add(TNUM) up front
  // would close that race.
  wg.Wait();

  // Join before the std::thread objects go out of scope; destroying a
  // joinable thread calls std::terminate.
  for (auto& t : ths) {
    t.join();
  }
}
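WaitGroup itself is declared in nccl_gpu_common.h, which this diff does not show. A minimal sketch of the semantics the test relies on, assuming Go-style Add/Done/Wait counting (an illustration, not the header's actual definition):

// Minimal WaitGroup sketch, assuming Go-style semantics; the real
// declaration lives in nccl_gpu_common.h and may differ.
#include <condition_variable>
#include <mutex>

class WaitGroup {
 public:
  void Add(int n) {
    std::lock_guard<std::mutex> lk(mu_);
    counter_ += n;
  }
  void Done() {
    std::lock_guard<std::mutex> lk(mu_);
    if (--counter_ <= 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return counter_ <= 0; });
  }

 private:
  int counter_ = 0;
  std::mutex mu_;
  std::condition_variable cv_;
};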