parent
fdfc8f9baa
commit
333045d7b2
@ -1,61 +1,17 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/operators/nccl/nccl_gpu_common.h"
|
||||
#include "paddle/platform/gpu_info.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace platform {
|
||||
|
||||
// Nothing to set up eagerly; communicators are created lazily in
// GetCommunicator().
NCCLManager::NCCLManager() {}
|
||||
|
||||
// Tears down every cached communicator group: waits for outstanding GPU
// work, then destroys the per-device CUDA events and NCCL communicators.
NCCLManager::~NCCLManager() {
  for (auto& p : comm_table) {
    auto& comm = p.second;
    auto& gpus_ = comm->gpus_;
    for (size_t i = 0; i < gpus_.size(); ++i) {
      int gid = gpus_[i];
      platform::SetDeviceId(gid);

      // Map the global gpu id to this communicator's local slot.
      // NOTE(review): assumes device ids map injectively under modulo of
      // the group size -- collisions are possible for arbitrary id sets;
      // confirm against how gpus_ is populated.
      int idx = gid % gpus_.size();
      // Block the host until work recorded in the event has finished.
      // cudaStreamWaitEvent only enqueues a wait on the stream and
      // returns immediately, which would let us destroy the event and
      // communicator below while kernels are still in flight.
      PADDLE_ENFORCE(cudaEventSynchronize(comm->events_[idx]));

      PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx]));
      PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx]));
    }
    comm.reset(nullptr);
  }
}
|
||||
|
||||
// Returns the communicator group for the given gpu set, creating and
// caching it on first use.  The returned pointer is owned by comm_table.
Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {
  // Canonicalize the gpu set so {0,1} and {1,0} share one cache entry.
  // Sort the ids themselves, not the characters of the concatenated
  // string (which would make {1,12} and {11,2} both hash to "112"), and
  // join with a delimiter so multi-digit ids cannot collide.
  std::vector<int> sorted_gpus(gpus);
  std::sort(sorted_gpus.begin(), sorted_gpus.end());
  std::string key;
  for (int id : sorted_gpus) {
    key += std::to_string(id);
    key += ',';
  }

  // A function-local mutex would be a fresh object per call and protect
  // nothing; a static one actually serializes access to comm_table.
  static std::mutex comm_table_mu;
  std::lock_guard<std::mutex> lk(comm_table_mu);

  auto it = comm_table.find(key);

  // Check the iterator against end() BEFORE dereferencing it -- the
  // original read it->second on a cache miss, which is undefined
  // behavior.
  if (it == comm_table.end() || it->second == nullptr) {
    auto* comm = new Communicator(gpus);
    PADDLE_ENFORCE(
        ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));

    for (size_t i = 0; i < gpus.size(); ++i) {
      platform::SetDeviceId(gpus[i]);

      // Blocking-sync events so the destructor can wait on them without
      // spinning; timing is disabled as the events are only fences.
      PADDLE_ENFORCE(cudaEventCreateWithFlags(
          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
    }
    comm_table[key].reset(comm);
  }
  return comm_table[key].get();
}
|
||||
|
||||
} // namespace platform
|
||||
namespace platform {} // namespace platform
|
||||
} // namespace paddle
|
||||
|
@ -1,16 +0,0 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "paddle/operators/nccl/nccl_ops.h"
|
||||
|
||||
// Bind the CUDA kernel for the ncclAllReduce operator (float only).
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
|
@ -1,103 +0,0 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include "paddle/framework/op_registry.h"
|
||||
#include "paddle/operators/nccl/nccl_gpu_common.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
using framework::Tensor;
|
||||
|
||||
// Maps a C++ element type to the matching ncclDataType_t enumerator.
// Only the specializations below are usable; instantiating the primary
// template for any other type is a compile-time error.
template <typename Type>
class NCCLTypeWrapper;

template <>
class NCCLTypeWrapper<float> {
 public:
  static const ncclDataType_t type = ncclFloat;
};

template <>
class NCCLTypeWrapper<double> {
 public:
  static const ncclDataType_t type = ncclDouble;
};
|
||||
|
||||
// Creates (or fetches) the NCCL communicator group for the "gpus" list
// and publishes it through the "Communicator" output.
// The original omitted the template-parameter declaration while deriving
// from framework::OpKernel<T>, which cannot compile.
template <typename T>
class NCCLInitOp : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto gpus = ctx.Input<std::vector<int>>("gpus");
    auto* comm = ctx.Output<Communicator>("Communicator");
    comm->mutable_data<Communicator>(CPUPlace());
    // NOTE(review): this only overwrites the local pointer -- the value
    // returned by GetCommunicator never reaches the "Communicator"
    // output variable obtained above; confirm the intended wiring.
    comm = NCCLManager::GetCommunicator(gpus);
  }
};
|
||||
|
||||
template <typename T>
|
||||
class NCCLAllReduceKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
auto ins = ctx.MultiInput<Tensor>("X");
|
||||
auto outs = ctx.MultiOutput<Tensor>("Out");
|
||||
std::string reduction = ctx.Attr<std::string>("reduction");
|
||||
std::vector<int> gpus = ctx.Attr<std::vector<int>>("gpus");
|
||||
ncclRedOp_t op_type;
|
||||
if (reduction == "ncclSum") {
|
||||
op_type = ncclSum;
|
||||
} else if (reduction == "ncclProd") {
|
||||
op_type = ncclProd;
|
||||
} else if (reduction == "ncclMin") {
|
||||
op_type = ncclMin;
|
||||
} else if (reduction == "ncclMax") {
|
||||
op_type = ncclMax;
|
||||
}
|
||||
|
||||
auto* comm = ctx.Input<Communicator>("Communicator");
|
||||
|
||||
auto dev_ctx =
|
||||
static_cast<const platform::CUDADeviceContext>(ctx.device_context());
|
||||
|
||||
// platform::NCCLManager* m = platform::NCCLManager::Get();
|
||||
|
||||
// auto* comm = m->GetCommunicator(gpus);
|
||||
// comm->wg_.Add(1);
|
||||
|
||||
auto stream = dev_ctx.stream();
|
||||
|
||||
// device id
|
||||
int gid = static_cast<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
|
||||
int idx = gid % gpus.size();
|
||||
comm->streams_[idx] = stream;
|
||||
|
||||
for (size_t i = 0; i < ins.size(); ++i) {
|
||||
PADDLE_ENFORCE(
|
||||
ncclAllReduce(ins[i]->data<T>(), outs[i]->mutable_data<T>(),
|
||||
outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type,
|
||||
op_type, comm->comms_[idx], comm->streams_[idx]));
|
||||
PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx]));
|
||||
|
||||
// // wait finish
|
||||
// PADDLE_ENFORCE(
|
||||
// cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
|
||||
}
|
||||
|
||||
// comm->wg_.Done();
|
||||
|
||||
// comm->wg_.Wait();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
@ -0,0 +1,66 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "paddle/operators/nccl_op.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
class NCCLAllReduceKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
|
||||
"This kernel only runs on GPU device.");
|
||||
|
||||
auto ins = ctx.MultiInput<Tensor>("X");
|
||||
auto outs = ctx.MultiOutput<Tensor>("Out");
|
||||
std::string reduction = ctx.Attr<std::string>("reduction");
|
||||
ncclRedOp_t op_type;
|
||||
if (reduction == "ncclSum") {
|
||||
op_type = ncclSum;
|
||||
} else if (reduction == "ncclProd") {
|
||||
op_type = ncclProd;
|
||||
} else if (reduction == "ncclMin") {
|
||||
op_type = ncclMin;
|
||||
} else if (reduction == "ncclMax") {
|
||||
op_type = ncclMax;
|
||||
} else {
|
||||
PADDLE_ENFORCE(false, "reduction error.");
|
||||
}
|
||||
|
||||
auto* comm = ctx.Input<Communicator>("Communicator");
|
||||
|
||||
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
|
||||
ctx.device_context())
|
||||
.stream();
|
||||
|
||||
// device id
|
||||
int device_id =
|
||||
boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
|
||||
int idx = comm->GetCommId(device_id);
|
||||
|
||||
for (size_t i = 0; i < ins.size(); ++i) {
|
||||
PADDLE_ENFORCE(ncclAllReduce(
|
||||
ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
|
||||
outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type, op_type,
|
||||
comm->comms_[idx], stream));
|
||||
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
// Bind the CUDA kernel for the ncclAllReduce operator (float only).
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
|
@ -0,0 +1,50 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include "paddle/framework/op_registry.h"
|
||||
#include "paddle/operators/nccl/nccl_gpu_common.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
using framework::Tensor;
|
||||
using platform::Communicator;
|
||||
|
||||
// Maps a C++ element type to the matching ncclDataType_t enumerator.
// Only the specializations below are usable; instantiating the primary
// template for any other type is a compile-time error.
template <typename Type>
class NCCLTypeWrapper;

template <>
class NCCLTypeWrapper<float> {
 public:
  static const ncclDataType_t type = ncclFloat;
};

template <>
class NCCLTypeWrapper<double> {
 public:
  static const ncclDataType_t type = ncclDouble;
};
|
||||
|
||||
template <typename T>
|
||||
class NCCLInitKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& ctx) const override {
|
||||
auto* gpus = ctx.Input<std::vector<int>>("gpus");
|
||||
auto* comm = ctx.Output<Communicator>("Communicator");
|
||||
comm->InitAll(*gpus);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
Loading…
Reference in new issue