parent fdfc8f9baa
commit 333045d7b2
@@ -1,61 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/gpu_info.h"

 namespace paddle {
-namespace platform {
-
-NCCLManager::NCCLManager() {}
-
-NCCLManager::~NCCLManager() {
-  for (auto& p : comm_table) {
-    auto& comm = p.second;
-    auto& gpus_ = comm->gpus_;
-    for (size_t i = 0; i < gpus_.size(); ++i) {
-      int gid = gpus_[i];
-      platform::SetDeviceId(gid);
-
-      // mapping gid to idx
-      int idx = gid % gpus_.size();
-      // wait finish
-      PADDLE_ENFORCE(
-          cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
-      PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx]));
-      PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx]));
-    }
-    comm.reset(nullptr);
-  }
-}
-
-Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {
-  std::string key;
-  for (auto& id : gpus) {
-    key += std::to_string(id);
-  }
-  std::sort(key.begin(), key.end());
-
-  std::mutex mu;
-  std::lock_guard<std::mutex> lk(mu);
-
-  auto it = comm_table.find(key);
-
-  if (it->second == nullptr) {
-    auto* comm = new Communicator(gpus);
-    PADDLE_ENFORCE(
-        ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));
-
-    for (size_t i = 0; i < gpus.size(); ++i) {
-      platform::SetDeviceId(gpus[i]);
-
-      // block wait
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(
-          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
-    }
-    comm_table[key].reset(comm);
-  }
-  return comm_table[key].get();
-}
-
-}  // namespace operators
+namespace platform {}  // namespace platform
 }  // namespace paddle
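The hunk above empties nccl_gpu_common.cu of the old NCCLManager and its per-GPU event and stream bookkeeping. The communicator state moves behind a Communicator type that the new kernels later in this diff expect to provide InitAll, GetCommId, and a comms_ array. Its real declaration lives in nccl_gpu_common.h, which this diff does not show; the sketch below is only the shape those call sites imply, not the code this commit ships.

    // Hypothetical sketch, inferred from the call sites in the new
    // nccl_op.h/.cu hunks below; not the actual nccl_gpu_common.h.
    #include <unordered_map>
    #include <vector>

    #include <nccl.h>

    struct Communicator {
      std::vector<ncclComm_t> comms_;
      std::unordered_map<int, int> comm_id_map_;  // device id -> index into comms_

      // Create one NCCL communicator per device in a single collective call.
      void InitAll(const std::vector<int>& gpus) {
        comms_.resize(gpus.size());
        for (size_t i = 0; i < gpus.size(); ++i) {
          comm_id_map_[gpus[i]] = static_cast<int>(i);
        }
        ncclCommInitAll(comms_.data(), static_cast<int>(gpus.size()), gpus.data());
      }

      // Map a CUDA device id to the communicator index used by the kernels.
      int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
    };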
@@ -1,16 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/operators/nccl/nccl_ops.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-
-#include <string.h>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename Type>
-class NCCLTypeWrapper;
-
-template <>
-class NCCLTypeWrapper<float> {
- public:
-  static const ncclDataType_t type = ncclFloat;
-};
-
-template <>
-class NCCLTypeWrapper<double> {
- public:
-  static const ncclDataType_t type = ncclDouble;
-};
-
-class NCCLInitOp : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto gpus = ctx.Input<std::vector<int>>("gpus");
-    auto* comm = ctx.Output<Communicator>("Communicator");
-    comm->mutable_data<Communicator>(CPUPlace());
-    comm = NCCLManager::GetCommunicator(gpus);
-  }
-};
-
-template <typename T>
-class NCCLAllReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto outs = ctx.MultiOutput<Tensor>("Out");
-    std::string reduction = ctx.Attr<std::string>("reduction");
-    std::vector<int> gpus = ctx.Attr<std::vector<int>>("gpus");
-    ncclRedOp_t op_type;
-    if (reduction == "ncclSum") {
-      op_type = ncclSum;
-    } else if (reduction == "ncclProd") {
-      op_type = ncclProd;
-    } else if (reduction == "ncclMin") {
-      op_type = ncclMin;
-    } else if (reduction == "ncclMax") {
-      op_type = ncclMax;
-    }
-
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto dev_ctx =
-        static_cast<const platform::CUDADeviceContext>(ctx.device_context());
-
-    // platform::NCCLManager* m = platform::NCCLManager::Get();
-    // auto* comm = m->GetCommunicator(gpus);
-    // comm->wg_.Add(1);
-
-    auto stream = dev_ctx.stream();
-
-    // device id
-    int gid = static_cast<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = gid % gpus.size();
-    comm->streams_[idx] = stream;
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      PADDLE_ENFORCE(
-          ncclAllReduce(ins[i]->data<T>(), outs[i]->mutable_data<T>(),
-                        outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type,
-                        op_type, comm->comms_[idx], comm->streams_[idx]));
-      PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx]));
-
-      // // wait finish
-      // PADDLE_ENFORCE(
-      //     cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
-    }
-
-    // comm->wg_.Done();
-    // comm->wg_.Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
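One detail worth noting in the deleted header: it mapped a device id to a communicator slot with `int idx = gid % gpus.size()`, which collides whenever the configured device ids are not contiguous from zero. The short check below illustrates the failure and the explicit-map fix that the replacement's GetCommId presumably implements; the map here is illustrative, not Paddle code.

    #include <cassert>
    #include <map>
    #include <vector>

    int main() {
      std::vector<int> gpus = {1, 3};  // non-contiguous device ids
      // Old scheme: gid % gpus.size() sends both devices to slot 1.
      assert(1 % gpus.size() == 1 && 3 % gpus.size() == 1);
      // Explicit map: each device gets its own slot.
      std::map<int, int> comm_id;
      for (size_t i = 0; i < gpus.size(); ++i) comm_id[gpus[i]] = i;
      assert(comm_id[1] == 0 && comm_id[3] == 1);
      return 0;
    }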
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/nccl_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class NCCLAllReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+    std::string reduction = ctx.Attr<std::string>("reduction");
+    ncclRedOp_t op_type;
+    if (reduction == "ncclSum") {
+      op_type = ncclSum;
+    } else if (reduction == "ncclProd") {
+      op_type = ncclProd;
+    } else if (reduction == "ncclMin") {
+      op_type = ncclMin;
+    } else if (reduction == "ncclMax") {
+      op_type = ncclMax;
+    } else {
+      PADDLE_ENFORCE(false, "reduction error.");
+    }
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+
+    for (size_t i = 0; i < ins.size(); ++i) {
+      PADDLE_ENFORCE(ncclAllReduce(
+          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
+          outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type, op_type,
+          comm->comms_[idx], stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
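The new nccl_op.cu issues one ncclAllReduce per input on the device context's stream and then blocks on cudaStreamSynchronize. Outside the framework, the same single-process multi-GPU pattern looks roughly as follows. This is the public NCCL/CUDA API, but the device list, buffer size, and the ncclGroupStart/ncclGroupEnd bracketing (which NCCL 2 recommends when one thread drives several devices) are choices of this sketch, not part of the commit.

    // Minimal single-process, multi-GPU all-reduce sketch (public NCCL/CUDA API).
    #include <vector>

    #include <cuda_runtime.h>
    #include <nccl.h>

    int main() {
      std::vector<int> devs = {0, 1};
      const int ndev = static_cast<int>(devs.size());
      const size_t count = 1024;  // ncclAllReduce takes an element count, not bytes

      std::vector<ncclComm_t> comms(ndev);
      ncclCommInitAll(comms.data(), ndev, devs.data());

      std::vector<float*> send(ndev), recv(ndev);
      std::vector<cudaStream_t> streams(ndev);
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaMalloc(&send[i], count * sizeof(float));
        cudaMalloc(&recv[i], count * sizeof(float));
        cudaStreamCreate(&streams[i]);
      }

      // Launch one collective per device; grouping the calls avoids
      // inter-device deadlock when a single thread issues all of them.
      ncclGroupStart();
      for (int i = 0; i < ndev; ++i) {
        ncclAllReduce(send[i], recv[i], count, ncclFloat, ncclSum, comms[i],
                      streams[i]);
      }
      ncclGroupEnd();

      // Block until every device has finished, mirroring the kernel's
      // cudaStreamSynchronize call above.
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaStreamSynchronize(streams[i]);
      }

      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaFree(send[i]);
        cudaFree(recv[i]);
        cudaStreamDestroy(streams[i]);
        ncclCommDestroy(comms[i]);
      }
      return 0;
    }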
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+#include <string.h>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Communicator;
+
+template <typename Type>
+class NCCLTypeWrapper;
+
+template <>
+class NCCLTypeWrapper<float> {
+ public:
+  static const ncclDataType_t type = ncclFloat;
+};
+
+template <>
+class NCCLTypeWrapper<double> {
+ public:
+  static const ncclDataType_t type = ncclDouble;
+};
+
+template <typename T>
+class NCCLInitKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* gpus = ctx.Input<std::vector<int>>("gpus");
+    auto* comm = ctx.Output<Communicator>("Communicator");
+    comm->InitAll(*gpus);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
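NCCLTypeWrapper in the new header is a small type trait: it resolves the ncclDataType_t tag for an element type at compile time, which is what keeps NCCLAllReduceKernel<T> free of runtime type branching. A self-contained illustration of the pattern follows; the trait is restated so the snippet compiles on its own, and AllReduceSum is a hypothetical helper, not part of the commit.

    // Compile-time mapping from a C++ element type to NCCL's runtime type tag.
    #include <nccl.h>

    template <typename Type>
    class NCCLTypeWrapper;

    template <>
    class NCCLTypeWrapper<float> {
     public:
      static const ncclDataType_t type = ncclFloat;
    };

    template <>
    class NCCLTypeWrapper<double> {
     public:
      static const ncclDataType_t type = ncclDouble;
    };

    // Any templated collective can now pick the tag without branching:
    template <typename T>
    ncclResult_t AllReduceSum(const T* in, T* out, size_t count,
                              ncclComm_t comm, cudaStream_t stream) {
      return ncclAllReduce(in, out, count, NCCLTypeWrapper<T>::type, ncclSum,
                           comm, stream);
    }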