supports multiple NCCL communicators preserved in NCCLCommContext (#19407)
* Support multiple NCCL communicators preserved in NCCLCommContext (test=develop)
* Add a unit test for the c_comm_init_all operator and fix a CUDA resource release problem (test=develop)
parent
56dd76538c
commit
efb05ba258
@ -0,0 +1,93 @@
|
||||
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
||||
#include <nccl.h>
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
|
||||
#include "paddle/fluid/framework/executor.h"
|
||||
#include "paddle/fluid/framework/lod_tensor.h"
|
||||
#include "paddle/fluid/framework/op_info.h"
|
||||
#include "paddle/fluid/framework/op_registry.h"
|
||||
#include "paddle/fluid/framework/threadpool.h"
|
||||
#include "paddle/fluid/operators/distributed/distributed.h"
|
||||
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
|
||||
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
||||
#include "paddle/fluid/platform/collective_helper.h"
|
||||
#include "paddle/fluid/platform/nccl_helper.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
class CCommInitAllInferShape : public framework::InferShapeBase {
|
||||
public:
|
||||
~CCommInitAllInferShape() {}
|
||||
void operator()(framework::InferShapeContext* ctx) const override{};
|
||||
};
|
||||
|
||||
class CCommInitAllOp : public framework::OperatorBase {
|
||||
public:
|
||||
CCommInitAllOp(const std::string& type,
|
||||
const framework::VariableNameMap& inputs,
|
||||
const framework::VariableNameMap& outputs,
|
||||
const framework::AttributeMap& attrs)
|
||||
: OperatorBase(type, inputs, outputs, attrs) {}
|
||||
|
||||
void RunImpl(const framework::Scope& scope,
|
||||
const platform::Place& place) const override {
|
||||
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
|
||||
"CCommInitAllOp can run on gpu place only.");
|
||||
|
||||
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
|
||||
std::vector<int> devices = Attr<std::vector<int>>("devices");
|
||||
if (devices.empty()) {
|
||||
devices = platform::GetSelectedDevices();
|
||||
}
|
||||
|
||||
int rid = Attr<int>("ring_id");
|
||||
|
||||
platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
|
||||
#else
|
||||
PADDLE_THROW("PaddlePaddle should compile with GPU.");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Declares the documentation string and the attributes of the
// c_comm_init_all operator. No inputs or outputs are declared here.
class CCommInitAllOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddComment(R"DOC(
CCommInitAll operator

Initialize all collective communication context
)DOC");
    // Defaults to an empty list; CCommInitAllOp::RunImpl substitutes
    // platform::GetSelectedDevices() when the list is empty.
    AddAttr<std::vector<int>>(
        "devices",
        "(std::vector<int>) which devices does the nccl comm initialized on")
        .SetDefault({});
    // Ring id handed to NCCLCommContext::CreateAllNCCLComms; defaults to 0.
    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
        .SetDefault(0);
  }
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
|
||||
REGISTER_OPERATOR(c_comm_init_all, ops::CCommInitAllOp,
|
||||
ops::CCommInitAllInferShape, ops::CCommInitAllOpMaker);
|
@ -0,0 +1,50 @@
|
||||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import paddle.fluid.core as core
|
||||
import paddle.fluid as fluid
|
||||
|
||||
|
||||
class TestCCommInitAllOp(unittest.TestCase):
    """Unit tests for the ``c_comm_init_all`` operator.

    Each test builds a fresh program containing a single ``c_comm_init_all``
    op and runs it with an executor bound to ``CUDAPlace(0)``.
    """

    def setUp(self):
        self.place = fluid.CUDAPlace(0)
        self.exe = fluid.Executor(self.place)

    def _build_program(self, attrs):
        """Return a new program whose global block holds one
        ``c_comm_init_all`` op configured with ``attrs``."""
        program = fluid.Program()
        block = program.global_block()
        block.append_op(type='c_comm_init_all', attrs=attrs)
        return program

    def test_default_attrs(self):
        # Only ``ring_id`` is given; ``devices`` keeps its default value.
        program = self._build_program({'ring_id': 0})
        self.exe.run(program)

    def test_init_with_same_ring_id(self):
        # Initializing the same ring id twice must fail. A ring id that no
        # other test in this class uses is initialized here first, so the
        # test no longer depends on test_default_attrs having already run
        # in the same process (unittest orders tests by method name).
        program = self._build_program({'ring_id': 2})
        self.exe.run(program)
        with self.assertRaises(core.EnforceNotMet):
            self.exe.run(program)

    def test_specifying_devices(self):
        # Explicit device list together with a separate ring id.
        program = self._build_program({'devices': [0], 'ring_id': 1})
        self.exe.run(program)
|
||||
|
||||
|
||||
# Run the test cases in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
Loading…
Reference in new issue