parent: d2d6c3cfb5
commit: 171b468bb3
@ -0,0 +1,173 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/ascend/ascend_bucket.h"

#include <vector>
#include <memory>
#include "runtime/mem.h"
#include "external/hccl/hccl.h"
#include "runtime/device/ascend/ascend_memory_pool.h"
#include "backend/kernel_compiler/hccl/hcom_util.h"
#include "backend/kernel_compiler/hccl/hccl_context.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/ascend/ascend_event.h"
#include "utils/profile.h"

#define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message)      \
  {                                                              \
    rtError_t ret = (expression);                                \
    if (ret != RT_ERROR_NONE) {                                  \
      MS_LOG(EXCEPTION) << message << ", error code: " << ret;   \
    }                                                            \
  }
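For reference, a call site wraps a single Ascend runtime API invocation, and the message is prefixed to the numeric error code; a minimal hypothetical example (not part of this commit):

  // Hypothetical call site: throws via MS_LOG(EXCEPTION) when the runtime call fails.
  CHECK_ASCEND_RT_WITH_EXCEPTION(rtStreamSynchronize(stream), "Call rtStreamSynchronize failed");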
namespace mindspore::device::ascend {
void AscendBucket::AllocateAllReduceAddr() {
  // The bucket must be full before allocating.
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "grad tensor list size:" << grad_tensor_list_.size()
                      << " is not equal to bucket size:" << bucket_size_;
  }

  auto total_size = 0;
  std::vector<size_t> align_size_list;
  std::vector<size_t> origin_size_list;
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor_type_list_.emplace_back(tensor->data_type());
    DeviceAddressPtr device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
    MS_EXCEPTION_IF_NULL(device_address);
    auto origin_size = device_address->GetSize();
    auto align_size = MemoryManager::GetCommonAlignSize(origin_size);
    origin_size_list.emplace_back(origin_size);
    align_size_list.emplace_back(align_size);
    total_size += align_size;
    memcpy_input_addrs_.emplace_back(std::make_shared<kernel::Address>(
      static_cast<uint8_t *>(device_address->GetMutablePtr()), device_address->GetSize()));
  }

  total_size_ = total_size;

  auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
  MS_EXCEPTION_IF_NULL(runtime_instance);
  // The AllReduce input and output addresses need to be zeroed before use.
  ar_input_addr_ = runtime_instance->MallocCommunicationMemFromMemPool(total_size);
  ar_output_addr_ = runtime_instance->MallocCommunicationMemFromMemPool(total_size);

  // Generate memcpy output addresses: each tensor's slice starts at an aligned offset in the fused input buffer.
  uint8_t *memcpy_output = ar_input_addr_;
  for (size_t i = 0; i < bucket_size_; ++i) {
    memcpy_output_addrs_.emplace_back(std::make_shared<kernel::Address>(memcpy_output, origin_size_list[i]));
    memcpy_output += align_size_list[i];
  }

  // Store the output tensor addresses: after AllReduce, the gradients point into the fused output buffer.
  uint8_t *tensor_output = ar_output_addr_;
  for (size_t i = 0; i < bucket_size_; ++i) {
    new_tensor_output_addrs_.emplace_back(tensor_output);
    tensor_output += align_size_list[i];
  }
}
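The loops above pack every gradient at an aligned offset inside one fused buffer; a standalone sketch of the same offset arithmetic (hypothetical sizes, assuming a 32-byte common alignment):

  // Sketch: compute each tensor's slice offset inside the fused buffer.
  // With origin sizes {6, 100} and hypothetical 32-byte alignment, the align
  // sizes are {32, 128}: slice 0 starts at offset 0, slice 1 at offset 32,
  // and the fused buffer is 160 bytes in total.
  std::vector<size_t> offsets;
  size_t offset = 0;
  for (size_t align_size : align_size_list) {
    offsets.push_back(offset);  // where this tensor's slice begins
    offset += align_size;       // next slice starts after the aligned slice
  }
  // offset now equals total_size_, the size of both ar_input_addr_ and ar_output_addr_.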

void AscendBucket::FreeDeviceMem(void *dev_ptr) { AscendMemoryPool::GetInstance().FreeTensorMem(dev_ptr); }

void AscendBucket::FreeAllDeviceMem() {
  if (ar_input_addr_ != nullptr) {
    // MallocCommunicationMemFromMemPool returns an address offset by kMemAlignSize
    // from the pool allocation, so step back to the original base pointer before freeing.
    uint8_t *origin_dev_addr = ar_input_addr_ - kMemAlignSize;
    FreeDeviceMem(origin_dev_addr);
    ar_input_addr_ = nullptr;
  }
  if (ar_output_addr_ != nullptr) {
    uint8_t *origin_dev_addr = ar_output_addr_ - kMemAlignSize;
    FreeDeviceMem(origin_dev_addr);
    ar_output_addr_ = nullptr;
  }
}

void AscendBucket::CopyTensorToContiguousMemory() {
  // Clean the fused input buffer before copying the gradient slices into it.
  CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
                                 "Call rtMemsetAsync failed");

  for (size_t i = 0; i < bucket_size_; ++i) {
    MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
    MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
    MS_LOG(DEBUG) << "MemcpyAsync dst size:" << memcpy_output_addrs_[i]->size
                  << " src size:" << memcpy_input_addrs_[i]->size;
    if (memcpy_output_addrs_[i]->size < memcpy_input_addrs_[i]->size) {
      MS_LOG(EXCEPTION) << "rtMemcpyAsync dst size < src size";
    }

    CHECK_ASCEND_RT_WITH_EXCEPTION(
      rtMemcpyAsync(memcpy_output_addrs_[i]->addr, memcpy_output_addrs_[i]->size, memcpy_input_addrs_[i]->addr,
                    memcpy_input_addrs_[i]->size, RT_MEMCPY_DEVICE_TO_DEVICE, compute_stream_),
      "Call rtMemcpyAsync failed");
  }
}

void AscendBucket::LaunchAllReduce() {
  if (tensor_type_list_.empty()) {
    MS_LOG(EXCEPTION) << "No tensor type found";
  }

  // All AllReduce inputs must share the same data type.
  auto type = tensor_type_list_[0];
  if (std::any_of(tensor_type_list_.begin(), tensor_type_list_.end(),
                  [&type](TypeId tensor_type) { return type != tensor_type; })) {
    MS_LOG(EXCEPTION) << "AllReduce inputs have different dtypes";
  }

  auto iter = CONST_OP_HCOM_DATA_TYPE_MAP.find(type);
  if (iter == CONST_OP_HCOM_DATA_TYPE_MAP.end()) {
    MS_LOG(EXCEPTION) << "unknown data type:" << type;
  }

  uint32_t type_size;
  if (!HcomUtil::GetHcomTypeSize(iter->second, &type_size)) {
    MS_LOG(EXCEPTION) << "get hcom type size failed";
  }

  if (type_size == 0 || total_size_ % type_size != 0) {
    MS_LOG(EXCEPTION) << "total_size[" << total_size_ << "] is not divisible by type_size[" << type_size << "]";
  }
  // HCCL counts elements, not bytes.
  auto hccl_count = total_size_ / type_size;

  HcclReduceOp op_type = HcclReduceOp::HCCL_REDUCE_SUM;
  auto hccl_result = HcclAllReduce(ar_input_addr_, ar_output_addr_, hccl_count, iter->second, op_type,
                                   kernel::HcclContext::GetInstance().hccl_comm(), stream_);
  if (hccl_result != HCCL_SUCCESS) {
    MS_LOG(EXCEPTION) << "HcclAllReduce failed, ret:" << hccl_result;
  }
}

void AscendBucket::Init() {
  pre_event_ = std::make_shared<AscendEvent>();
  post_event_ = std::make_shared<AscendEvent>();

  auto kernel_runtime = KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
  MS_EXCEPTION_IF_NULL(kernel_runtime);
  compute_stream_ = kernel_runtime->compute_stream();
  stream_ = kernel_runtime->communication_stream();

  MS_EXCEPTION_IF_NULL(pre_event_);
  MS_EXCEPTION_IF_NULL(post_event_);
  // pre_event_ is recorded on the compute stream and waited on by the communication
  // stream, so the AllReduce starts only after the memcpys into the fused buffer finish.
  pre_event_->set_wait_stream(stream_);
  pre_event_->set_record_stream(compute_stream_);
  // post_event_ is recorded on the communication stream and waited on by the compute
  // stream, so downstream kernels see the reduced gradients.
  post_event_->set_wait_stream(compute_stream_);
  post_event_->set_record_stream(stream_);
}
} // namespace mindspore::device::ascend
@ -0,0 +1,38 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_

#include "runtime/device/bucket.h"

namespace mindspore::device::ascend {
class AscendBucket : public Bucket {
 public:
  AscendBucket(uint32_t id, uint32_t bucket_size) : Bucket(id, bucket_size) {}
  ~AscendBucket() override = default;

  void Init() override;

 private:
  void AllocateAllReduceAddr() override;
  void FreeAllDeviceMem() override;
  void FreeDeviceMem(void *dev_ptr) override;
  void CopyTensorToContiguousMemory() override;
  void LaunchAllReduce() override;
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
@ -0,0 +1,60 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/ascend/ascend_event.h"

#include "runtime/event.h"
#include "runtime/stream.h"
#include "utils/log_adapter.h"

namespace mindspore::device::ascend {
AscendEvent::AscendEvent() {
  auto ret = rtEventCreate(&event_);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret;
    event_ = nullptr;
  }
}

AscendEvent::~AscendEvent() {
  auto ret = rtEventDestroy(event_);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "rtEventDestroy failed, ret:" << ret;
  }
}

void AscendEvent::RecordEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  MS_EXCEPTION_IF_NULL(record_stream_);
  auto ret = rtEventRecord(event_, record_stream_);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "rtEventRecord failed, ret:" << ret;
  }
  need_wait_ = true;
}

void AscendEvent::WaitEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  MS_EXCEPTION_IF_NULL(wait_stream_);
  auto ret = rtStreamWaitEvent(wait_stream_, event_);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "rtStreamWaitEvent failed, ret:" << ret;
  }
  need_wait_ = false;
}

bool AscendEvent::NeedWait() { return need_wait_; }
} // namespace mindspore::device::ascend
@ -0,0 +1,41 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_ASCEND_EVENT_H
#define MINDSPORE_ASCEND_EVENT_H

#include "runtime/base.h"
#include "ir/device_event.h"
namespace mindspore::device::ascend {
class AscendEvent : public DeviceEvent {
 public:
  AscendEvent();
  ~AscendEvent() override;

  void WaitEvent() override;
  void RecordEvent() override;
  bool NeedWait() override;
  void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
  void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }

 private:
  rtEvent_t event_{nullptr};
  rtStream_t wait_stream_{nullptr};
  rtStream_t record_stream_{nullptr};
  bool need_wait_{false};
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_ASCEND_EVENT_H
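A minimal sketch of the record/wait pattern this class supports (hypothetical driver code; AscendBucket::Init wires the same pairing for its pre and post events):

  // Producer work runs on compute_stream; consumer work runs on comm_stream.
  auto event = std::make_shared<mindspore::device::ascend::AscendEvent>();
  event->set_record_stream(compute_stream);  // stream whose progress is marked
  event->set_wait_stream(comm_stream);       // stream that must wait for the mark
  // ... enqueue producer kernels on compute_stream ...
  event->RecordEvent();  // rtEventRecord on the record stream
  event->WaitEvent();    // rtStreamWaitEvent: the wait stream blocks until the mark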
@ -0,0 +1,106 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/bucket.h"

#include <memory>
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/profile.h"

namespace mindspore::device {
void Bucket::AddGradTensor(const tensor::TensorPtr &tensor) {
  if (grad_tensor_list_.size() >= bucket_size_) {
    MS_LOG(EXCEPTION) << "bucket is full";
  }
  grad_tensor_list_.emplace_back(tensor);
  if (grad_tensor_list_.size() > bucket_size_) {
    MS_LOG(EXCEPTION) << "too many tensors added to the bucket, bucket_size_:" << bucket_size_
                      << " total tensor size:" << grad_tensor_list_.size();
  }
  MS_LOG(INFO) << "current bucket tensors size:" << grad_tensor_list_.size();
  // Bucket is full; the caller can now launch the fused AllReduce.
  if (grad_tensor_list_.size() == bucket_size_) {
    full_ = true;
  }
}

void Bucket::Launch() {
  auto start = GetTime();
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "Bucket is not full, grad_tensor_list_ size:" << grad_tensor_list_.size()
                      << " bucket_size_:" << bucket_size_;
  }
  MS_LOG(INFO) << "Bucket is full, start to launch AllReduce";
  MS_EXCEPTION_IF_NULL(pre_event_);
  MS_EXCEPTION_IF_NULL(post_event_);
  AllocateAllReduceAddr();
  CopyTensorToContiguousMemory();
  // The pre event makes the communication stream wait until the copies issued
  // on the compute stream have finished before the AllReduce starts.
  pre_event_->RecordEvent();
  pre_event_->WaitEvent();
  LaunchAllReduce();
  // The post event marks the end of the AllReduce on the communication stream.
  post_event_->RecordEvent();
  UpdateTensorAddr();
  // Pass the post event to each tensor so its consumers wait for the AllReduce to finish.
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor->SetDeviceEvent(post_event_);
  }
  MS_LOG(INFO) << "Bucket launch cost:" << (GetTime() - start) * 1e6 << " us";
}

// TODO(caifubi): float16 grad cast to float32 grad

void Bucket::UpdateTensorAddr() {
  if (grad_tensor_list_.size() != bucket_size_ || new_tensor_output_addrs_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "grad_tensor_list size:" << grad_tensor_list_.size()
                      << " tensor output addr size:" << new_tensor_output_addrs_.size()
                      << " bucket size:" << bucket_size_;
  }

  for (size_t i = 0; i < bucket_size_; ++i) {
    auto &tensor = grad_tensor_list_[i];
    MS_EXCEPTION_IF_NULL(tensor);
    auto device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
    // Record the old address for lazy release and let this Bucket manage the new one.
    MS_EXCEPTION_IF_NULL(device_address);
    auto origin_dev_ptr = device_address->GetMutablePtr();
    // FreeDeviceMem(origin_dev_ptr);
    tensor_old_addr_list_.emplace_back(origin_dev_ptr);
    device_address->from_mem_pool_ = false;
    device_address->set_ptr(new_tensor_output_addrs_[i]);
  }
}

void Bucket::LazyDeleteOldAddr() {
  MS_LOG(INFO) << "Lazy delete old grad address";
  for (auto old_addr : tensor_old_addr_list_) {
    FreeDeviceMem(old_addr);
  }
  tensor_old_addr_list_.clear();
}

void Bucket::Release() {
  MS_LOG(INFO) << "Clear bucket:" << id_;
  grad_tensor_list_.clear();
  new_tensor_output_addrs_.clear();
  memcpy_input_addrs_.clear();
  memcpy_output_addrs_.clear();
  tensor_type_list_.clear();
  LazyDeleteOldAddr();
  FreeAllDeviceMem();
  full_ = false;
}
} // namespace mindspore::device
@ -0,0 +1,83 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_BUCKET_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_BUCKET_H_

#include <vector>
#include <utility>
#include <string>
#include <memory>
#include "ir/anf.h"
#include "ir/device_event.h"
#include "runtime/device/device_address.h"
#include "backend/session/kernel_graph.h"

namespace mindspore::device {
class Bucket {
 public:
  Bucket(uint32_t id, uint32_t bucket_size)
      : id_(id),
        bucket_size_(bucket_size),
        full_(false),
        stream_(nullptr),
        compute_stream_(nullptr),
        pre_event_(nullptr),
        post_event_(nullptr),
        total_size_(0),
        ar_input_addr_(nullptr),
        ar_output_addr_(nullptr) {}
  virtual ~Bucket() = default;

  uint32_t id() const { return id_; }
  bool full() const { return full_; }
  void Launch();
  void Release();
  void AddGradTensor(const tensor::TensorPtr &tensor);
  virtual void Init() = 0;

 protected:
  uint32_t id_;
  uint32_t bucket_size_;
  bool full_;
  void *stream_;
  void *compute_stream_;

  std::shared_ptr<DeviceEvent> pre_event_;
  std::shared_ptr<DeviceEvent> post_event_;

  size_t total_size_;
  uint8_t *ar_input_addr_;
  uint8_t *ar_output_addr_;
  std::string group_;
  std::vector<tensor::TensorPtr> grad_tensor_list_;
  std::vector<uint8_t *> new_tensor_output_addrs_;
  std::vector<kernel::AddressPtr> memcpy_input_addrs_;
  std::vector<kernel::AddressPtr> memcpy_output_addrs_;
  std::vector<TypeId> tensor_type_list_;
  std::vector<void *> tensor_old_addr_list_;

  virtual void AllocateAllReduceAddr() = 0;
  void UpdateTensorAddr();
  virtual void LaunchAllReduce() = 0;
  virtual void FreeAllDeviceMem() = 0;
  virtual void FreeDeviceMem(void *dev_ptr) = 0;
  virtual void CopyTensorToContiguousMemory() = 0;
  void LazyDeleteOldAddr();
};
} // namespace mindspore::device

#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_BUCKET_H_
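A usage sketch of this interface (hypothetical driver code, assuming a concrete subclass such as AscendBucket and a `grads` tensor list):

  // Fill the bucket with gradients; once full, launch the fused AllReduce.
  auto bucket = std::make_shared<AscendBucket>(/*id=*/0, /*bucket_size=*/grads.size());
  bucket->Init();  // create pre/post events and bind compute/communication streams
  for (const auto &grad : grads) {
    bucket->AddGradTensor(grad);
  }
  if (bucket->full()) {
    bucket->Launch();   // copy into contiguous memory, AllReduce, update tensor addrs
    bucket->Release();  // clear state and lazily free the old gradient addresses
  }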
@ -0,0 +1,177 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/gpu/gpu_bucket.h"

#include <cuda_runtime_api.h>
#include <nccl.h>
#include <vector>
#include <memory>
#include "abstract/utils.h"
#include "runtime/device/gpu/gpu_event.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
#include "runtime/device/gpu/gpu_common.h"

namespace {
const size_t kCommunicationMemAlignSize = 16;
size_t AlignMemorySize(size_t size) {
  if (size == 0) {
    return kCommunicationMemAlignSize;
  }
  return ((size + kCommunicationMemAlignSize - 1) / kCommunicationMemAlignSize) * kCommunicationMemAlignSize;
}
} // namespace
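AlignMemorySize rounds a size up to the next multiple of kCommunicationMemAlignSize (16) and maps 0 to 16, so every slice occupies at least one aligned slot; a few worked values:

  // AlignMemorySize(0)  == 16   (zero-sized input still gets one slot)
  // AlignMemorySize(1)  == 16   ((1 + 15) / 16 * 16)
  // AlignMemorySize(16) == 16   (already aligned)
  // AlignMemorySize(17) == 32   ((17 + 15) / 16 * 16)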
namespace mindspore::device::gpu {
GPUBucket::GPUBucket(uint32_t id, uint32_t bucket_size) : Bucket(id, bucket_size), collective_handle_(nullptr) {
  group_ = kNcclWorldGroup;
}

void GPUBucket::AllocateAllReduceAddr() {
  MS_LOG(INFO) << "start";
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "grad tensor list size:" << grad_tensor_list_.size()
                      << " is not equal to bucket size:" << bucket_size_;
  }

  auto total_size = 0;
  std::vector<size_t> size_list;
  std::vector<size_t> align_size_list;
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor_type_list_.emplace_back(tensor->data_type());
    DeviceAddressPtr device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
    MS_EXCEPTION_IF_NULL(device_address);
    auto origin_size = device_address->GetSize();
    auto align_size = AlignMemorySize(origin_size);
    size_list.emplace_back(origin_size);
    align_size_list.emplace_back(align_size);
    total_size += align_size;
    memcpy_input_addrs_.emplace_back(
      std::make_shared<kernel::Address>(static_cast<uint8_t *>(device_address->GetMutablePtr()), origin_size));
  }
  total_size_ = total_size;

  ar_input_addr_ = static_cast<uint8_t *>(GPUMemoryAllocator::GetInstance().AllocTensorMem(total_size));
  ar_output_addr_ = static_cast<uint8_t *>(GPUMemoryAllocator::GetInstance().AllocTensorMem(total_size));

  uint8_t *memcpy_output = ar_input_addr_;
  for (size_t i = 0; i < bucket_size_; ++i) {
    memcpy_output_addrs_.emplace_back(std::make_shared<kernel::Address>(memcpy_output, size_list[i]));
    memcpy_output += align_size_list[i];
  }

  uint8_t *tensor_output = ar_output_addr_;
  for (size_t i = 0; i < bucket_size_; ++i) {
    new_tensor_output_addrs_.emplace_back(tensor_output);
    tensor_output += align_size_list[i];
  }
  MS_LOG(INFO) << "end";
}

void GPUBucket::FreeDeviceMem(void *dev_ptr) { GPUMemoryAllocator::GetInstance().FreeTensorMem(dev_ptr); }

void GPUBucket::FreeAllDeviceMem() {
  MS_LOG(INFO) << "start";
  if (ar_input_addr_ != nullptr) {
    FreeDeviceMem(ar_input_addr_);
    ar_input_addr_ = nullptr;
  }
  if (ar_output_addr_ != nullptr) {
    FreeDeviceMem(ar_output_addr_);
    ar_output_addr_ = nullptr;
  }
  MS_LOG(INFO) << "end";
}

void GPUBucket::CopyTensorToContiguousMemory() {
  MS_LOG(INFO) << "start";
  MS_EXCEPTION_IF_NULL(compute_stream_);
  // Clean the AllReduce input buffer before copying the gradient slices into it.
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemsetAsync(ar_input_addr_, 0, total_size_, static_cast<cudaStream_t>(compute_stream_)),
    "Call cudaMemsetAsync failed");

  for (size_t i = 0; i < bucket_size_; ++i) {
    MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
    MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
    if (!GPUDeviceManager::GetInstance().CopyDeviceMemToDeviceAsync(memcpy_output_addrs_[i]->addr,
                                                                    memcpy_input_addrs_[i]->addr,
                                                                    memcpy_output_addrs_[i]->size, compute_stream_)) {
      MS_LOG(EXCEPTION) << "Copy memory failed";
    }
  }
  MS_LOG(INFO) << "end";
}

void GPUBucket::LaunchAllReduce() {
  MS_LOG(INFO) << "start";
  // The AllReduce entry point is resolved from the collective library at runtime.
  collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
  auto all_reduce_funcptr =
    reinterpret_cast<kernel::AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
  MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
  MS_EXCEPTION_IF_NULL(stream_);

  if (tensor_type_list_.empty()) {
    MS_LOG(EXCEPTION) << "No tensor type found";
  }
  auto type = tensor_type_list_[0];
  if (std::any_of(tensor_type_list_.begin(), tensor_type_list_.end(),
                  [&type](TypeId tensor_type) { return type != tensor_type; })) {
    MS_LOG(EXCEPTION) << "AllReduce inputs have different dtypes";
  }

  auto type_size = abstract::TypeIdSize(type);
  if (type_size == 0) {
    MS_LOG(EXCEPTION) << "Invalid type:" << type;
  }

  // Map the TypeId to the corresponding NCCL data type.
  auto nccl_data_type_iter = kernel::kNcclDtypeMap.find(TypeIdLabel(type));
  if (nccl_data_type_iter == kernel::kNcclDtypeMap.end()) {
    MS_LOG(EXCEPTION) << "Invalid type:" << type;
  }

  // NCCL counts elements, not bytes, hence total_size_ / type_size.
  auto nccl_result =
    (*all_reduce_funcptr)(ar_input_addr_, ar_output_addr_, total_size_ / type_size, nccl_data_type_iter->second,
                          ncclRedOp_t::ncclSum, static_cast<cudaStream_t>(stream_), group_);
  if (nccl_result != ncclSuccess) {
    MS_LOG(EXCEPTION) << "AllReduce failed, ret:" << nccl_result;
  }

  MS_LOG(INFO) << "end";
}

void GPUBucket::Init() {
  pre_event_ = std::make_shared<GpuEvent>();
  post_event_ = std::make_shared<GpuEvent>();

  auto kernel_runtime = KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
  MS_EXCEPTION_IF_NULL(kernel_runtime);
  stream_ = kernel_runtime->communication_stream();
  compute_stream_ = kernel_runtime->compute_stream();

  MS_EXCEPTION_IF_NULL(pre_event_);
  MS_EXCEPTION_IF_NULL(post_event_);
  // Same pairing as on Ascend: the pre event gates the communication stream on the
  // compute stream's copies; the post event gates the compute stream on the AllReduce.
  pre_event_->set_record_stream(compute_stream_);
  pre_event_->set_wait_stream(stream_);
  post_event_->set_record_stream(stream_);
  post_event_->set_wait_stream(compute_stream_);
}
} // namespace mindspore::device::gpu
@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUCKET_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUCKET_H_

#include "runtime/device/bucket.h"

namespace mindspore::device::gpu {
class GPUBucket : public Bucket {
 public:
  GPUBucket(uint32_t id, uint32_t bucket_size);
  ~GPUBucket() override = default;

  void Init() override;

 private:
  void AllocateAllReduceAddr() override;
  void FreeAllDeviceMem() override;
  void FreeDeviceMem(void *dev_ptr) override;
  void CopyTensorToContiguousMemory() override;
  void LaunchAllReduce() override;

  const void *collective_handle_;
};
} // namespace mindspore::device::gpu
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUCKET_H_
@ -0,0 +1,46 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/gpu/gpu_event.h"
#include "runtime/device/gpu/gpu_common.h"

namespace mindspore::device::gpu {
GpuEvent::GpuEvent() {
  auto ret = cudaEventCreate(&event_);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaEventCreate failed, ret:" << ret;
    event_ = nullptr;
  }
}

GpuEvent::~GpuEvent() { CHECK_CUDA_RET_WITH_ERROR_NOTRACE(cudaEventDestroy(event_), "cudaEventDestroy failed"); }

void GpuEvent::WaitEvent() {
  MS_EXCEPTION_IF_NULL(wait_stream_);
  MS_EXCEPTION_IF_NULL(event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaStreamWaitEvent(wait_stream_, event_, 0), "cudaStreamWaitEvent failed");
  need_wait_ = false;
}

void GpuEvent::RecordEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  MS_EXCEPTION_IF_NULL(record_stream_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventRecord(event_, record_stream_), "cudaEventRecord failed");
  need_wait_ = true;
}

bool GpuEvent::NeedWait() { return need_wait_; }
} // namespace mindspore::device::gpu
@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_EVENT_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_EVENT_H_

#include <cuda_runtime_api.h>
#include "ir/device_event.h"

namespace mindspore::device::gpu {
class GpuEvent : public DeviceEvent {
 public:
  GpuEvent();
  ~GpuEvent() override;

  void WaitEvent() override;
  void RecordEvent() override;
  bool NeedWait() override;
  void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
  void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }

 private:
  cudaEvent_t event_{nullptr};
  cudaStream_t wait_stream_{nullptr};
  cudaStream_t record_stream_{nullptr};
  bool need_wait_{false};
};
} // namespace mindspore::device::gpu
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_EVENT_H_
Some files were not shown because too many files have changed in this diff.