From fcaf86f5d9a029c5f4c29028b65fd73eba0b507b Mon Sep 17 00:00:00 2001
From: lizhenyu
Date: Fri, 21 Aug 2020 16:37:40 +0800
Subject: [PATCH] fix nccl kernel memory alignment bug

---
 .../backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h   | 7 ++++---
 mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h
index 0ac125321e..529045c543 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h
@@ -122,6 +122,8 @@ class NcclGpuKernel : public GpuKernel {
   }
   bool Init(const CNodePtr &kernel_node) override {
     nccl_data_type_ = kNcclDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))];
+    InferCommType(kernel_node);
+
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
     for (size_t i = 0; i < input_num; ++i) {
@@ -130,7 +132,7 @@
       for (size_t j = 0; j < shape.size(); j++) {
         size *= IntToSize(shape[j]);
       }
-      size_t aligned_size = AlignMemorySize(size);
+      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
       input_size_list_.push_back(aligned_size);
       input_size_ += aligned_size;
     }
@@ -140,12 +142,11 @@
       for (size_t j = 0; j < shape.size(); j++) {
         size *= IntToSize(shape[j]);
       }
-      size_t aligned_size = AlignMemorySize(size);
+      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
       output_size_list_.push_back(aligned_size);
       output_size_ += aligned_size;
     }
 
-    InferCommType(kernel_node);
     group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
     MS_LOG(INFO) << AnfAlgo::GetCNodeName(kernel_node) << " for group " << group_name_;
     auto comm_stream_attr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stream_id");
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 40662a334f..f8664875e8 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -40,8 +40,8 @@ bool GPUDeviceAddress::SyncDeviceToHost(const std::vector<int> &, size_t size, T
     return ret;
   }
   if (size != size_) {
-    // nccl kernel input and outpu memory size is aligned, may lead to sync memory size is inconformity
-    MS_LOG(INFO) << "Sync memory size is inconformity, host size: " << size << ", device size " << size_;
+    // nccl kernel input and output device addresses are aligned, so the host size may not equal the device size
+    MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_;
   }
   return GPUDeviceManager::GetInstance().CopyDeviceMemToHost(host_ptr, ptr_, size);
 }
@@ -51,8 +51,8 @@ bool GPUDeviceAddress::SyncHostToDevice(const std::vector<int> &, size_t size, T
   auto &stream = GPUDeviceManager::GetInstance().default_stream();
   MS_EXCEPTION_IF_NULL(stream);
   if (size != size_) {
-    // nccl kernel input and outpu memory size is aligned, may lead to sync memory size is inconformity
-    MS_LOG(INFO) << "Sync memory size is inconformity, host size: " << size << ", device size " << size_;
+    // nccl kernel input and output device addresses are aligned, so the host size may not equal the device size
+    MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_;
   }
   if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
     MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
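
Background for the log message above: the NCCL kernels request device buffers whose byte size is rounded up by AlignMemorySize, while the host tensor keeps its exact size, so size and size_ can legitimately differ. Below is a minimal, self-contained sketch of that rounding, assuming a 512-byte alignment unit; the constant and the standalone helper here are illustrative assumptions, not code taken from this patch.

// Sketch only (not part of the patch): how alignment can make the device
// buffer larger than the host tensor. kMemAlignSize = 512 is an assumed
// value; the real helper lives in the GPU kernel headers.
#include <cstddef>
#include <iostream>

constexpr std::size_t kMemAlignSize = 512;

std::size_t AlignMemorySize(std::size_t size) {
  // Round size up to the next multiple of kMemAlignSize.
  return ((size + kMemAlignSize - 1) / kMemAlignSize) * kMemAlignSize;
}

int main() {
  // A float32 tensor with 1000 elements: 4000 bytes on the host, but
  // 4096 bytes once aligned on the device, hence size != size_ in
  // SyncDeviceToHost / SyncHostToDevice (only `size` bytes are copied).
  std::size_t host_size = 1000 * sizeof(float);
  std::cout << "host: " << host_size << " bytes, device: " << AlignMemorySize(host_size) << " bytes\n";
  return 0;
}

With the change in nccl_gpu_kernel.h, only AllReduce keeps the aligned buffer size; the other NCCL kernels record the exact tensor size, so their host and device sizes match again.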