From 4914cb804724109d398ee4923b3980e321e572f6 Mon Sep 17 00:00:00 2001 From: lvchangquan Date: Tue, 30 Mar 2021 19:33:21 +0800 Subject: [PATCH] fix a bug in launch allreduce: record tensor output addrs when gradients_mean is off --- mindspore/ccsrc/runtime/device/bucket.cc | 11 ++++++++--- mindspore/ccsrc/runtime/device/bucket.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/runtime/device/bucket.cc b/mindspore/ccsrc/runtime/device/bucket.cc index dcf54d76e8..7c9fef7a03 100644 --- a/mindspore/ccsrc/runtime/device/bucket.cc +++ b/mindspore/ccsrc/runtime/device/bucket.cc @@ -92,6 +92,7 @@ void Bucket::CalculateMean() { MS_EXCEPTION_IF_NULL(parallel_context); auto grad_mean = parallel_context->gradients_mean(); if (!grad_mean) { + UpdateTensorOutputAddr(ar_output_addr_); return; } if (launch_mul_ == nullptr) { @@ -102,12 +103,16 @@ void Bucket::CalculateMean() { launch_mul_->SetInputAddr(ar_output_addr_); // launch mean launch_mul_->LaunchOpKernel(); - // store output tensor addr + // store tensor output addr auto launch_output = launch_mul_->GetKernelOutputAddr(); if (launch_output.size() != 1) { - MS_LOG(ERROR) << "launch mul outputs should have one output"; + MS_LOG(EXCEPTION) << "launch mul outputs should have one output"; } - uint8_t *tensor_output = launch_output[0]; + UpdateTensorOutputAddr(launch_output[0]); +} + +void Bucket::UpdateTensorOutputAddr(uint8_t *addr) { + uint8_t *tensor_output = addr; for (size_t i = 0; i < bucket_size_; ++i) { new_tensor_output_addrs_.emplace_back(tensor_output); tensor_output += align_size_list_[i]; diff --git a/mindspore/ccsrc/runtime/device/bucket.h b/mindspore/ccsrc/runtime/device/bucket.h index 4bbe8f4a8e..f65c4d3cea 100644 --- a/mindspore/ccsrc/runtime/device/bucket.h +++ b/mindspore/ccsrc/runtime/device/bucket.h @@ -84,6 +84,7 @@ class Bucket { virtual void FreeAllDeviceMem() = 0; virtual void FreeDeviceMem(void *dev_ptr) = 0; virtual void CopyTensorToContiguousMemory() = 0; + void UpdateTensorOutputAddr(uint8_t *addr); void
LazyDeleteOldAddr(); }; } // namespace mindspore::device