diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc index 900458992e..905d024f5e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc @@ -43,7 +43,15 @@ int ConcatFp16CPUKernel::Init() { int ConcatFp16CPUKernel::ReSize() { FreeTmpBuffer(); + auto ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + return ret; + } + return ConcatBaseCPUKernel::ReSize(); +} +int ConcatFp16CPUKernel::MallocTmpBuffer() { for (const auto &in_tensor : in_tensors_) { float16_t *ptr = nullptr; if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { @@ -58,10 +66,6 @@ int ConcatFp16CPUKernel::ReSize() { auto &out_tensor = out_tensors_.at(0); if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { - if (fp16_output_ != nullptr) { - context_->allocator->Free(fp16_output_); - fp16_output_ = nullptr; - } fp16_output_ = reinterpret_cast(context_->allocator->Malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum())); if (fp16_output_ == nullptr) { @@ -70,17 +74,29 @@ int ConcatFp16CPUKernel::ReSize() { } } - return ConcatBaseCPUKernel::ReSize(); + return RET_OK; } void ConcatFp16CPUKernel::FreeTmpBuffer() { - for (auto ptr : fp16_inputs_) { - if (ptr != nullptr) { - context_->allocator->Free(ptr); - ptr = nullptr; + for (auto i = 0; i < fp16_inputs_.size(); i++) { + auto &in_tensor = in_tensors_.at(i); + auto in_ptr = fp16_inputs_.at(i); + if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { + if (in_ptr != nullptr) { + context_->allocator->Free(in_ptr); + in_ptr = nullptr; + } } } fp16_inputs_.clear(); + + auto &out_tensor = out_tensors_.at(0); + if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { + if (fp16_output_ != nullptr) { + context_->allocator->Free(fp16_output_); + fp16_output_ = nullptr; + } + } } int ConcatFp16CPUKernel::Run() { @@ -119,24 +135,10 @@ int ConcatFp16CPUKernel::Run() { ConcatFp16(reinterpret_cast(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(), output_shape.size(), reinterpret_cast(fp16_output_)); - // free fp16 in out buffer if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { Float16ToFloat32(fp16_output_, reinterpret_cast(output_addr), out_tensors_.at(0)->ElementsNum()); - context_->allocator->Free(fp16_output_); - fp16_output_ = nullptr; } - for (auto i = 0; i < fp16_inputs_.size(); i++) { - const auto in_tensor = in_tensors_[i]; - if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { - auto ptr = fp16_inputs_[i]; - if (ptr != nullptr) { - context_->allocator->Free(ptr); - ptr = nullptr; - } - } - } - fp16_inputs_.clear(); - + FreeTmpBuffer(); return RET_OK; } @@ -164,5 +166,5 @@ kernel::LiteKernel *CpuConcatFp16KernelCreator(const std::vectorallocator->Free(fp16_input_); - fp16_input_ = nullptr; - } - auto in_tensor = in_tensors_.front(); - if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { - fp16_input_ = - reinterpret_cast(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); - if (fp16_input_ == nullptr) { - return RET_ERROR; - } - Float32ToFloat16(reinterpret_cast(in_tensor->Data()), fp16_input_, in_tensor->ElementsNum()); + FreeTmpBuffer(); + auto ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + return ret; } - return MallocTmpBuffer(); + return RET_OK; } int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { @@ -99,9 +92,13 @@ int ReduceFp16CPUKernel::Run() { tmp_shape_ = in_tensors_.at(0)->shape(); auto in_tensor = in_tensors_.at(0); - if (in_tensor->data_type() == kNumberTypeFloat16) { + if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { + auto input_data = reinterpret_cast(in_tensor->Data()); + Float32ToFloat16(input_data, fp16_input_, in_tensor->ElementsNum()); + } else { fp16_input_ = reinterpret_cast(in_tensor->Data()); } + fp16_src_data_ = fp16_input_; for (int i = 0; i < data_buffers_.size(); ++i) { fp16_dst_data_ = data_buffers_[i]; @@ -117,6 +114,7 @@ int ReduceFp16CPUKernel::Run() { axis_size_ = tmp_shape_[axis]; auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { + FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; return RET_ERROR; } @@ -132,16 +130,11 @@ int ReduceFp16CPUKernel::Run() { memcpy(out_tensor->Data(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t)); } - if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { - context_->allocator->Free(fp16_input_); - } - fp16_input_ = nullptr; - FreeTmpBuffer(); return RET_OK; } -int ReduceFp16CPUKernel::FreeTmpBuffer() { +void ReduceFp16CPUKernel::FreeTmpBuffer() { for (auto buffer : data_buffers_) { if (buffer != nullptr) { context_->allocator->Free(buffer); @@ -149,12 +142,17 @@ int ReduceFp16CPUKernel::FreeTmpBuffer() { } } data_buffers_.clear(); - return RET_OK; + + auto in_tensor = in_tensors_.at(0); + if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { + if (fp16_input_ != nullptr) { + context_->allocator->Free(fp16_input_); + fp16_input_ = nullptr; + } + } } int ReduceFp16CPUKernel::MallocTmpBuffer() { - auto ret = FreeTmpBuffer(); - auto input_shape = in_tensors_.at(0)->shape(); for (auto i = 0; i < num_axes_; i++) { int axis = axes_[i]; @@ -166,13 +164,23 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() { } float16_t *buffer = reinterpret_cast(context_->allocator->Malloc(size * sizeof(float16_t))); if (buffer == nullptr) { - MS_LOG(ERROR) << "Malloc data failed."; + MS_LOG(ERROR) << "Malloc data failed"; return RET_ERROR; } data_buffers_.emplace_back(buffer); input_shape[axis] = 1; } - return ret; + + auto in_tensor = in_tensors_.front(); + if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { + fp16_input_ = + reinterpret_cast(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); + if (fp16_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc data failed"; + return RET_ERROR; + } + } + return RET_OK; } kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector &inputs, @@ -235,6 +243,6 @@ kernel::LiteKernel *CpuMeanFp16KernelCreator(const std::vectorallocator->Free(input_ptr); } return RET_OK; -} // namespace mindspore::kernel +} } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc index 6cc6137d91..ccd9103762 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc @@ -140,5 +140,4 @@ kernel::LiteKernel *CpuSplitFp16KernelCreator(const std::vectorout_strides_[i] = out_shape[i + 1] * param->out_strides_[i + 1]; } - if (fp16_in_data_ != nullptr) { - context_->allocator->Free(fp16_in_data_); - fp16_in_data_ = nullptr; + FreeFp16Buffer(); + auto ret = MallocFp16Buffer(); + if (ret != RET_OK) { + FreeFp16Buffer(); + return ret; } + return RET_OK; +} + +int TransposeFp16CPUKernel::MallocFp16Buffer() { + auto &in_tensor = in_tensors_.front(); + auto &out_tensor = out_tensors_.front(); + if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { fp16_in_data_ = reinterpret_cast(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum())); @@ -71,11 +80,6 @@ int TransposeFp16CPUKernel::ReSize() { return RET_ERROR; } } - - if (fp16_out_data_ != nullptr) { - context_->allocator->Free(fp16_out_data_); - fp16_out_data_ = nullptr; - } if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { fp16_out_data_ = reinterpret_cast(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum())); @@ -87,6 +91,24 @@ int TransposeFp16CPUKernel::ReSize() { return RET_OK; } +void TransposeFp16CPUKernel::FreeFp16Buffer() { + auto &in_tensor = in_tensors_.front(); + auto &out_tensor = out_tensors_.front(); + + if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { + if (fp16_in_data_ != nullptr) { + context_->allocator->Free(fp16_in_data_); + fp16_in_data_ = nullptr; + } + } + if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { + if (fp16_out_data_ != nullptr) { + context_->allocator->Free(fp16_out_data_); + fp16_out_data_ = nullptr; + } + } +} + int TransposeFp16CPUKernel::TransposeParallel(int task_id) { int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_); if (num_unit_thread <= 0) { @@ -95,13 +117,6 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { int thread_offset = task_id * thread_h_stride_; TransposeParameter *param = reinterpret_cast(this->op_parameter_); - if (in_tensors_.at(0)->data_type() == kNumberTypeFloat16) { - fp16_in_data_ = reinterpret_cast(in_tensors_.at(0)->Data()); - } - if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) { - fp16_out_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); - } - auto ret = DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset, thread_offset + num_unit_thread); if (ret != RET_OK) { @@ -109,12 +124,6 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { return RET_ERROR; } - if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32 || in_tensors_.at(0)->data_type() == kNumberTypeFloat) { - context_->allocator->Free(fp16_in_data_); - } - if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { - context_->allocator->Free(fp16_out_data_); - } return RET_OK; } @@ -139,7 +148,8 @@ int TransposeFp16CPUKernel::Run() { auto &in_tensor = in_tensors_.front(); auto &out_tensor = out_tensors_.front(); if (in_tensor == nullptr || out_tensor == nullptr) { - MS_LOG(ERROR) << "null pointer dreferencing."; + MS_LOG(ERROR) << "null pointer referencing."; + FreeFp16Buffer(); return RET_ERROR; } @@ -159,23 +169,15 @@ int TransposeFp16CPUKernel::Run() { ret = LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; + FreeFp16Buffer(); return ret; } - if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { - context_->allocator->Free(fp16_in_data_); - fp16_in_data_ = nullptr; - } if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { out_data_ = reinterpret_cast(out_tensor->Data()); - if (out_data_ == nullptr) { - return RET_ERROR; - } Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum()); - - context_->allocator->Free(fp16_out_data_); - fp16_out_data_ = nullptr; } + FreeFp16Buffer(); return ret; } @@ -206,5 +208,5 @@ kernel::LiteKernel *CpuTransposeFp16KernelCreator(const std::vector