diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
index 95fd61b57b..5bea611e92 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@@ -93,6 +93,18 @@ int ConvolutionBaseCPUKernel::Init() {
   return RET_OK;
 }
 
+int ConvolutionBaseCPUKernel::CheckResizeValid() {
+  // ===============check in channel================= //
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto filter_in_channel = filter_tensor->Channel();
+  int resize_in_channel = in_tensors_.at(kInputIndex)->Channel();
+  if (filter_in_channel != resize_in_channel) {
+    MS_LOG(ERROR) << "Channel of resized input should be equal to in channel of filter.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) {
   auto data_type = input_tensor->data_type();
   auto input_format = input_tensor->GetFormat();
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
index 72969ef360..1cf688f432 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
@@ -58,6 +58,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
   int SetFilterTensorQuantParam();
   int SetOutputTensorQuantParam();
   int SetQuantMultiplier();
+  int CheckResizeValid();
   void FreeQuantParam();
 
  protected:
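Note: the new `CheckResizeValid` guards every derived kernel's `ReSize` path. A resized input may legally change batch and spatial dimensions, but its channel count must still match the filter's input channel. A minimal sketch of that contract, with a hypothetical `TensorDesc` standing in for `lite::tensor::Tensor`:

```cpp
#include <iostream>

// Hypothetical stand-in for lite::tensor::Tensor: NHWC activations, OHWI filters.
struct TensorDesc {
  int batch, height, width, channel;
  int Channel() const { return channel; }
};

constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;

// Mirrors ConvolutionBaseCPUKernel::CheckResizeValid: only the channel axis
// is pinned by the filter; batch and spatial dims are free to change.
int CheckResizeValid(const TensorDesc &filter, const TensorDesc &resized_input) {
  if (filter.Channel() != resized_input.Channel()) {
    std::cerr << "Channel of resized input should be equal to in channel of filter.\n";
    return RET_ERROR;
  }
  return RET_OK;
}

int main() {
  TensorDesc filter{16, 3, 3, 8};        // OHWI: 16 out channels, 3x3 kernel, 8 in channels
  TensorDesc ok_input{1, 224, 224, 8};   // new H/W is fine
  TensorDesc bad_input{1, 224, 224, 3};  // channel mismatch must be rejected
  std::cout << CheckResizeValid(filter, ok_input) << " " << CheckResizeValid(filter, bad_input) << "\n";
}
```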
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index 687d6273f2..88fc177160 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -50,11 +50,14 @@ void ProcessFilterFp16(float16_t *origin_weight, float16_t *dst_weight, ConvPara
 }
 
 int Convolution3x3FP16CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  int output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC8 = UP_DIV(input_channel, C8NUM);
   int oC8 = UP_DIV(output_channel, C8NUM);
-  // init weight
+  // ===========================init weight========================== //
   size_t transformed_size = iC8 * C8NUM * oC8 * C8NUM * 36 * sizeof(float16_t);
   transformed_filter_addr_ = reinterpret_cast<float16_t *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
@@ -69,7 +72,7 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
   }
   ProcessFilterFp16(execute_weight_, transformed_filter_addr_, conv_param_);
 
-  // init bias
+  // =============================init bias========================= //
   size_t new_bias_size = oC8 * C8NUM * sizeof(float16_t);
   bias_data_ = malloc(new_bias_size);
   if (bias_data_ == nullptr) {
@@ -92,55 +95,32 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
 int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   const int tile_num = 16;
   const int k_plane = 36;
-  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
-
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
-  tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
-
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t);
-  block_unit_buffer_ = reinterpret_cast<float16_t *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(block_unit_buffer_, 0, block_unit_buffer_size);
 
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * tile_num * k_plane * oC8 * C8NUM * sizeof(float16_t);
-  tmp_dst_buffer_ = reinterpret_cast<float16_t *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
 
   /*=============================tmp_out_============================*/
   int new_out_plane = UP_DIV(conv_param_->output_h_, C4NUM) * UP_DIV(conv_param_->output_w_, C4NUM) * C4NUM * C4NUM;
   size_t tmp_out_size = oC8 * C8NUM * conv_param_->output_batch_ * new_out_plane * sizeof(float16_t);
-  tmp_out_ = reinterpret_cast<float16_t *>(malloc(tmp_out_size));
+  tmp_out_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_out_size));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_out_, 0, tmp_out_size);
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc8_input_size =
-    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }
 
@@ -160,12 +140,22 @@ int Convolution3x3FP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int Convolution3x3FP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
+  FreeTmpBuffer();
   if (tile_buffer_ != nullptr) {
     free(tile_buffer_);
     tile_buffer_ = nullptr;
@@ -174,21 +164,35 @@ int Convolution3x3FP16CPUKernel::ReSize() {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return ret;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
+  const int tile_num = 16;
+  const int k_plane = 36;
+  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc8_input_size =
+    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
     return RET_ERROR;
   }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(nhwc4_input_, 0, nhwc8_input_size);
+
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
+  tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
     return RET_ERROR;
   }
+  memset(tile_buffer_, 0, tile_buffer_size);
+
   return RET_OK;
 }
 
@@ -220,6 +224,11 @@ int Convolution3x3FP16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get execute tensor failed.";
     return ret;
   }
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;
@@ -229,6 +238,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 fp16 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
 
@@ -248,6 +258,7 @@ int Convolution3x3FP16CPUKernel::Run() {
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
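Note: this is the recurring shape of the whole PR. Shape-independent work (weights, bias) moves to `Init`, shape-dependent `malloc` buffers move to `ReSize`, and per-invocation scratch moves to the context's memory-pool allocator, acquired at the top of `Run` and released on every exit path. A hedged sketch of that pattern, with a hypothetical `Allocator` in place of `ctx_->allocator`:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical pool allocator interface; the real ctx_->allocator recycles
// blocks across Run calls instead of hitting the system heap each time.
struct Allocator {
  void *Malloc(size_t size) { return std::malloc(size); }
  void Free(void *ptr) { std::free(ptr); }
};

struct KernelSketch {
  Allocator *allocator = nullptr;
  float *scratch = nullptr;

  int InitTmpBuffer(size_t n) {  // called at the top of every Run
    scratch = static_cast<float *>(allocator->Malloc(n * sizeof(float)));
    return scratch != nullptr ? 0 : -1;
  }
  void FreeTmpBuffer() {         // called on both error and success paths
    if (scratch != nullptr) {
      allocator->Free(scratch);
      scratch = nullptr;
    }
  }
  int Run(size_t n) {
    if (InitTmpBuffer(n) != 0) return -1;
    bool ok = true;  // ... launch the parallel compute here ...
    if (!ok) {
      FreeTmpBuffer();  // release scratch on the error path too
      return -1;
    }
    FreeTmpBuffer();
    return 0;
  }
};

int main() {
  Allocator alloc;
  KernelSketch k{&alloc};
  std::printf("%d\n", k.Run(1024));
}
```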
MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; return RET_ERROR; } - ret = InitTmpBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init tmp buffer failed."; + memset(nhwc4_input_, 0, nhwc8_input_size); + + /*=============================tile_buffer_============================*/ + size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t); + tile_buffer_ = reinterpret_cast(malloc(tile_buffer_size)); + if (tile_buffer_ == nullptr) { + MS_LOG(ERROR) << "malloc tile_buffer_ failed."; return RET_ERROR; } + memset(tile_buffer_, 0, tile_buffer_size); + return RET_OK; } @@ -220,6 +224,11 @@ int Convolution3x3FP16CPUKernel::Run() { MS_LOG(ERROR) << "Get execute tensor failed."; return ret; } + ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer failed."; + return RET_ERROR; + } int in_batch = conv_param_->input_batch_; int in_h = conv_param_->input_h_; int in_w = conv_param_->input_w_; @@ -229,6 +238,7 @@ int Convolution3x3FP16CPUKernel::Run() { int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 fp16 error error_code[" << error_code << "]"; + FreeTmpBuffer(); return RET_ERROR; } @@ -248,6 +258,7 @@ int Convolution3x3FP16CPUKernel::Run() { ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); + FreeTmpBuffer(); return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h index fcb89b45dc..d5584bfdcd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h @@ -30,23 +30,11 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { const std::vector &outputs, const Context *ctx, const mindspore::lite::PrimitiveC *primitive) : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~Convolution3x3FP16CPUKernel() override { FreeTmpBuffer(); } - - int Init() override; - int ReSize() override; - int Run() override; - int RunImpl(int task_id); - int InitWeightBias(); - int InitTmpBuffer(); - void ConfigInputOutput(); - - private: - void FreeTmpBuffer() { + ~Convolution3x3FP16CPUKernel() override { if (fp16_weight_ != nullptr) { free(fp16_weight_); fp16_weight_ = nullptr; } - if (transformed_filter_addr_ != nullptr) { free(transformed_filter_addr_); transformed_filter_addr_ = nullptr; @@ -55,16 +43,28 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { free(tile_buffer_); tile_buffer_ = nullptr; } + } + + int Init() override; + int ReSize() override; + int Run() override; + int RunImpl(int task_id); + int InitWeightBias(); + int InitTmpBuffer(); + void ConfigInputOutput(); + + private: + void FreeTmpBuffer() { if (block_unit_buffer_ != nullptr) { - free(block_unit_buffer_); + ctx_->allocator->Free(block_unit_buffer_); block_unit_buffer_ = nullptr; } if (tmp_dst_buffer_ != nullptr) { - free(tmp_dst_buffer_); + ctx_->allocator->Free(tmp_dst_buffer_); tmp_dst_buffer_ = nullptr; } if (tmp_out_ != nullptr) { - free(tmp_out_); + ctx_->allocator->Free(tmp_out_); tmp_out_ = nullptr; } } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index 097fc30091..43e1d1db58 100644 --- 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
index f33ca238fe..bb18df3494 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
@@ -59,16 +59,19 @@ int ConvolutionSWFP16CPUKernel::ProcessFilter() {
 }
 
 int ConvolutionSWFP16CPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
   int oc4 = UP_DIV(out_channel, C4NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
 
-  // init weight
+  // ========================init weight==================== //
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_weight_ failed.";
@@ -81,7 +84,7 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
     return ret;
   }
 
-  // init bias
+  // =======================init bias====================== //
   bias_data_ = malloc(oc4 * C4NUM * sizeof(float16_t));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias_data_ failed.";
@@ -101,29 +104,16 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
 }
 
 int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
   int out_channel = conv_param_->output_channel_;
-  int channel_block = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(out_channel, C4NUM);
 
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
-                            conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
-
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float16_t *>(malloc(conv_param_->output_batch_ * conv_param_->output_h_ *
-                                                           conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
+  tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
     return RET_ERROR;
   }
-
   return RET_OK;
 }
 
@@ -142,32 +132,44 @@ int ConvolutionSWFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionSWFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
+  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
     return ret;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
+                            conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
     return RET_ERROR;
   }
-  ConfigInputOutput();
+  memset(nhwc4_input_, 0, nhwc4_input_size);
 
   // init sliding window param
   slidingWindow_param_ = new SlidingWindowParam;
@@ -202,6 +204,11 @@ int ConvolutionSWFP16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
 
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;
@@ -212,6 +219,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(ConvolutionSWFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
 
@@ -224,6 +232,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
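Note: every `InitWeightBias` now reads the convolution geometry from the filter tensor itself rather than trusting `conv_param_`, which can be stale after a resize. For an OHWI-layout filter, `Batch()` is the output channel count and `Channel()` the input channel count. A small illustration with a hypothetical shape array:

```cpp
#include <cstdio>

// OHWI filter shape: [out_channels, kernel_h, kernel_w, in_channels].
struct FilterTensor {
  int shape[4];
  int Batch() const { return shape[0]; }    // out_channel
  int Height() const { return shape[1]; }   // kernel_h
  int Width() const { return shape[2]; }    // kernel_w
  int Channel() const { return shape[3]; }  // in_channel
};

int main() {
  FilterTensor filter{{32, 3, 3, 16}};
  // These are the values the kernels now write back into conv_param_:
  std::printf("in_channel=%d out_channel=%d kernel=%dx%d\n",
              filter.Channel(), filter.Batch(), filter.Height(), filter.Width());
}
```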
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
index fa59cf517c..45133786de 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
@@ -28,7 +28,16 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                              const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                              const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionSWFP16CPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionSWFP16CPUKernel() override {
+    if (fp16_weight_ != nullptr) {
+      free(fp16_weight_);
+      fp16_weight_ = nullptr;
+    }
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
@@ -41,16 +50,8 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (fp16_weight_ != nullptr) {
-      free(fp16_weight_);
-      fp16_weight_ = nullptr;
-    }
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
      tmp_output_block_ = nullptr;
     }
     if (slidingWindow_param_ != nullptr) {
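Note: the header changes draw an ownership line. Buffers that live for the kernel's lifetime (packed weights, fp16 weight copies) are now freed in the destructor with `free`, while `FreeTmpBuffer` only returns `Run`-scoped scratch to the pool. A sketch of the rule of thumb, with a hypothetical `Allocator` stand-in:

```cpp
#include <cstdlib>

// Hypothetical pool allocator stand-in for ctx_->allocator.
struct Allocator {
  void *Malloc(size_t size) { return std::malloc(size); }
  void Free(void *ptr) { std::free(ptr); }
};

// Ownership sketch: lifetime buffers die with the kernel,
// Run-scoped scratch goes back to the pool in FreeTmpBuffer.
struct SWKernelSketch {
  Allocator *allocator_ = nullptr;
  void *packed_weight_ = nullptr;     // malloc'd once in Init
  void *tmp_output_block_ = nullptr;  // pool-allocated per Run

  ~SWKernelSketch() {  // kernel lifetime scope
    if (packed_weight_ != nullptr) {
      free(packed_weight_);
      packed_weight_ = nullptr;
    }
  }
  void FreeTmpBuffer() {  // Run scope only
    if (tmp_output_block_ != nullptr) {
      allocator_->Free(tmp_output_block_);
      tmp_output_block_ = nullptr;
    }
  }
};

int main() {
  Allocator pool;
  SWKernelSketch k;
  k.allocator_ = &pool;
  k.packed_weight_ = std::malloc(64);     // Init
  k.tmp_output_block_ = pool.Malloc(64);  // Run: InitTmpBuffer
  k.FreeTmpBuffer();                      // Run: done
}                                         // destructor frees the weights
```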
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index e16b5b26c5..f04a77fc0b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -110,10 +110,15 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei
 }
 
 int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
-  int output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
+
   int oc_block, oc_block_num;
   oc_block = C8NUM;
-  oc_block_num = UP_DIV(output_channel, C8NUM);
+  oc_block_num = UP_DIV(out_channel, C8NUM);
 
   // init weight
   auto ret = MallocFilterMatrix(oc_block, oc_block_num);
@@ -139,7 +144,7 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
   auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < output_channel; ++i) {
+    for (int i = 0; i < out_channel; ++i) {
       fp16_bias_data[i] = (float16_t)ori_bias[i];
     }
   } else {
@@ -188,25 +193,14 @@ int ConvolutionWinogradFP16CPUKernel::MallocFilterMatrix(int oc_block, int oc_bl
 
 int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int cal_num = 16;
-  int channel_in = conv_param_->input_channel_;
   int channel_out = conv_param_->output_channel_;
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
-  int ic8 = UP_DIV(channel_in, C8NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
 
-  /*=============================trans_input_============================*/
-  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
-  trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
-  if (trans_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc trans_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(trans_input_, 0, tile_buffer_size);
-
   /*=============================gemm_out_============================*/
   gemm_out_ = reinterpret_cast<float16_t *>(
-    malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
+    ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;
@@ -215,36 +209,26 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
-  tmp_out_data_ = reinterpret_cast<float16_t *>(malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
-                                                       output_unit_ * output_unit_ * oc8 * C8NUM * sizeof(float16_t)));
+  tmp_out_data_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(conv_param_->output_batch_ * out_w_block * out_h_block * output_unit_ * output_unit_ * oc8 *
+                            C8NUM * sizeof(float16_t)));
   if (tmp_out_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
 
   /*=============================tmp_data_============================*/
-  tmp_data_ =
-    reinterpret_cast<float16_t *>(malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
+  tmp_data_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
   if (tmp_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_data_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_data_, 0, C8NUM * input_unit_ * input_unit_ * sizeof(float16_t));
 
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;
   tmp_buffer_address_list_[2] = tmp_out_data_;
   tmp_buffer_address_list_[3] = tmp_data_;
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc8_input_size =
-    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }
 
@@ -270,17 +254,37 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  kernel_unit_ = conv_param_->kernel_h_;
+  input_unit_ = output_unit_ + kernel_unit_ - 1;
+  conv_param_->input_unit_ = input_unit_;
+  conv_param_->output_unit_ = output_unit_;
+
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int ConvolutionWinogradFP16CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (trans_input_ != nullptr) {
+    free(trans_input_);
+    trans_input_ = nullptr;
+  }
 
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
@@ -290,17 +294,28 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;
 
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
+  int cal_num = 16;
+  int channel_in = conv_param_->input_channel_;
+  int ic8 = UP_DIV(channel_in, C8NUM);
+
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc8_input_size =
+    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
    return RET_ERROR;
   }
-  // malloc tmp buffer
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(nhwc4_input_, 0, nhwc8_input_size);
+
+  /*=============================trans_input_============================*/
+  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
+  trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
+  if (trans_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc trans_input_ failed.";
     return RET_ERROR;
   }
+  memset(trans_input_, 0, tile_buffer_size);
+
   ret = ConfigInputOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConfigInputOutput failed.";
@@ -339,6 +354,12 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     return ret;
   }
 
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
+
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;
@@ -348,6 +369,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
 
@@ -364,6 +386,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
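Note: the Winograd setup that moved into `Init` computes `input_unit_ = output_unit_ + kernel_unit_ - 1`, which is the standard F(m, r) tile relation: producing an m-by-m output tile from an r-by-r kernel needs an (m + r - 1) squared input tile. For example F(4x4, 3x3) gives input_unit = 6, matching the 36-element transformed filter blocks in the 3x3 fp16 kernel above:

```cpp
#include <cstdio>

// F(m, r): m = output_unit, r = kernel_unit, input tile edge = m + r - 1.
int InputUnit(int output_unit, int kernel_unit) { return output_unit + kernel_unit - 1; }

int main() {
  int m = 4, r = 3;
  int t = InputUnit(m, r);
  std::printf("F(%dx%d, %dx%d): input_unit=%d, tile elements=%d\n", m, m, r, r, t, t * t);
  // Prints: F(4x4, 3x3): input_unit=6, tile elements=36
}
```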
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index 13fdf8de8f..a022afcee6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -33,7 +33,20 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                                    const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                    const mindspore::lite::PrimitiveC *primitive, int out_unit)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(out_unit) {}
-  ~ConvolutionWinogradFP16CPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionWinogradFP16CPUKernel() override {
+    if (fp16_weight_ != nullptr) {
+      free(fp16_weight_);
+      fp16_weight_ = nullptr;
+    }
+    if (trans_input_ != nullptr) {
+      free(trans_input_);
+      trans_input_ = nullptr;
+    }
+    if (trans_weight_ != nullptr) {
+      delete trans_weight_;
+      trans_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
@@ -46,30 +59,18 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (fp16_weight_ != nullptr) {
-      free(fp16_weight_);
-      fp16_weight_ = nullptr;
-    }
     if (tmp_data_ != nullptr) {
-      free(tmp_data_);
+      ctx_->allocator->Free(tmp_data_);
       tmp_data_ = nullptr;
     }
-    if (trans_input_ != nullptr) {
-      free(trans_input_);
-      trans_input_ = nullptr;
-    }
     if (gemm_out_ != nullptr) {
-      free(gemm_out_);
+      ctx_->allocator->Free(gemm_out_);
       gemm_out_ = nullptr;
     }
     if (tmp_out_data_ != nullptr) {
-      free(tmp_out_data_);
+      ctx_->allocator->Free(tmp_out_data_);
       tmp_out_data_ = nullptr;
     }
-    if (trans_weight_ != nullptr) {
-      delete trans_weight_;
-      trans_weight_ = nullptr;
-    }
   }
   int kernel_unit_;
   int input_unit_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
index 3e68a41149..cd0a700d56 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -35,10 +35,13 @@ using mindspore::schema::PrimitiveType_Conv2D;
 
 namespace mindspore::kernel {
 int ConvolutionCPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int oc_block, oc_block_num;
@@ -52,7 +55,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;
 
   // =====================init weight==========================//
-  auto origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data());
+  auto origin_weight = reinterpret_cast<float *>(filter_tensor->Data());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed weight failed.";
@@ -67,7 +70,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "malloc bias failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
+
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
     memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
@@ -78,39 +81,11 @@ int ConvolutionCPUKernel::InitWeightBias() {
 }
 
 int ConvolutionCPUKernel::InitTmpBuffer() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_batch = conv_param_->input_batch_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
   int out_channel = conv_param_->output_channel_;
-  int kernel_plane = kernel_h * kernel_w;
-
-  // malloc packed_inputs
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
-  int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
-
-  /*=============================packed_input============================*/
-  packed_input_ = reinterpret_cast<float *>(malloc(in_batch * packed_input_size * sizeof(float)));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed input failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float));
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float *>(malloc(TILE_NUM * out_channel * sizeof(float)));
+  tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(TILE_NUM * out_channel * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp output block failed.";
     return RET_ERROR;
@@ -134,34 +109,59 @@ int ConvolutionCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionCPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
+
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
     return RET_ERROR;
   }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+
+  /*=============================packed_input============================*/
+  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
+  int output_tile_count = UP_DIV(output_count, TILE_NUM);
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
+  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
+  packed_input_ = reinterpret_cast<float *>(malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float)));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
   }
-  // config input output
-  ConfigInputOutput();
+  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size * sizeof(float));
   return RET_OK;
 }
 
@@ -192,19 +192,25 @@ int ConvolutionCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  // ============Init buffer using memory pool allocator=============//
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
+
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = LiteBackendParallelLaunch(ConvolutionImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
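Note: the buffer sizes in `ReSize` all hinge on `UP_DIV`, the round-up division used to pad channels to C4/C8 lanes and outputs to TILE_NUM blocks. A worked run of the `packed_input_` size computation above, with toy numbers; the TILE_NUM value here is assumed purely for illustration:

```cpp
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // round-up division, as in the kernels

int main() {
  const int C4NUM = 4, TILE_NUM = 8;  // TILE_NUM value assumed for illustration
  int in_channel = 6, kernel_h = 3, kernel_w = 3;
  int output_h = 16, output_w = 16, in_batch = 1;

  int ic4 = UP_DIV(in_channel, C4NUM);                     // 6 channels -> 2 blocks of 4
  int output_count = output_h * output_w;                  // 256 output pixels
  int output_tile_count = UP_DIV(output_count, TILE_NUM);  // 256 / 8 = 32 tiles
  int unit_size = kernel_h * kernel_w * ic4 * C4NUM;       // 9 * 8 = 72
  int packed_input_size = output_tile_count * TILE_NUM * unit_size;

  std::printf("ic4=%d tiles=%d unit=%d packed=%d floats\n",
              ic4, output_tile_count, unit_size, in_batch * packed_input_size);
}
```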
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
index d59c90dee0..b53dd14dac 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
@@ -30,7 +30,16 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
                        const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                        const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionCPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionCPUKernel() override {
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
@@ -42,18 +51,10 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
   }
   float *packed_input_ = nullptr;
   float *packed_weight_ = nullptr;
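Note: `PackNHWCToNHWC4Fp32`, now called with the `conv_param_` fields inlined, pads the channel axis up to a multiple of 4 so the SIMD kernels can load whole lanes. A hedged sketch of the layout transform it performs, zero-filling the padded tail (not the library implementation, just the call's observable shape):

```cpp
#include <cstdio>
#include <cstring>
#include <vector>

// Sketch of NHWC -> NHWC4: channels padded to a multiple of 4, tail zeroed.
// Matches the call shape PackNHWCToNHWC4Fp32(src, dst, batch, plane, channel).
void PackNHWCToNHWC4(const float *src, float *dst, int batch, int plane, int channel) {
  int c4 = (channel + 3) / 4 * 4;  // padded channel count
  for (int b = 0; b < batch; ++b) {
    for (int p = 0; p < plane; ++p) {
      const float *s = src + (b * plane + p) * channel;
      float *d = dst + (b * plane + p) * c4;
      std::memcpy(d, s, channel * sizeof(float));
      std::memset(d + channel, 0, (c4 - channel) * sizeof(float));
    }
  }
}

int main() {
  std::vector<float> src = {1, 2, 3, 4, 5, 6};  // 1 batch, 2 pixels, 3 channels
  std::vector<float> dst(8, -1.f);              // 2 pixels * 4 padded channels
  PackNHWCToNHWC4(src.data(), dst.data(), 1, 2, 3);
  for (float v : dst) std::printf("%g ", v);    // 1 2 3 0 4 5 6 0
}
```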
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index 386774d009..563339bb12 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -49,8 +49,11 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
 }
 
 int Convolution3x3CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  auto output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
   int oc_block, oc_block_num;
@@ -91,56 +94,35 @@ int Convolution3x3CPUKernel::InitWeightBias() {
 }
 
 int Convolution3x3CPUKernel::InitTmpBuffer() {
-  int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   const int k_plane = 16;
-
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
-  tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile buffer failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float);
-  block_unit_buffer_ = reinterpret_cast<float *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(block_unit_buffer_, 0, block_unit_buffer_size);
 
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
-  tmp_dst_buffer_ = reinterpret_cast<float *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
 
   /*=============================nc4hw4_out_============================*/
   size_t nc4hw4_out_size =
     oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
-  nc4hw4_out_ = reinterpret_cast<float *>(malloc(nc4hw4_out_size));
+  nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
   if (nc4hw4_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc nc4hw4_out_ failed.";
     return RET_ERROR;
   }
-  memset(nc4hw4_out_, 0, nc4hw4_out_size);
+
   tmp_buffer_address_list_[0] = tile_buffer_;
   tmp_buffer_address_list_[1] = block_unit_buffer_;
   tmp_buffer_address_list_[2] = tmp_dst_buffer_;
@@ -162,28 +144,57 @@ int Convolution3x3CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.ret: " << ret;
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int Convolution3x3CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
   FreeTmpBuffer();
+  if (nhwc4_input_ != nullptr) {
+    free(nhwc4_input_);
+    nhwc4_input_ = nullptr;
+  }
+  if (tile_buffer_ != nullptr) {
+    free(tile_buffer_);
+    tile_buffer_ = nullptr;
+  }
 
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.ret: " << ret;
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.ret: " << ret;
+
+  int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc4_input_size =
+    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
    return RET_ERROR;
   }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
+  tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile buffer failed.";
     return RET_ERROR;
   }
-  ConfigInputOutput();
+  memset(tile_buffer_, 0, tile_buffer_size);
   return RET_OK;
 }
 
@@ -214,17 +225,21 @@ int Convolution3x3CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = LiteBackendParallelLaunch(Convolution3x3Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
 
@@ -241,6 +256,7 @@ int Convolution3x3CPUKernel::Run() {
     PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
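Note: every `Run` now has to call `FreeTmpBuffer()` on the parallel-launch error path as well as on success; forgetting one exit leaks pool blocks. A small RAII scope guard is one way to make that pattern mechanical. This is a suggestion sketch, not what the PR does:

```cpp
#include <cstdio>
#include <functional>

// Scope guard: runs the cleanup on every path out of the enclosing block.
struct ScopeGuard {
  std::function<void()> fn;
  ~ScopeGuard() { fn(); }
};

int Run(bool fail) {
  // Imagine InitTmpBuffer() succeeded just above this line.
  ScopeGuard guard{[] { std::puts("FreeTmpBuffer()"); }};
  if (fail) return -1;  // guard still releases the scratch
  return 0;             // ... and on the success path too
}

int main() {
  Run(true);
  Run(false);
}
```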
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index 39937118c8..c2f8747b8f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -29,8 +29,15 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
                           const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                           const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~Convolution3x3CPUKernel() override { FreeTmpBuffer(); }
-
+  ~Convolution3x3CPUKernel() override {
+    if (transformed_filter_addr_ != nullptr) {
+      free(transformed_filter_addr_);
+    }
+    if (tile_buffer_ != nullptr) {
+      free(tile_buffer_);
+      tile_buffer_ = nullptr;
+    }
+  }
   int Init() override;
   int ReSize() override;
   int Run() override;
@@ -41,24 +48,16 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (tile_buffer_ != nullptr) {
-      free(tile_buffer_);
-      tile_buffer_ = nullptr;
-    }
     if (block_unit_buffer_ != nullptr) {
-      free(block_unit_buffer_);
+      ctx_->allocator->Free(block_unit_buffer_);
       block_unit_buffer_ = nullptr;
     }
     if (tmp_dst_buffer_ != nullptr) {
-      free(tmp_dst_buffer_);
+      ctx_->allocator->Free(tmp_dst_buffer_);
       tmp_dst_buffer_ = nullptr;
     }
-    if (nhwc4_input_ != nullptr) {
-      free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (nc4hw4_out_ != nullptr) {
-      free(nc4hw4_out_);
+      ctx_->allocator->Free(nc4hw4_out_);
       nc4hw4_out_ = nullptr;
     }
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc
index ee28925426..e54377de43 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc
@@ -30,14 +30,17 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
 
 int ConvolutionSWCPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int oc_block = C4NUM;
-  int oc_block_num = UP_DIV(out_channel, C4NUM);
+  int oc_block_num = UP_DIV(output_channel, C4NUM);
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;
 
   // ==================================init weight======================================//
@@ -48,13 +51,13 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  for (int oc = 0; oc < out_channel; ++oc) {
-    int src_oc_offset = oc * kernel_h * kernel_w * in_channel;
+  for (int oc = 0; oc < output_channel; ++oc) {
+    int src_oc_offset = oc * kernel_h * kernel_w * input_channel;
     int dst_oc_offset = oc * kernel_h * kernel_w * ic4 * C4NUM;
     for (int i = 0; i < kernel_h * kernel_w; ++i) {
-      const float *src = origin_weight + src_oc_offset + i * in_channel;
+      const float *src = origin_weight + src_oc_offset + i * input_channel;
       float *dst = packed_weight_ + dst_oc_offset + i * ic4 * C4NUM;
-      memcpy(dst, src, in_channel * sizeof(float));
+      memcpy(dst, src, input_channel * sizeof(float));
     }
   }
 
@@ -67,7 +70,7 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
@@ -75,24 +78,13 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
 }
 
 int ConvolutionSWCPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
   int out_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(out_channel, C4NUM);
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float *>(
-    malloc(conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
+  tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp output block failed.";
     return RET_ERROR;
@@ -110,39 +102,49 @@ int ConvolutionSWCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  // config input output
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionSWCPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
+  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
     return RET_ERROR;
   }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+
   // init sliding window param
   slidingWindow_param_ = new SlidingWindowParam;
   InitSlidingParamConv(slidingWindow_param_, conv_param_, C4NUM);
-  // config input output
-  ConfigInputOutput();
   return RET_OK;
 }
 
@@ -169,20 +171,25 @@ int ConvolutionSWCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+
+  // init tmp input, output
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = LiteBackendParallelLaunch(ConvolutionSWImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
-  // output nhwc4
+
   auto out_tensor = out_tensors_.front();
   auto out_data = reinterpret_cast<float *>(out_tensor->Data());
   int oc4_res = conv_param_->output_channel_ % C4NUM;
@@ -190,6 +197,7 @@ int ConvolutionSWCPUKernel::Run() {
     PackNHWC4ToNHWCFp32(tmp_output_block_, out_data, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
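Note: the weight-packing loop above copies each OHWI filter row into an IC4-padded destination; the offset arithmetic is easiest to check with small numbers. A toy run of the same offsets (2 output channels, 1x1 kernel, 3 input channels padded to 4):

```cpp
#include <cstdio>
#include <cstring>

int main() {
  const int C4NUM = 4;
  int out_channel = 2, kernel_h = 1, kernel_w = 1, in_channel = 3;
  int ic4 = (in_channel + C4NUM - 1) / C4NUM;  // 1 block of 4

  float origin[6] = {1, 2, 3, 4, 5, 6};  // OHWI: oc0 = {1,2,3}, oc1 = {4,5,6}
  float packed[8] = {0};                 // 2 oc * 1 plane * ic4 * 4, zero padded

  // Same offset arithmetic as ConvolutionSWCPUKernel::InitWeightBias:
  for (int oc = 0; oc < out_channel; ++oc) {
    int src_oc_offset = oc * kernel_h * kernel_w * in_channel;
    int dst_oc_offset = oc * kernel_h * kernel_w * ic4 * C4NUM;
    for (int i = 0; i < kernel_h * kernel_w; ++i) {
      std::memcpy(packed + dst_oc_offset + i * ic4 * C4NUM,
                  origin + src_oc_offset + i * in_channel, in_channel * sizeof(float));
    }
  }
  for (float v : packed) std::printf("%g ", v);  // 1 2 3 0 4 5 6 0
}
```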
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h
index 82877c75de..199b2d5991 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h
@@ -32,7 +32,12 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
                          const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
 
-  ~ConvolutionSWCPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionSWCPUKernel() override {
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
@@ -44,12 +49,8 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
     if (slidingWindow_param_ != nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index b910c6027d..916a186e5e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -30,12 +30,12 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
                              ConvParameter *conv_param, int oc_block) {
-  // original weight format : ohwi
+  // =============original weight format : ohwi===============//
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
   int input_unit_square = input_unit * input_unit;
 
-  // generate matrix_G && matrix_GT
+  // =============generate matrix_G && matrix_GT===============//
   auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit);
   auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit);
   ChooseMatrixG(matrix_g, matrix_gt);
@@ -95,15 +95,20 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
 }
 
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
-  int output_channel = conv_param_->output_channel_;
-  int oc4 = UP_DIV(output_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
+
+  int oc4 = UP_DIV(out_channel, C4NUM);
   int oc_block, oc_block_num;
   // #ifdef ENABLE_ARM32
   //   oc_block = C4NUM;
   //   oc_block_num = UP_DIV(output_channel, C4NUM);
   // #else
   oc_block = C8NUM;
-  oc_block_num = UP_DIV(output_channel, C8NUM);
+  oc_block_num = UP_DIV(out_channel, C8NUM);
   // #endif
 
   // init weight
@@ -112,8 +117,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
-  auto weight_tensor = in_tensors_.at(kWeightIndex);
-  auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
+  auto weight_data = reinterpret_cast<float *>(filter_tensor->Data());
   WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
 
   // init bias
@@ -122,7 +126,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, new_bias_size);
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias_addr = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias_addr, output_channel * sizeof(float));
+    memcpy(bias_data_, ori_bias_addr, out_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
@@ -167,25 +171,15 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_
 }
 
 int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
-  int channel_in = conv_param_->input_channel_;
   int channel_out = conv_param_->output_channel_;
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
-  int ic4 = UP_DIV(channel_in, C4NUM);
   int oc4 = UP_DIV(channel_out, C4NUM);
-
-  /*=============================trans_input_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
-  trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
-  if (trans_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc trans_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(trans_input_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================gemm_out_============================*/
   gemm_out_ = reinterpret_cast<float *>(
-    malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
+    ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;
@@ -194,35 +188,26 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
-  tmp_out_data_ = reinterpret_cast<float *>(malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
-                                                   output_unit_ * output_unit_ * oc4 * C4NUM * sizeof(float)));
+  tmp_out_data_ =
+    reinterpret_cast<float *>(ctx_->allocator->Malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
+                                                      output_unit_ * output_unit_ * oc4 * C4NUM * sizeof(float)));
   if (tmp_out_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
 
   /*=============================tmp_data_============================*/
-  tmp_data_ = reinterpret_cast<float *>(malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
+  tmp_data_ = reinterpret_cast<float *>(
+    ctx_->allocator->Malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
   if (tmp_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_data_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_data_, 0, C4NUM * input_unit_ * input_unit_ * sizeof(float));
 
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;
   tmp_buffer_address_list_[2] = tmp_out_data_;
   tmp_buffer_address_list_[3] = tmp_data_;
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
 
@@ -253,37 +238,67 @@ int ConvolutionWinogradCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  kernel_unit_ = conv_param_->kernel_h_;
+  input_unit_ = output_unit_ + kernel_unit_ - 1;
+  conv_param_->input_unit_ = input_unit_;
+  conv_param_->output_unit_ = output_unit_;
+
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int ConvolutionWinogradCPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (trans_input_ != nullptr) {
+    free(trans_input_);
+    trans_input_ = nullptr;
+  }
 
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
+
   kernel_unit_ = conv_param_->kernel_h_;
   input_unit_ = output_unit_ + kernel_unit_ - 1;
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;
 
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
     return RET_ERROR;
   }
-  // malloc tmp buffer
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+
+  /*=============================trans_input_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
+  trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
+  if (trans_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc trans_input_ failed.";
    return RET_ERROR;
   }
+  memset(trans_input_, 0, tile_buffer_size);
+
   ret = ConfigInputOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConfigInputOutput failed.";
@@ -319,17 +334,21 @@ int ConvolutionWinogradCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  // malloc tmp buffer
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
 
@@ -346,6 +365,7 @@ int ConvolutionWinogradCPUKernel::Run() {
     UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                          conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
failed."; return RET_ERROR; } + memset(trans_input_, 0, tile_buffer_size); + ret = ConfigInputOutput(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConfigInputOutput failed."; @@ -319,17 +334,21 @@ int ConvolutionWinogradCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } + // malloc tmp buffer + auto ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer failed."; + return RET_ERROR; + } auto input_tensor = in_tensors_.at(kInputIndex); auto ori_input_data = input_tensor->Data(); - int in_batch = conv_param_->input_batch_; - int in_h = conv_param_->input_h_; - int in_w = conv_param_->input_w_; - int in_channel = conv_param_->input_channel_; - PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel); + PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; + FreeTmpBuffer(); return RET_ERROR; } @@ -346,6 +365,7 @@ int ConvolutionWinogradCPUKernel::Run() { UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } + FreeTmpBuffer(); return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h index bd5373feaa..f1989f5292 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h @@ -30,10 +30,18 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { ConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx, const mindspore::lite::PrimitiveC *primitive, int output_unit) - : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(output_unit), + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive), + output_unit_(output_unit), trans_weight_(nullptr) {} ~ConvolutionWinogradCPUKernel() override { - FreeTmpBuffer(); + if (trans_weight_ != nullptr) { + delete trans_weight_; + trans_weight_ = nullptr; + } + if (trans_input_ != nullptr) { + free(trans_input_); + trans_input_ = nullptr; + } }; int Init() override; int ReSize() override; @@ -47,25 +55,17 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { private: void FreeTmpBuffer() { if (tmp_data_ != nullptr) { - free(tmp_data_); + ctx_->allocator->Free(tmp_data_); tmp_data_ = nullptr; } - if (trans_input_ != nullptr) { - free(trans_input_); - trans_input_ = nullptr; - } if (gemm_out_ != nullptr) { - free(gemm_out_); + ctx_->allocator->Free(gemm_out_); gemm_out_ = nullptr; } if (tmp_out_data_ != nullptr) { - free(tmp_out_data_); + ctx_->allocator->Free(tmp_out_data_); tmp_out_data_ = nullptr; } - if (trans_weight_ != nullptr) { - delete trans_weight_; - trans_weight_ = nullptr; - } } int kernel_unit_; int input_unit_; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc index 4a710be1c9..db5a36b5e1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc +++ 
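Both Winograd InitWeightBias variants above now re-derive channel counts from the weight tensor instead of trusting conv_param_, so a Resize cannot leave the packed weights out of sync, and CheckResizeValid rejects inputs whose channel count no longer matches the filter. A minimal sketch of that contract, assuming an OHWI filter layout; the Tensor and ConvParameter structs below are simplified stand-ins, not the real lite classes:

#include <cassert>
#include <vector>

// Illustrative stand-in for lite::tensor::Tensor: an OHWI filter stores
// out-channels in dim 0 and in-channels in dim 3.
struct Tensor {
  std::vector<int> shape;  // filter: {out_ch, kernel_h, kernel_w, in_ch}
  int Batch() const { return shape[0]; }
  int Height() const { return shape[1]; }
  int Width() const { return shape[2]; }
  int Channel() const { return shape[3]; }
};

struct ConvParameter {
  int input_channel_;
  int output_channel_;
};

// Mirrors the InitWeightBias hunks: the filter tensor, not a possibly stale
// ConvParameter, is treated as the source of truth for channel counts.
void SyncChannelsFromFilter(const Tensor &filter, ConvParameter *conv_param) {
  conv_param->input_channel_ = filter.Channel();
  conv_param->output_channel_ = filter.Batch();
}

// Mirrors CheckResizeValid: a resized input must keep the filter's in-channel.
int CheckResizeValid(const Tensor &filter, const Tensor &resized_input) {
  return filter.Channel() == resized_input.Channel() ? 0 : -1;
}

int main() {
  Tensor filter{{16, 3, 3, 8}};  // 16 output channels, 3x3 kernel, 8 input channels
  Tensor input{{1, 224, 224, 8}};
  ConvParameter param{};
  SyncChannelsFromFilter(filter, &param);
  assert(param.input_channel_ == 8 && param.output_channel_ == 16);
  assert(CheckResizeValid(filter, input) == 0);
  return 0;
}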
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc
index 4a710be1c9..db5a36b5e1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc
@@ -44,6 +44,21 @@ void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParameter
 }
 
 void Convolution3x3Int8CPUKernel::FreeTmpBuffer() {
+  if (block_unit_buffer_ != nullptr) {
+    ctx_->allocator->Free(block_unit_buffer_);
+    block_unit_buffer_ = nullptr;
+  }
+  if (tmp_dst_buffer_ != nullptr) {
+    ctx_->allocator->Free(tmp_dst_buffer_);
+    tmp_dst_buffer_ = nullptr;
+  }
+  if (tmp_out_ != nullptr) {
+    ctx_->allocator->Free(tmp_out_);
+    tmp_out_ = nullptr;
+  }
+}
+
+Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() {
   if (transformed_filter_addr_ != nullptr) {
     free(transformed_filter_addr_);
     transformed_filter_addr_ = nullptr;
@@ -56,26 +71,15 @@ void Convolution3x3Int8CPUKernel::FreeTmpBuffer() {
     free(tile_buffer_);
     tile_buffer_ = nullptr;
   }
-  if (block_unit_buffer_ != nullptr) {
-    free(block_unit_buffer_);
-    block_unit_buffer_ = nullptr;
-  }
-  if (tmp_dst_buffer_ != nullptr) {
-    free(tmp_dst_buffer_);
-    tmp_dst_buffer_ = nullptr;
-  }
-  if (tmp_out_ != nullptr) {
-    free(tmp_out_);
-    tmp_out_ = nullptr;
-  }
   FreeQuantParam();
 }
 
-Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() { FreeTmpBuffer(); }
-
 int Convolution3x3Int8CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  auto output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC8 = UP_DIV(input_channel, C8NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
   // init weight
@@ -107,59 +111,35 @@ int Convolution3x3Int8CPUKernel::InitWeightBias() {
 }
 
 int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
-  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int in_batch = conv_param_->input_batch_;
-  int input_w = conv_param_->input_w_;
-  int input_h = conv_param_->input_h_;
   int output_batch = conv_param_->output_batch_;
   int output_w = conv_param_->output_w_;
   int output_h = conv_param_->output_h_;
-
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * 16 * ic8 * C8NUM * sizeof(int16_t);
-  tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
 
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * 4 * 4 * C8NUM * sizeof(int16_t);
-  block_unit_buffer_ = reinterpret_cast<int16_t *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(block_unit_buffer_, 0, block_unit_buffer_size);
 
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t);
-  tmp_dst_buffer_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
 
   /*=============================tmp_out_============================*/
   size_t tmp_out_size = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t);
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(tmp_out_size));
+  tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_out_size));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_out_, 0, tmp_out_size);
-
-  /*=============================input_data_============================*/
-  size_t c8_input_size = in_batch * input_h * input_w * ic8 * C8NUM * sizeof(int16_t);
-  input_data_ = reinterpret_cast<int16_t *>(malloc(c8_input_size));
-  if (input_data_ == nullptr) {
-    MS_LOG(ERROR) << "malloc input_data_ failed.";
-    return RET_ERROR;
-  }
-  memset(input_data_, 0, c8_input_size);
   return RET_OK;
 }
 
@@ -172,35 +152,63 @@ int Convolution3x3Int8CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = SetQuantParam();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Set quant param failed.";
+    return ret;
+  }
+  ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  // config input output
+  ConfigInputOutput();
   return ReSize();
 }
 
 int Convolution3x3Int8CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+
+  FreeTmpBuffer();
+  if (input_data_ != nullptr) {
+    free(input_data_);
+    input_data_ = nullptr;
+  }
+  if (tile_buffer_ != nullptr) {
+    free(tile_buffer_);
+    tile_buffer_ = nullptr;
+  }
 
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = SetQuantParam();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Set quant param failed.";
-    return ret;
-  }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
+
+  /*=============================input_data_============================*/
+  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+  size_t c8_input_size =
+    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t);
+  input_data_ = reinterpret_cast<int16_t *>(malloc(c8_input_size));
+  if (input_data_ == nullptr) {
+    MS_LOG(ERROR) << "malloc input_data_ failed.";
     return RET_ERROR;
   }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(input_data_, 0, c8_input_size);
+
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t);
+  tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
     return RET_ERROR;
   }
-  // config input output
-  ConfigInputOutput();
+  memset(tile_buffer_, 0, tile_buffer_size);
   return RET_OK;
 }
 
@@ -227,12 +235,19 @@ int Convolution3x3Int8CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare failed.";
     return RET_ERROR;
   }
+  // malloc tmp buffer
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_addr = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->Data());
   PackInputToC8Int8(input_addr, input_data_, conv_param_);
 
   int error_code = LiteBackendParallelLaunch(Convolution3x3Int8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 int8 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
   // get real output
@@ -240,6 +255,7 @@ int Convolution3x3Int8CPUKernel::Run() {
   auto out_tensor = out_tensors_.at(kOutputIndex);
   auto out_data = reinterpret_cast<int8_t *>(out_tensor->Data());
   PackNC4HW4ToNHWCInt8(tmp_out_, out_data, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
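The buffer sizes in the int8 3x3 hunks above all round channel counts up to SIMD block multiples before multiplying out the dimensions (ic8 = UP_DIV(c, C8NUM) blocks of C8NUM lanes each). A worked example of that sizing, assuming the usual nnacl-style macro definitions; TILE_NUM is set to 8 here purely for the demo, the real kernel defines its own tile size:

#include <cstdint>
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))  // ceiling division, as in nnacl
#define C8NUM 8
#define C16NUM 16
#define TILE_NUM 8  // demo value only

int main() {
  int input_channel = 21;  // deliberately not a multiple of 8
  int thread_count = 2;
  int ic8 = UP_DIV(input_channel, C8NUM);  // 3 blocks -> channels padded to 24 lanes
  size_t tile_buffer_size =
    static_cast<size_t>(thread_count) * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t);
  printf("ic8=%d, padded channels=%d, tile_buffer=%zu bytes\n", ic8, ic8 * C8NUM, tile_buffer_size);
  return 0;
}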
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
index e90d6512f5..7e0f42a58a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -60,12 +60,15 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() {
 }
 
 int ConvolutionInt8CPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int out_channel = conv_param_->output_channel_;
-  int oc4 = UP_DIV(out_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
+  int oc4 = UP_DIV(output_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * plane_c4 * C4NUM;
@@ -80,8 +83,8 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size);
-  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * out_channel));
-  for (int i = 0; i < out_channel; i++) weight_sum[i] = 0;
+  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel));
+  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
   PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);
 
   // init bias
@@ -93,42 +96,22 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(int32_t));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
   auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
   int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM;
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   } else {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   }
   free(weight_sum);
-  return RET_OK;
-}
-
-int ConvolutionInt8CPUKernel::InitTmpBuffer() {
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
-  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-
-  /*=============================packed_input_============================*/
-  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
 
   /*=============================input_sum_============================*/
   size_t input_sum_size;
@@ -137,47 +120,45 @@
   } else {
     input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
   }
-  input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size));
+  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size));
   if (input_sum_ == nullptr) {
     MS_LOG(ERROR) << "malloc input_sum_ failed.";
     return RET_ERROR;
   }
   memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
+  return RET_OK;
+}
 
+int ConvolutionInt8CPUKernel::InitTmpBuffer() {
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_dst_============================*/
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
-  tmp_dst_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_size));
+  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_dst_, 0, tmp_dst_size);
 
   /*=============================tmp_out_============================*/
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
+  tmp_out_ =
+    reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
 
 int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int out_channel = conv_param_->output_channel_;
-  int oc4 = UP_DIV(out_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
+  int oc4 = UP_DIV(output_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
   auto filter_arg = conv_param_->conv_quant_arg_.filter_quant_args_;
@@ -191,8 +172,8 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size);
-  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * out_channel));
-  for (int i = 0; i < out_channel; i++) weight_sum[i] = 0;
+  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel));
+  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
   PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum);
 
   // init bias
@@ -204,41 +185,22 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
   memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(int32_t));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
   auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
   int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM;
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   } else {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   }
   free(weight_sum);
-  return RET_OK;
-}
-
-int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-
-  /*=============================packed_input_============================*/
-  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
 
   /*=============================input_sum_============================*/
   size_t input_sum_size;
@@ -253,31 +215,26 @@
     return RET_ERROR;
   }
   memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
+  return RET_OK;
+}
 
+int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_dst_============================*/
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
-  tmp_dst_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_size));
+  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_dst_, 0, tmp_dst_size);
 
   /*=============================tmp_out_============================*/
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
+  tmp_out_ =
    reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
 
@@ -296,62 +253,79 @@ int ConvolutionInt8CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  return ReSize();
-}
-
-int ConvolutionInt8CPUKernel::InitOpt() {
-  auto ret = InitWeightBiasOpt();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // init tmp input, output
-  ret = InitTmpBufferOpt();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int ConvolutionInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
-
-  auto ret = ConvolutionBaseCPUKernel::Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "ConvolutionBase init failed.";
-    return RET_ERROR;
-  }
   // config input output
   ConfigInputOutput();
   CheckSupportOptimize();
-  ret = SetQuantParam();
+  auto ret = SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
   // init for opt
   if (support_optimize_) {
-    ret = InitOpt();
+    ret = InitWeightBiasOpt();
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Initialization for optimized int8 conv failed.";
       return RET_ERROR;
     }
-    return RET_OK;
+  } else {
+    // init for situation that not support sdot
+    ret = InitWeightBias();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init weight bias failed.";
+      return RET_ERROR;
+    }
   }
-
-  // init for situation that not support sdot
-  ret = InitWeightBias();
+  return ReSize();
+}
+
+int ConvolutionInt8CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
   }
-  // init tmp input, output
-  ret = InitTmpBuffer();
+
+  FreeTmpBuffer();
+  if (nhwc4_input_ != nullptr) {
+    free(nhwc4_input_);
+    nhwc4_input_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    MS_LOG(ERROR) << "ConvolutionBase init failed.";
+    return RET_ERROR;
+  }
+
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+
+  /*=============================packed_input_============================*/
+  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
+  int output_tile_count = UP_DIV(output_count, tile_num_);
+  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
+  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
+  int packed_input_size = output_tile_count * tile_num_ * unit_size;
+  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed_input_ failed.";
     return RET_ERROR;
   }
+  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
+
   return RET_OK;
 }
 
@@ -369,7 +343,7 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
   return RET_OK;
 }
 
-int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *mpenv, void *cdata) {
   auto conv = reinterpret_cast<ConvolutionInt8CPUKernel *>(cdata);
   auto error_code = conv->RunImpl(task_id);
   if (error_code != RET_OK) {
@@ -385,19 +359,33 @@ int ConvolutionInt8CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare failed.";
     return RET_ERROR;
   }
+  if (support_optimize_) {
+    ret = InitTmpBufferOpt();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init tmp buffer failed.";
+      return RET_ERROR;
+    }
+  } else {
+    // init tmp input, output
+    ret = InitTmpBuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init tmp buffer failed.";
+      return RET_ERROR;
+    }
+  }
+
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  convert_func_(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
+                conv_param_->input_channel_);
 
   int error_code = LiteBackendParallelLaunch(ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
index c577833618..4b6ddacf5b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@@ -30,14 +30,27 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
                            const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                            const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionInt8CPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionInt8CPUKernel() override {
+    FreeQuantParam();
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+    if (input_sum_ != nullptr) {
+      free(input_sum_);
+      input_sum_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
   int Run() override;
   int RunImpl(int task_id);
   void CheckSupportOptimize();
-  int InitOpt();
   int InitWeightBiasOpt();
   int InitTmpBufferOpt();
   int InitWeightBias();
@@ -46,27 +59,14 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
-    if (input_sum_ != nullptr) {
-      free(input_sum_);
-      input_sum_ = nullptr;
-    }
     if (tmp_dst_ != nullptr) {
-      free(tmp_dst_);
+      ctx_->allocator->Free(tmp_dst_);
       tmp_dst_ = nullptr;
     }
     if (tmp_out_ != nullptr) {
-      free(tmp_out_);
+      ctx_->allocator->Free(tmp_out_);
       tmp_out_ = nullptr;
     }
-    FreeQuantParam();
   }
   bool support_optimize_ = true;
   int8_t *packed_weight_ = nullptr;
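The header change above draws a clear ownership line: buffers that survive across runs (packed_weight_, packed_input_, input_sum_) are released only in the destructor, while FreeTmpBuffer() now touches just the allocator-backed per-run scratch (tmp_dst_, tmp_out_) that Run() acquires and must release on every exit path. A condensed sketch of that lifecycle, assuming a toy allocator-backed context; DemoConvKernel and Allocator are illustrative names, not the real runtime classes:

#include <cstdlib>
#include <cstring>

struct Allocator {  // stand-in for the runtime allocator behind ctx_->allocator
  void *Malloc(size_t size) { return malloc(size); }
  void Free(void *ptr) { free(ptr); }
};
struct Context {
  Allocator *allocator = nullptr;
};

class DemoConvKernel {
 public:
  explicit DemoConvKernel(Context *ctx) : ctx_(ctx) {}
  ~DemoConvKernel() { free(packed_input_); }  // persistent buffer: destructor-owned

  int ReSize(size_t elems) {  // shape changed: rebuild the persistent buffer
    free(packed_input_);
    packed_input_ = static_cast<int8_t *>(malloc(elems));
    if (packed_input_ == nullptr) return -1;
    memset(packed_input_, 0, elems);
    elems_ = elems;
    return 0;
  }

  int Run() {
    // per-run scratch: taken from the shared allocator, never outlives Run()
    tmp_dst_ = static_cast<int32_t *>(ctx_->allocator->Malloc(elems_ * sizeof(int32_t)));
    if (tmp_dst_ == nullptr) return -1;
    int err = Compute();
    FreeTmpBuffer();  // released on the error path and the success path alike
    return err;
  }

 private:
  int Compute() { return 0; }  // placeholder for the actual convolution
  void FreeTmpBuffer() {
    if (tmp_dst_ != nullptr) {
      ctx_->allocator->Free(tmp_dst_);
      tmp_dst_ = nullptr;
    }
  }
  Context *ctx_;
  int8_t *packed_input_ = nullptr;
  int32_t *tmp_dst_ = nullptr;
  size_t elems_ = 0;
};

int main() {
  Allocator alloc;
  Context ctx{&alloc};
  DemoConvKernel kernel(&ctx);
  return kernel.ReSize(64) == 0 ? kernel.Run() : -1;
}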
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
index ec85e8cf52..539d4cb3bb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
@@ -228,10 +228,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float
 #ifdef ENABLE_NEON
         vst1q_f32(packed_input + channel_block_offset, vld1q_f32(input_data + channel_block_stride));
 #else
-        (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
-        (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
-        (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
-        (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
+        for (int k = 0; k < C4NUM; ++k) {
+          (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k];
+        }
 #endif
       }  // channel_block loop
     }    // kernel_w loop
@@ -349,10 +348,9 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
       for (int m = 0; m < ic4; m++) {
         int channel_block_stride = input_x_stride + m * C4NUM;
         int channel_block_offset = input_plane_offset + m * tile_num * C4NUM;
-        (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
-        (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
-        (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
-        (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
+        for (int k = 0; k < C4NUM; k++) {
+          (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k];
+        }
       }  // channel_block loop
     }    // kernel_w loop
   }      // kernel_h loop
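On the non-NEON path above, one channel block is now copied with a C4NUM loop instead of four hand-unrolled statements; both forms move the same four values, and the loop stays correct if the block width ever changes. A self-contained rendering of that fallback, with buffer contents made up for the demo:

#include <cstdio>

#define C4NUM 4  // channel block width, as in nnacl

int main() {
  float input_data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float packed_input[C4NUM] = {0};
  int channel_block_stride = 2;  // source offset, computed per block in Im2ColPackUnitFp32
  int channel_block_offset = 0;  // destination offset in the packed buffer
  // the rewritten fallback: one block copied per C4NUM-wide loop
  for (int k = 0; k < C4NUM; ++k) {
    (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k];
  }
  for (int k = 0; k < C4NUM; ++k) {
    printf("%.0f ", packed_input[k]);  // prints: 2 3 4 5
  }
  printf("\n");
  return 0;
}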