From 2bf61d2da1689b356bb0a1e1b48e132bdbbf5f8e Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Fri, 21 Aug 2020 16:56:41 +0800 Subject: [PATCH] [MS][LITE] arm cpu fp32 op: move weight and bias initing to function Init --- .../arm/fp16/convolution_depthwise_fp16.cc | 111 ++++++++++-------- .../arm/fp16/convolution_depthwise_fp16.h | 1 + .../arm/fp16/deconvolution_depthwise_fp16.cc | 87 +++++++------- .../arm/fp16/deconvolution_depthwise_fp16.h | 1 + .../kernel/arm/fp32/convolution_depthwise.cc | 50 ++++---- .../arm/fp32/convolution_depthwise_3x3.cc | 76 ++++++------ .../arm/fp32/convolution_depthwise_3x3.h | 8 +- .../arm/fp32/deconvolution_depthwise.cc | 50 ++++---- .../arm/int8/convolution_depthwise_int8.cc | 57 ++++----- .../arm/int8/deconvolution_depthwise_int8.cc | 62 ++++------ .../lite/src/runtime/kernel/arm/nnacl/pack.c | 18 +-- .../lite/src/runtime/kernel/arm/nnacl/pack.h | 3 +- 12 files changed, 254 insertions(+), 270 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index cfe10e7c3c..ee1e750a79 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -29,66 +29,67 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { -ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); } - -void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { +ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { if (sliding_ != nullptr) { delete sliding_; sliding_ = nullptr; } - if (packed_weight_ != nullptr) { delete packed_weight_; packed_weight_ = nullptr; } - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; + FreeTmpBuffer(); +} + +void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { + if (need_align_) { + if (packed_input_ != nullptr) { + delete packed_input_; + packed_input_ = nullptr; + } + if (packed_output_ != nullptr) { + delete packed_output_; + packed_output_ = nullptr; + } } } int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() { - // malloc pack input buffer - int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(packed_input_, 0, pack_input_size * sizeof(float16_t)); + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; + int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; + packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } - // malloc pack output buffer - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); - if (packed_output_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; + packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } } return RET_OK; } int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 - int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM); auto weight_tensor = in_tensors_[kWeightIndex]; + int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); - PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, - conv_param_->output_channel_); + PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); - // init bias bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -97,8 +98,9 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); auto bias_fp16 = reinterpret_cast(bias_data_); if (in_tensors_.size() == kInputSize2) { - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - for (int i = 0; i < conv_param_->output_channel_; i++) { + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = reinterpret_cast(bias_tensor->Data()); + for (int i = 0; i < bias_tensor->ElementsNum(); i++) { bias_fp16[i] = (float16_t)ori_bias[i]; } } @@ -108,6 +110,18 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { } int ConvolutionDepthwiseFp16CPUKernel::Init() { + sliding_ = new (std::nothrow) SlidingWindowParam; + if (sliding_ == nullptr) { + MS_LOG(ERROR) << "new sliding window param failed."; + return RET_ERROR; + } + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed."; + return RET_ERROR; + } + if (!InferShapeDone()) { return RET_OK; } @@ -116,21 +130,12 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() { int ConvolutionDepthwiseFp16CPUKernel::ReSize() { FreeTmpBuffer(); - // conv base init auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { return ret; } - // init sliding_ window param - sliding_ = new SlidingWindowParam; InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); - ret = InitWeightBias(); - if (ret != 0) { - MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed."; - return RET_ERROR; - } - ret = InitBuffer(); if (ret != 0) { MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; @@ -171,19 +176,25 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { MS_LOG(ERROR) << "Get Execute tensor failed."; return ret; } - // pack input: to nhwc8 - PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, - conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + if (need_align_) { + PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + } else { + packed_input_ = execute_input_; + } + if (!need_align_) { + packed_output_ = execute_output_; + } ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; } - - PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - + if (need_align_) { + PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + } ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h index f325a30ece..c68df7f36e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h @@ -56,6 +56,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { float16_t *packed_weight_ = nullptr; float16_t *packed_input_ = nullptr; float16_t *packed_output_ = nullptr; + bool need_align_ = false; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index 146e130502..049fc4ed23 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -28,25 +28,28 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; namespace mindspore::kernel { -DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); } - -void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { +DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() { if (sliding_ != nullptr) { delete sliding_; sliding_ = nullptr; } - if (packed_weight_ != nullptr) { delete packed_weight_; packed_weight_ = nullptr; } - if (packed_input_ != nullptr) { - delete packed_input_; - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - delete packed_output_; - packed_output_ = nullptr; + FreeTmpBuffer(); +} + +void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() { + if (need_align_) { + if (packed_input_ != nullptr) { + delete packed_input_; + packed_input_ = nullptr; + } + if (packed_output_ != nullptr) { + delete packed_output_; + packed_output_ = nullptr; + } } } @@ -59,14 +62,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H); conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W); conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C); - - // init sliding_ window param InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); return RET_OK; } int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() { - // malloc pack input buffer int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); @@ -74,7 +74,6 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_input_, 0, pack_input_size * sizeof(float16_t)); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); @@ -88,21 +87,19 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() { int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 - int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM); auto weight_tensor = in_tensors_[kWeightIndex]; + int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); - PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, - conv_param_->output_channel_); + PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); - // init bias bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -110,8 +107,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { } memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); if (in_tensors_.size() == kInputSize2) { - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - for (int i = 0; i < conv_param_->output_channel_; i++) { + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = reinterpret_cast(bias_tensor->Data()); + for (int i = 0; i < bias_tensor->ElementsNum(); i++) { reinterpret_cast(bias_data_)[i] = (float16_t)ori_bias[i]; } } @@ -121,6 +119,17 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { } int DeconvolutionDepthwiseFp16CPUKernel::Init() { + sliding_ = new (std::nothrow) SlidingWindowParam; + if (sliding_ == nullptr) { + MS_LOG(ERROR) << "new SlidingWindowParam fail!"; + return RET_ERROR; + } + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; } @@ -129,25 +138,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() { int DeconvolutionDepthwiseFp16CPUKernel::ReSize() { FreeTmpBuffer(); - - sliding_ = new (std::nothrow) SlidingWindowParam; - if (sliding_ == nullptr) { - MS_LOG(ERROR) << "new SlidingWindowParam fail!"; - return RET_ERROR; - } InitSlideParam(); - // conv base init auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { return ret; } - - ret = InitWeightBias(); - if (ret != 0) { - MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed."; - return RET_ERROR; - } - ret = InitBuffer(); if (ret != 0) { MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed."; @@ -188,18 +183,26 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { MS_LOG(ERROR) << "Get Execute tensor failed."; return ret; } - // pack input: to nhwc8 - PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, - conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + if (need_align_) { + PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + } else { + packed_input_ = execute_input_; + } + if (!need_align_) { + packed_output_ = execute_output_; + } ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; } - PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + if (need_align_) { + PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + } ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h index cb7bc4b83d..fe1a4bcebb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h @@ -57,6 +57,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel float16_t *packed_weight_ = nullptr; float16_t *packed_input_ = nullptr; float16_t *packed_output_ = nullptr; + bool need_align_ = false; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc index d9bcba5f67..b5cc0854d2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc @@ -29,18 +29,19 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { -ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); } - -void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() { +ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() { if (sliding_ != nullptr) { delete sliding_; sliding_ = nullptr; } - if (packed_weight_ != nullptr) { delete packed_weight_; packed_weight_ = nullptr; } + FreeTmpBuffer(); +} + +void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() { if (need_align_) { if (packed_input_ != nullptr) { delete packed_input_; @@ -57,19 +58,17 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); - int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(float)); - PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, - conv_param_->output_channel_); + PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); - // init bias bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(float))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -78,16 +77,14 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() { memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); if (in_tensors_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); + memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float)); } - // init threadNum; conv_param_->thread_num_ = MSMIN(thread_count_, OC4); return RET_OK; } int ConvolutionDepthwiseCPUKernel::InitBuffer() { - // malloc pack input and output buffer if (conv_param_->input_channel_ % C4NUM != 0) { need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); @@ -97,7 +94,6 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_input_, 0, pack_input_size * sizeof(float)); int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; @@ -111,32 +107,29 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() { } int ConvolutionDepthwiseCPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } - return ReSize(); -} - -int ConvolutionDepthwiseCPUKernel::ReSize() { - FreeTmpBuffer(); - // conv base init - ConvolutionBaseCPUKernel::Init(); - - // init sliding window param sliding_ = new (std::nothrow) SlidingWindowParam; if (sliding_ == nullptr) { MS_LOG(ERROR) << "new sliding window param failed."; return RET_ERROR; } - InitSlidingParamConvDw(sliding_, conv_param_, C4NUM); auto ret = InitWeightBias(); if (ret != 0) { MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed."; return RET_ERROR; } + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int ConvolutionDepthwiseCPUKernel::ReSize() { + FreeTmpBuffer(); + ConvolutionBaseCPUKernel::Init(); + InitSlidingParamConvDw(sliding_, conv_param_, C4NUM); - ret = InitBuffer(); + auto ret = InitBuffer(); if (ret != 0) { MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; return RET_ERROR; @@ -173,7 +166,6 @@ int ConvolutionDepthwiseCPUKernel::Run() { auto input_tensor = in_tensors_.at(kInputIndex); auto input_addr = reinterpret_cast(input_tensor->Data()); - // pack input: to nhwc4 if (need_align_) { PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc index 5d8bcd73b4..b56df7423c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc @@ -27,12 +27,41 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { +ConvolutionDepthwise3x3CPUKernel::~ConvolutionDepthwise3x3CPUKernel() { + FreeTmpBufer(); + if (block_buffer_ != nullptr) { + free(block_buffer_); + block_buffer_ = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); + packed_weight_ = nullptr; + } +} + +void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() { + if (need_align_) { + if (packed_input_ != nullptr) { + free(packed_input_); + packed_input_ = nullptr; + } + if (packed_output_ != nullptr) { + free(packed_output_); + packed_output_ = nullptr; + } + } + if (trans_buffer_ != nullptr) { + free(trans_buffer_); + trans_buffer_ = nullptr; + } +} + int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); // o h w 1 -> o/4 h w 1 4 - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); int weight_c4_size = OC4 * C4NUM * 9; auto tmp_weight = reinterpret_cast(malloc(weight_c4_size * sizeof(float))); if (tmp_weight == nullptr) { @@ -40,8 +69,8 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() { return RET_ERROR; } memset(tmp_weight, 0, weight_c4_size * sizeof(float)); - PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, - conv_param_->output_channel_); + PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); // weight transform int packed_weight_size = OC4 * C4NUM * 16; @@ -62,8 +91,9 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() { memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); if (in_tensors_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); + memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float)); } + conv_param_->thread_num_ = MSMIN(thread_count_, OC4); return RET_OK; } @@ -106,48 +136,22 @@ int ConvolutionDepthwise3x3CPUKernel::Init() { MS_LOG(ERROR) << "malloc block buffer failed."; return RET_ERROR; } + auto ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret; + return ret; + } if (!InferShapeDone()) { return RET_OK; } return ReSize(); } -void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() { - if (need_align_) { - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } - if (packed_output_ != nullptr) { - free(packed_output_); - packed_output_ = nullptr; - } - } - if (trans_buffer_ != nullptr) { - free(trans_buffer_); - trans_buffer_ = nullptr; - } - if (packed_weight_ != nullptr) { - free(packed_weight_); - packed_weight_ = nullptr; - } -} - int ConvolutionDepthwise3x3CPUKernel::ReSize() { FreeTmpBufer(); - - // conv base init ConvolutionBaseCPUKernel::Init(); - auto ret = InitWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret; - return ret; - } - // init threadNum; - conv_param_->thread_num_ = MSMIN(thread_count_, UP_DIV(conv_param_->output_channel_, C4NUM)); - - ret = InitBuffer(); + auto ret = InitBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise3x3 fp32 initBuffer error!ret: " << ret; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h index 0e04d764ec..bc4651a0d5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h @@ -30,13 +30,7 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel { const mindspore::lite::PrimitiveC *primitive) : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~ConvolutionDepthwise3x3CPUKernel() override { - FreeTmpBufer(); - if (block_buffer_ != nullptr) { - free(block_buffer_); - block_buffer_ = nullptr; - } - }; + ~ConvolutionDepthwise3x3CPUKernel() override; int Init() override; int ReSize() override; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc index 3b83cd1d48..7af1563963 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc @@ -27,18 +27,19 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; namespace mindspore::kernel { -DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); } - -void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() { +DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() { if (sliding_ != nullptr) { delete sliding_; sliding_ = nullptr; } - if (packed_weight_ != nullptr) { delete packed_weight_; packed_weight_ = nullptr; } + FreeTmpBuffer(); +} + +void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() { if (need_align_) { if (packed_input_ != nullptr) { delete packed_input_; @@ -60,9 +61,6 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H); conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W); conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C); - - // init sliding window param - sliding_ = new SlidingWindowParam; InitSlidingParamConvDw(sliding_, conv_param_, C4NUM); return RET_OK; } @@ -71,19 +69,17 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); - int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(float)); - PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, - conv_param_->output_channel_); + PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); - // init bias bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(float))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -92,16 +88,14 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() { memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); if (in_tensors_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); + memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float)); } - // init threadNum; - conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4); + conv_param_->thread_num_ = MSMIN(thread_count_, OC4); return RET_OK; } int DeconvolutionDepthwiseCPUKernel::InitBuffer() { - // malloc pack input and output buffer if (conv_param_->input_channel_ % C4NUM != 0) { need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); @@ -111,7 +105,6 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_input_, 0, pack_input_size * sizeof(float)); int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; @@ -126,6 +119,17 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() { } int DeconvolutionDepthwiseCPUKernel::Init() { + sliding_ = new (std::nothrow) SlidingWindowParam; + if (sliding_ == nullptr) { + MS_LOG(ERROR) << "new sliding window param failed."; + return RET_ERROR; + } + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret; + return ret; + } if (!InferShapeDone()) { return RET_OK; } @@ -135,16 +139,9 @@ int DeconvolutionDepthwiseCPUKernel::Init() { int DeconvolutionDepthwiseCPUKernel::ReSize() { FreeTmpBuffer(); InitSlideParam(); - // conv base init ConvolutionBaseCPUKernel::Init(); - auto ret = InitWeightBias(); - if (ret != 0) { - MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret; - return ret; - } - - ret = InitBuffer(); + auto ret = InitBuffer(); if (ret != 0) { MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret; return ret; @@ -181,7 +178,6 @@ int DeconvolutionDepthwiseCPUKernel::Run() { auto input_tensor = in_tensors_.at(kInputIndex); auto input_addr = reinterpret_cast(input_tensor->Data()); - // pack input: to nhwc4 if (need_align_) { PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc index fe2bae2198..716b696fa8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc @@ -29,15 +29,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { - if (sliding != nullptr) { - delete sliding; - sliding = nullptr; - } - - if (packed_weight_ != nullptr) { - free(packed_weight_); - packed_weight_ = nullptr; - } if (packed_input_ != nullptr) { free(packed_input_); packed_input_ = nullptr; @@ -51,6 +42,14 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { } ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { + if (sliding != nullptr) { + delete sliding; + sliding = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); + packed_weight_ = nullptr; + } FreeTmpBuffer(); FreeQuantParam(); } @@ -58,18 +57,18 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { // init weight, int8 -> int16 // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 - auto origin_weight = reinterpret_cast(in_tensors_[kWeightIndex]->Data()); - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); - int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + auto weight_tensor = in_tensors_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); - PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); + PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); - // init bias, add output zp bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -77,18 +76,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { } memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); if (in_tensors_.size() == kInputSize2) { - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t)); + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = reinterpret_cast(bias_tensor->Data()); + memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); } + + conv_param_->thread_num_ = MSMIN(thread_count_, OC4); return RET_OK; } int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { - // malloc packed input buffer int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * UP_DIV(conv_param_->input_channel_, 4); packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); - memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -108,6 +108,11 @@ int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { } int ConvolutionDepthwiseInt8CPUKernel::Init() { + sliding = new (std::nothrow) SlidingWindowParam; + if (sliding == nullptr) { + MS_LOG(ERROR) << "new sliding window param."; + return RET_ERROR; + } if (!InferShapeDone()) { return RET_OK; } @@ -116,32 +121,19 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { int ConvolutionDepthwiseInt8CPUKernel::ReSize() { FreeTmpBuffer(); - - // conv base init ConvolutionBaseCPUKernel::Init(); - - // init sliding window param - sliding = new (std::nothrow) SlidingWindowParam; - if (sliding == nullptr) { - MS_LOG(ERROR) << "new sliding window param."; - return RET_ERROR; - } InitSlidingParamConvDw(sliding, conv_param_, C4NUM); - // init quant param auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); if (ret != RET_OK) { MS_LOG(ERROR) << "Set quant param failed."; return ret; } - - // init weight and bias ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; return ret; } - ret = InitBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; @@ -177,7 +169,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { return RET_ERROR; } - // pack input, assume input format: NHWC -> NHWC4 auto input_tensor = in_tensors_.at(kInputIndex); auto input_addr = reinterpret_cast(input_tensor->Data()); PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc index 059a790480..f878f95276 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc @@ -29,11 +29,6 @@ using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; namespace mindspore::kernel { DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() { - FreeTmpBuffer(); - FreeQuantParam(); -} - -void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { if (sliding != nullptr) { delete sliding; sliding = nullptr; @@ -42,6 +37,11 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { delete packed_weight_; packed_weight_ = nullptr; } + FreeTmpBuffer(); + FreeQuantParam(); +} + +void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { if (packed_input_ != nullptr) { delete packed_input_; packed_input_ = nullptr; @@ -61,18 +61,18 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { // init weight: int8 -> int16 // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 - auto origin_weight = reinterpret_cast(in_tensors_[kWeightIndex]->Data()); - int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); - int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + auto weight_tensor = in_tensors_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); - PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); + PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); - // init bias, add output zp bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; @@ -80,9 +80,11 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { } memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); if (in_tensors_.size() == kInputSize2) { - auto ori_bias = reinterpret_cast(in_tensors_.at(kBiasIndex)->Data()); - memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t)); + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = reinterpret_cast(bias_tensor->Data()); + memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); } + conv_param_->thread_num_ = MSMIN(thread_count_, OC4); return RET_OK; } @@ -96,7 +98,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W); conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C); - // init sliding window param InitSlidingParamConvDw(sliding, conv_param_, C4NUM); sliding->in_h_step_ = conv_param_->input_w_ * C4NUM; @@ -108,11 +109,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { } int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { - // malloc packed input buffer int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * UP_DIV(conv_param_->input_channel_, 4); packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); - memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -130,7 +129,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { memset(packed_output_, 0, pack_output_size * sizeof(int8_t)); } - // malloc tmp buffer for int32 output output_buffer_ = reinterpret_cast(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); if (output_buffer_ == nullptr) { @@ -145,41 +143,33 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { } int DeconvolutionDepthwiseInt8CPUKernel::Init() { - if (!InferShapeDone()) { - return RET_OK; - } - return ReSize(); -} - -int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { - FreeTmpBuffer(); - sliding = new (std::nothrow) SlidingWindowParam; if (sliding == nullptr) { MS_LOG(ERROR) << "new SlidingWindowParam fail!"; return RET_ERROR; } - - InitSlideParam(); - - // conv base init - ConvolutionBaseCPUKernel::Init(); - - // init quant param auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); if (ret != RET_OK) { MS_LOG(ERROR) << "Set quant param failed."; return ret; } - - // init weight and bias ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!"; return ret; } + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { + FreeTmpBuffer(); + InitSlideParam(); + ConvolutionBaseCPUKernel::Init(); - ret = InitBuffer(); + auto ret = InitBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c index 834634e53b..148040fa73 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c @@ -1035,18 +1035,18 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter } } -void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, const ConvParameter *conv_param) { - int weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[0].zp_; - int unit = conv_param->kernel_h_ * conv_param->kernel_w_; - for (int c = 0; c < conv_param->output_channel_; c++) { - if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { - weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[c].zp_; +void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel, + ConvQuantArg *quant_qrg) { + int weight_zp = quant_qrg->filter_quant_args_[0].zp_; + for (int c = 0; c < channel; c++) { + if (quant_qrg->per_channel_ & FILTER_PER_CHANNEL) { + weight_zp = quant_qrg->filter_quant_args_[c].zp_; } int c4_block_num = c / C4NUM; int c4_block_rem = c % C4NUM; - const int8_t *src_c = origin_weight + c * unit; - int16_t *dst_c = packed_weight_ + c4_block_num * unit * C4NUM; - for (int k = 0; k < unit; k++) { + const int8_t *src_c = origin_weight + c * plane; + int16_t *dst_c = packed_weight_ + c4_block_num * plane * C4NUM; + for (int k = 0; k < plane; k++) { const int8_t *src_kernel = src_c + k; int16_t *dst_kernel = dst_c + C4NUM * k + c4_block_rem; *dst_kernel = (int16_t)(src_kernel[0] - weight_zp); diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h index e6d1fb0997..90786b7aa1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h @@ -100,7 +100,8 @@ void PackNCHWToNHWCInt8(const void *src, void *dst, int batch, int plane, int ch void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter *conv_param); -void PackDepthwiseInt8Weight(const int8_t *src, int16_t *dst, const ConvParameter *conv_param); +void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel, + ConvQuantArg *quant_qrg); #ifdef __cplusplus } #endif