From 2bf61d2da1689b356bb0a1e1b48e132bdbbf5f8e Mon Sep 17 00:00:00 2001
From: yangruoqi713 <yangruoqi@huawei.com>
Date: Fri, 21 Aug 2020 16:56:41 +0800
Subject: [PATCH] [MS][LITE] arm cpu fp32 op: move weight and bias
 initialization to function Init

---
 .../arm/fp16/convolution_depthwise_fp16.cc    | 111 ++++++++++--------
 .../arm/fp16/convolution_depthwise_fp16.h     |   1 +
 .../arm/fp16/deconvolution_depthwise_fp16.cc  |  87 +++++++-------
 .../arm/fp16/deconvolution_depthwise_fp16.h   |   1 +
 .../kernel/arm/fp32/convolution_depthwise.cc  |  50 ++++----
 .../arm/fp32/convolution_depthwise_3x3.cc     |  76 ++++++------
 .../arm/fp32/convolution_depthwise_3x3.h      |   8 +-
 .../arm/fp32/deconvolution_depthwise.cc       |  50 ++++----
 .../arm/int8/convolution_depthwise_int8.cc    |  57 ++++-----
 .../arm/int8/deconvolution_depthwise_int8.cc  |  62 ++++------
 .../lite/src/runtime/kernel/arm/nnacl/pack.c  |  18 +--
 .../lite/src/runtime/kernel/arm/nnacl/pack.h  |   3 +-
 12 files changed, 254 insertions(+), 270 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index cfe10e7c3c..ee1e750a79 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -29,66 +29,67 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
-ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); }
-
-void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
-
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
-  if (packed_input_ != nullptr) {
-    delete packed_input_;
-    packed_input_ = nullptr;
-  }
-  if (packed_output_ != nullptr) {
-    delete packed_output_;
-    packed_output_ = nullptr;
+  FreeTmpBuffer();
+}
+
+void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {  // releases the packing buffers made by InitBuffer()
+  if (need_align_) {  // otherwise Run() aliases packed_* to the execute tensors, which must not be freed here
+    if (packed_input_ != nullptr) {
+      free(packed_input_);  // allocated with malloc() in InitBuffer(), so release with free(), not delete
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);  // same allocator pairing as packed_input_
+      packed_output_ = nullptr;
+    }
   }
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
-  // malloc pack input buffer
-  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
-  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
-  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
+  need_align_ = (conv_param_->input_channel_ % C8NUM != 0);  // buffers below are C8-blocked, so test against C8NUM
+  if (need_align_) {  // flag is recomputed on every call so a stale value cannot survive a ReSize()
+    int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+    int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
+    packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
+    if (packed_input_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
 
-  // malloc pack output buffer
-  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
-  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
-  if (packed_output_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
+    packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
   return RET_OK;
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
-  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   auto weight_tensor = in_tensors_[kWeightIndex];
+  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                           conv_param_->output_channel_);
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -97,8 +98,9 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
   auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < conv_param_->output_channel_; i++) {
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
+    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
       bias_fp16[i] = (float16_t)ori_bias[i];
     }
   }
@@ -108,6 +110,18 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
+
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -116,21 +130,12 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
 
 int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
   FreeTmpBuffer();
-  // conv base init
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
-  // init sliding_ window param
-  sliding_ = new SlidingWindowParam;
   InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
 
-  ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
-    return RET_ERROR;
-  }
-
   ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
@@ -171,19 +176,25 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  } else {
+    packed_input_ = execute_input_;
+  }
+  if (!need_align_) {
+    packed_output_ = execute_output_;
+  }
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-
-  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
-                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-
+  if (need_align_) {
+    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index f325a30ece..c68df7f36e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -56,6 +56,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
   float16_t *packed_output_ = nullptr;
+  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 146e130502..049fc4ed23 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -28,25 +28,28 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
-DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); }
-
-void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
-
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
-  if (packed_input_ != nullptr) {
-    delete packed_input_;
-    packed_input_ = nullptr;
-  }
-  if (packed_output_ != nullptr) {
-    delete packed_output_;
-    packed_output_ = nullptr;
+  FreeTmpBuffer();
+}
+
+void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {  // releases the packing buffers made by InitBuffer()
+  if (need_align_) {  // NOTE(review): no hunk for this file sets need_align_ - confirm InitBuffer() does
+    if (packed_input_ != nullptr) {
+      free(packed_input_);  // allocated with malloc() in InitBuffer(), so release with free(), not delete
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);  // same allocator pairing as packed_input_
+      packed_output_ = nullptr;
+    }
   }
 }
 
@@ -59,14 +62,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
   conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H);
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
-
-  // init sliding_ window param
   InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
   return RET_OK;
 }
 
 int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
-  // malloc pack input buffer
   int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
   packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
@@ -74,7 +74,6 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
 
   int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
   packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
@@ -88,21 +87,19 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
 
 int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
-  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   auto weight_tensor = in_tensors_[kWeightIndex];
+  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                           conv_param_->output_channel_);
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -110,8 +107,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < conv_param_->output_channel_; i++) {
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
+    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
       reinterpret_cast<float *>(bias_data_)[i] = (float16_t)ori_bias[i];
     }
   }
@@ -121,6 +119,17 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
 }
 
 int DeconvolutionDepthwiseFp16CPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new SlidingWindowParam fail!";
+    return RET_ERROR;
+  }
+
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -129,25 +138,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() {
 
 int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
   FreeTmpBuffer();
-
-  sliding_ = new (std::nothrow) SlidingWindowParam;
-  if (sliding_ == nullptr) {
-    MS_LOG(ERROR) << "new SlidingWindowParam fail!";
-    return RET_ERROR;
-  }
   InitSlideParam();
-  // conv base init
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
-
-  ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
-    return RET_ERROR;
-  }
-
   ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
@@ -188,18 +183,26 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  } else {
+    packed_input_ = execute_input_;
+  }
 
+  if (!need_align_) {
+    packed_output_ = execute_output_;
+  }
   ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
     return RET_ERROR;
   }
 
-  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
-                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  if (need_align_) {
+    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
index cb7bc4b83d..fe1a4bcebb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@@ -57,6 +57,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
   float16_t *packed_output_ = nullptr;
+  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
index d9bcba5f67..b5cc0854d2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
@@ -29,18 +29,19 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
-ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); }
-
-void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
+ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
-
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+}
+
+void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
   if (need_align_) {
     if (packed_input_ != nullptr) {
       delete packed_input_;
@@ -57,19 +58,17 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -78,16 +77,14 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
 
-  // init threadNum;
   conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseCPUKernel::InitBuffer() {
-  // malloc pack input and output buffer
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -97,7 +94,6 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-    memset(packed_input_, 0, pack_input_size * sizeof(float));
 
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
@@ -111,32 +107,29 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
 }
 
 int ConvolutionDepthwiseCPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int ConvolutionDepthwiseCPUKernel::ReSize() {
-  FreeTmpBuffer();
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init sliding window param
   sliding_ = new (std::nothrow) SlidingWindowParam;
   if (sliding_ == nullptr) {
     MS_LOG(ERROR) << "new sliding window param failed.";
     return RET_ERROR;
   }
-  InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
 
   auto ret = InitWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed.";
     return RET_ERROR;
   }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ConvolutionDepthwiseCPUKernel::ReSize() {
+  FreeTmpBuffer();
+  ConvolutionBaseCPUKernel::Init();
+  InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
 
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
     return RET_ERROR;
@@ -173,7 +166,6 @@ int ConvolutionDepthwiseCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
-  // pack input: to nhwc4
   if (need_align_) {
     PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
index 5d8bcd73b4..b56df7423c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
@@ -27,12 +27,41 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
+ConvolutionDepthwise3x3CPUKernel::~ConvolutionDepthwise3x3CPUKernel() {
+  FreeTmpBufer();  // the same buffers ReSize() releases before it calls InitBuffer()
+  if (block_buffer_ != nullptr) {
+    free(block_buffer_);
+    block_buffer_ = nullptr;
+  }
+  if (packed_weight_ != nullptr) {  // FreeTmpBufer() does not release packed_weight_, so it is freed here
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
+}
+
+void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() {  // used by the destructor and by ReSize()
+  if (need_align_) {  // NOTE(review): presumably packed_* alias tensor data otherwise - confirm in Run()
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
+  }
+  if (trans_buffer_ != nullptr) {  // released regardless of need_align_
+    free(trans_buffer_);
+    trans_buffer_ = nullptr;
+  }
+}
+
 int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
   // o h w 1 -> o/4 h w 1 4
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
   int weight_c4_size = OC4 * C4NUM * 9;
   auto tmp_weight = reinterpret_cast<float *>(malloc(weight_c4_size * sizeof(float)));
   if (tmp_weight == nullptr) {
@@ -40,8 +69,8 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(tmp_weight, 0, weight_c4_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
   // weight transform
   int packed_weight_size = OC4 * C4NUM * 16;
@@ -62,8 +91,9 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
@@ -106,48 +136,22 @@ int ConvolutionDepthwise3x3CPUKernel::Init() {
     MS_LOG(ERROR) << "malloc block buffer failed.";
     return RET_ERROR;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret;
+    return ret;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() {
-  if (need_align_) {
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
-    if (packed_output_ != nullptr) {
-      free(packed_output_);
-      packed_output_ = nullptr;
-    }
-  }
-  if (trans_buffer_ != nullptr) {
-    free(trans_buffer_);
-    trans_buffer_ = nullptr;
-  }
-  if (packed_weight_ != nullptr) {
-    free(packed_weight_);
-    packed_weight_ = nullptr;
-  }
-}
-
 int ConvolutionDepthwise3x3CPUKernel::ReSize() {
   FreeTmpBufer();
-
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
 
-  auto ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret;
-    return ret;
-  }
-  // init threadNum;
-  conv_param_->thread_num_ = MSMIN(thread_count_, UP_DIV(conv_param_->output_channel_, C4NUM));
-
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise3x3 fp32 initBuffer error!ret: " << ret;
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
index 0e04d764ec..bc4651a0d5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
@@ -30,13 +30,7 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel {
                                    const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
 
-  ~ConvolutionDepthwise3x3CPUKernel() override {
-    FreeTmpBufer();
-    if (block_buffer_ != nullptr) {
-      free(block_buffer_);
-      block_buffer_ = nullptr;
-    }
-  };
+  ~ConvolutionDepthwise3x3CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
index 3b83cd1d48..7af1563963 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
@@ -27,18 +27,19 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
-DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); }
-
-void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
+DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
-
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+}
+
+void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
   if (need_align_) {
     if (packed_input_ != nullptr) {
       delete packed_input_;
@@ -60,9 +61,6 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
   conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H);
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
-
-  // init sliding window param
-  sliding_ = new SlidingWindowParam;
   InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
   return RET_OK;
 }
@@ -71,19 +69,17 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -92,16 +88,14 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
 
-  // init threadNum;
-  conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4);
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
-  // malloc pack input and output buffer
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -111,7 +105,6 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-    memset(packed_input_, 0, pack_input_size * sizeof(float));
 
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
@@ -126,6 +119,17 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
 }
 
 int DeconvolutionDepthwiseCPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
+
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret;
+    return ret;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -135,16 +139,9 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
 int DeconvolutionDepthwiseCPUKernel::ReSize() {
   FreeTmpBuffer();
   InitSlideParam();
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
 
-  auto ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret;
-    return ret;
-  }
-
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret;
     return ret;
@@ -181,7 +178,6 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
-  // pack input: to nhwc4
   if (need_align_) {
     PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
index fe2bae2198..716b696fa8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@@ -29,15 +29,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
 void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
-  if (sliding != nullptr) {
-    delete sliding;
-    sliding = nullptr;
-  }
-
-  if (packed_weight_ != nullptr) {
-    free(packed_weight_);
-    packed_weight_ = nullptr;
-  }
   if (packed_input_ != nullptr) {
     free(packed_input_);
     packed_input_ = nullptr;
@@ -51,6 +42,14 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
 }
 
 ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
+  if (sliding != nullptr) {
+    delete sliding;
+    sliding = nullptr;
+  }
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
   FreeTmpBuffer();
   FreeQuantParam();
 }
@@ -58,18 +57,18 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
 int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight, int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_[kWeightIndex]->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
-  PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
+  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
+                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
 
-  // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -77,18 +76,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t));
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
   }
+
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
-  // malloc packed input buffer
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                         UP_DIV(conv_param_->input_channel_, 4);
   packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
-  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
@@ -108,6 +108,11 @@ int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::Init() {
+  sliding = new (std::nothrow) SlidingWindowParam;
+  if (sliding == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param.";
+    return RET_ERROR;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -116,32 +121,19 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
 
 int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
   FreeTmpBuffer();
-
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
-
-  // init sliding window param
-  sliding = new (std::nothrow) SlidingWindowParam;
-  if (sliding == nullptr) {
-    MS_LOG(ERROR) << "new sliding window param.";
-    return RET_ERROR;
-  }
   InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
 
-  // init quant param
   auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
-
-  // init weight and bias
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
     return ret;
   }
-
   ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
@@ -177,7 +169,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  // pack input, assume input format: NHWC -> NHWC4
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
   PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
index 059a790480..f878f95276 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
@@ -29,11 +29,6 @@ using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
 DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
-  FreeTmpBuffer();
-  FreeQuantParam();
-}
-
-void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   if (sliding != nullptr) {
     delete sliding;
     sliding = nullptr;
@@ -42,6 +37,11 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+  FreeQuantParam();
+}
+
+void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   if (packed_input_ != nullptr) {
     delete packed_input_;
     packed_input_ = nullptr;
@@ -61,18 +61,18 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
 int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight: int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_[kWeightIndex]->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
-  PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
+  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
+                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
 
-  // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -80,9 +80,11 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t));
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
   }
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
@@ -96,7 +98,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
 
-  // init sliding window param
   InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
 
   sliding->in_h_step_ = conv_param_->input_w_ * C4NUM;
@@ -108,11 +109,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
-  // malloc packed input buffer
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                         UP_DIV(conv_param_->input_channel_, 4);
   packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
-  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
@@ -130,7 +129,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
     memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
   }
 
-  // malloc tmp buffer for int32 output
   output_buffer_ =
     reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
   if (output_buffer_ == nullptr) {
@@ -145,41 +143,33 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
-
   sliding = new (std::nothrow) SlidingWindowParam;
   if (sliding == nullptr) {
     MS_LOG(ERROR) << "new SlidingWindowParam fail!";
     return RET_ERROR;
   }
-
-  InitSlideParam();
-
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init quant param
   auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
-
-  // init weight and bias
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
     return ret;
   }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
+  FreeTmpBuffer();
+  InitSlideParam();
+  ConvolutionBaseCPUKernel::Init();
 
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
index 834634e53b..148040fa73 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
@@ -1035,18 +1035,18 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
   }
 }
 
-void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, const ConvParameter *conv_param) {
-  int weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[0].zp_;
-  int unit = conv_param->kernel_h_ * conv_param->kernel_w_;
-  for (int c = 0; c < conv_param->output_channel_; c++) {
-    if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
-      weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[c].zp_;
+void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
+                             ConvQuantArg *quant_qrg) {
+  int weight_zp = quant_qrg->filter_quant_args_[0].zp_;
+  for (int c = 0; c < channel; c++) {
+    if (quant_qrg->per_channel_ & FILTER_PER_CHANNEL) {
+      weight_zp = quant_qrg->filter_quant_args_[c].zp_;
     }
     int c4_block_num = c / C4NUM;
     int c4_block_rem = c % C4NUM;
-    const int8_t *src_c = origin_weight + c * unit;
-    int16_t *dst_c = packed_weight_ + c4_block_num * unit * C4NUM;
-    for (int k = 0; k < unit; k++) {
+    const int8_t *src_c = origin_weight + c * plane;
+    int16_t *dst_c = packed_weight_ + c4_block_num * plane * C4NUM;
+    for (int k = 0; k < plane; k++) {
       const int8_t *src_kernel = src_c + k;
       int16_t *dst_kernel = dst_c + C4NUM * k + c4_block_rem;
       *dst_kernel = (int16_t)(src_kernel[0] - weight_zp);
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
index e6d1fb0997..90786b7aa1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
@@ -100,7 +100,8 @@ void PackNCHWToNHWCInt8(const void *src, void *dst, int batch, int plane, int ch
 
 void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter *conv_param);
 
-void PackDepthwiseInt8Weight(const int8_t *src, int16_t *dst, const ConvParameter *conv_param);
+void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
+                             ConvQuantArg *quant_qrg);
 #ifdef __cplusplus
 }
 #endif