!8813 [MS][LITE][Develop] Optimization of per-output-channel (per-oc) quantized int8 convolution

From: @lx0095 Reviewed-by: Signed-off-by:
4 years ago · a86c0da849
parent 0d49650bd5 451881bbac
commit a86c0da849
12 changed files with 797 additions and 1297 deletions
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
--- a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
--- a/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S
+++ b/mindspore/lite/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S
--- a/mindspore/lite/nnacl/int8/common_func_int8.h
+++ b/mindspore/lite/nnacl/int8/common_func_int8.h
@ -62,10 +62,6 @@ int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int3
 #endif

 #ifdef ENABLE_ARM32
-void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
-                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
-                          size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before,
-                          int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
 void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
                              int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp,
                              int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min,
@ -76,10 +72,6 @@ void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *wei
 void PostFuncInt8C4Neon64(const int32_t *in, const int32_t *bias, int8_t *out, size_t oc4div, size_t oc4res,
                          size_t plane, size_t stride, int32_t multiplier, int32_t left_shift, int32_t right_shift,
                          int32_t zp, int32_t mini, int32_t maxi);
-void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
-                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
-                          size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before,
-                          int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
 void ConvDw3x3Int8Neon64(int8_t *output, const int8_t *input, const int16_t *weight, const int32_t *bias,
                         int input_col_size, int input_row_size, int channel, int output_h, int output_w, int8_t in_zp,
                         int32_t out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift,
--- a/mindspore/lite/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_int8.c
@ -811,38 +811,25 @@ void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int
  return;
 }

-void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
+void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                 const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
-                      int32_t *multiplier, ConvParameter *conv_param) {
-  int is_per_channel = conv_param->conv_quant_arg_.filter_arg_num_ != 1 ? true : false;
+                 int32_t *multiplier, ConvParameter *conv_param, int32_t *filter_zp) {
+  int is_per_oc = (int)conv_param->conv_quant_arg_.filter_arg_num_ != 1; /* non-zero: more than one filter quant arg */
 #ifdef ENABLE_ARM32
-  MatmulInt8Neon32(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
+  MatmulInt8Neon32Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
                      conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
-                   conv_param->output_channel_, is_per_channel);
-#else
-  MatMulInt8_4x2_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
-                   left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
-                   conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                   is_per_channel);
-#endif
-  return;
-}
-
-void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
-                 const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
-                 int32_t *multiplier, ConvParameter *conv_param) {
-  int is_per_oc = (int)conv_param->conv_quant_arg_.filter_arg_num_ != 1;
-#ifdef ENABLE_ARM64
-  MatmulInt8Neon64(packed_input, packed_weight, dst, UP_ROUND(row, C4NUM), UP_ROUND(col, C4NUM), deep16, input_sum,
+                      conv_param->output_channel_, is_per_oc, filter_zp);
+#elif defined(ENABLE_ARM64) /* test definedness, matching the #ifdef above, instead of evaluating the macro value */
+  MatmulInt8Neon64Opt(packed_input, packed_weight, dst, UP_ROUND(row, C4NUM), UP_ROUND(col, C4NUM), deep16, input_sum,
                      bias, conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                   conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift, row, col,
-                   conv_param->output_channel_, is_per_oc);
+                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift, row,
+                      col, conv_param->output_channel_, is_per_oc, filter_zp);
 #else
-  MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
-                    left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
+  MatmulInt8Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias, /* neither ARM macro is set */
                conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                    is_per_oc);
+                conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
+                conv_param->output_channel_, is_per_oc, filter_zp);
 #endif
  return;
 }
--- a/mindspore/lite/nnacl/int8/conv_int8.h
+++ b/mindspore/lite/nnacl/int8/conv_int8.h
@ -43,13 +43,10 @@ void Conv1x1PreOptPert(const int8_t *src_input, int8_t *packed_input, int32_t *i
                       size_t plane_size, ConvParameter *conv_param);
 void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                 const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
-                 int32_t *multiplier, ConvParameter *conv_param);
+                 int32_t *multiplier, ConvParameter *conv_param, int32_t *filter_zp);
 void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                    const int32_t *bias, int row, int col, int deep4, int32_t *left_shift, int32_t *right_shift,
                    int32_t *multiplier, ConvParameter *conv_param, MATMUL_OPT_DP_FUNC matmul_func, int32_t *filter_zp);
-void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
-                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
-                      int32_t *multiplier, ConvParameter *conv_param);

 // int8 convolution 3x3
 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@ -250,6 +250,41 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
  return;
 }

+#ifndef ENABLE_ARM /* C implementation, compiled only when ENABLE_ARM is not defined */
+void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
+                   const int *bias, int mini, int maxi, int out_zp, int32_t *multiplier, int32_t *left_shift,
+                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp) {
+  int col_tile = C4NUM;
+  /* Supports per-layer quant and per-output-channel filter quant (selected by filter_peroc). */
+  /* a: C4NUM x C16NUM tiles, b: col_tile x C16NUM tiles (not 16x2 as previously noted) => dst: int8 row-major */
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r4div = r / C4NUM, r4mod = r % C4NUM;
+      int c4div = c / col_tile, c4mod = c % col_tile;
+      size_t ci = r * stride + c; /* dst index: rows are `stride` elements apart */
+      int32_t value = 0;
+      for (int d = 0; d < deep16; d++) { /* accumulate the int8 dot product over the packed depth */
+        int d16div = d / C16NUM, d16mod = d % C16NUM;
+        size_t ai = r4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
+        size_t bi = c4div * deep16 * col_tile + d16div * col_tile * C16NUM + c4mod * C16NUM + d16mod;
+        value = value + a[ai] * b[bi];
+      }
+      int32_t cur_input_sum = filter_peroc ? a_sums[r] * filter_zp[c] : a_sums[r];
+      value -= cur_input_sum; /* per-oc: a_sums[r] scaled by filter_zp[c]; per-layer: a_sums[r] used as given */
+      value += bias[c];
+      int32_t cur_left_shift = filter_peroc ? left_shift[c] : left_shift[0]; /* per-oc uses entry c, else entry 0 */
+      int32_t cur_right_shift = filter_peroc ? right_shift[c] : right_shift[0];
+      int32_t cur_multiplier = filter_peroc ? multiplier[c] : multiplier[0];
+      value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + out_zp;
+      value = MSMIN(maxi, value); /* clamp to [mini, maxi] before narrowing */
+      value = MSMAX(mini, value);
+      dst[ci] = (int8_t)value;
+    }
+  }
+  return;
+}
+#endif
+
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@ -60,6 +60,9 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                       size_t per_channel, int32_t *filter_zp);
+void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
+                   const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
+                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);

 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
@ -68,11 +71,18 @@ void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, i

 void MatMulR4Int8Neon64(const int8_t *a, const int8_t *b, int32_t *dst, int row4, int col4, int deep16,
                        const int *input_sum, const int *bias);
+void MatmulInt8Neon64Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16,
+                         const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
+                         int32_t *left_shift, int32_t *right_shift, int row, int col, int stride, int filter_peroc,
+                         int32_t *filter_zp);
 #endif
 #ifdef ENABLE_ARM32
 void MatmulInt8Neon32(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
                      const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
                      int *multiplier, int *left_shift, int *right_shift, int stride, int per_channel);
+void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
+                         const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
+                         int32_t *left_shift, int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);
 #endif
 #ifdef __cplusplus
 }
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@ -92,27 +92,19 @@ int Convolution1x1Int8OcOptPre(void *cdata, int task_id) {
 }

 int Convolution1x1Int8CPUKernel::OcRun(int task_id) {
-#ifdef ENABLE_ARM32
-  return RunArm32Oc(task_id);
-#else
  if (support_optimize_) {
    return RunArm64OptOc(task_id);
  } else {
-    return RunArm64Oc(task_id);
+    return RunArmOc(task_id);  // non-optimized path; takes over from the separate RunArm32Oc/RunArm64Oc
  }
-#endif
 }

 int Convolution1x1Int8CPUKernel::HwRun(int task_id) {
-#ifdef ENABLE_ARM32
-  return RunArm32Hw(task_id);
-#else
  if (support_optimize_) {
    return RunArm64OptHw(task_id);
  } else {
-    return RunArm64Hw(task_id);
+    return RunArmHw(task_id);  // non-optimized path; takes over from the separate RunArm32Hw/RunArm64Hw
  }
-#endif
 }

 int Convolution1x1Int8CPUKernel::InitRunBuf() {
@ -124,6 +116,7 @@ int Convolution1x1Int8CPUKernel::InitRunBuf() {

  size_t size = support_optimize_ ? UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM)
                                  : UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM);
+
  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(size * sizeof(int8_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!";
@ -333,8 +326,8 @@ int Convolution1x1Int8CPUKernel::InitParam() {
  matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
  matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);

-  int row_pack_count = 0;
-  int col_pack_count = 0;
+  int row_pack_count;
+  int col_pack_count;

 #ifdef ENABLE_ARM32
  row_pack_count = C4NUM;
@ -350,15 +343,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
 #endif

  /* init input sum size */
-  if (support_optimize_) {
  input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
-  } else {
-    if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-      input_sum_size_ = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
-    } else {
-      input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
-    }
-  }

  if (pre_trans_input_) {
    input_ptr_ = reinterpret_cast<int8_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
@ -404,7 +389,7 @@ void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_out
  return;
 }

-int Convolution1x1Int8CPUKernel::RunArm64Hw(int task_id) {
+int Convolution1x1Int8CPUKernel::RunArmHw(int task_id) {
  int cur_stride = thread_stride_hw_ * C4NUM;
  int res_stride = matmul_param_->row_ - task_id * thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
@ -415,51 +400,20 @@ int Convolution1x1Int8CPUKernel::RunArm64Hw(int task_id) {
  int8_t *hw_in = input_ptr_ + task_id * thread_stride_hw_ * C4NUM * conv_param_->input_channel_;
  int8_t *hw_out = output_ptr_ + task_id * thread_stride_hw_ * C4NUM * conv_param_->output_channel_;
  int8_t *hw_packed_in = packed_input_ + task_id * thread_stride_hw_ * C4NUM * matmul_param_->deep_16_;
-  int32_t *hw_input_sum = filter_peroc_ ? input_sum_ + task_id * thread_stride_hw_ * C4NUM * matmul_param_->col_4_
-                                        : input_sum_ + task_id * thread_stride_hw_ * C4NUM;
+  int32_t *hw_input_sum = input_sum_ + task_id * thread_stride_hw_ * C4NUM;

  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, matmul_param_->deep_);

  if (filter_peroc_) {
-    PackInputSum16x4PerChannel(hw_packed_in, hw_input_sum, filter_zp_ptr_, cur_hw, matmul_param_->deep_,
-                               matmul_param_->col_);
+    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), matmul_param_->deep_16_);
  } else {
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                             UP_ROUND(cur_hw, C4NUM), matmul_param_->deep_16_);
  }

  Conv1x1Int8(hw_packed_in, packed_weight_, hw_out, hw_input_sum, reinterpret_cast<int32_t *>(bias_data_), cur_hw,
-              matmul_param_->col_, matmul_param_->deep_16_, left_shift_, right_shift_, multiplier_, conv_param_);
-  return RET_OK;
-}
-
-int Convolution1x1Int8CPUKernel::RunArm32Hw(int task_id) {
-  int cur_stride = thread_stride_hw_ * C4NUM;
-  int res_stride = matmul_param_->row_ - task_id * thread_stride_hw_ * C4NUM;
-  int cur_hw = MSMIN(cur_stride, res_stride);
-  if (cur_hw <= 0) {
-    return RET_OK;
-  }
-
-  int8_t *hw_in = input_ptr_ + task_id * thread_stride_hw_ * C4NUM * conv_param_->input_channel_;
-  int8_t *hw_out = output_ptr_ + task_id * thread_stride_hw_ * C4NUM * conv_param_->output_channel_;
-  int8_t *hw_packed_in = packed_input_ + task_id * thread_stride_hw_ * C4NUM * matmul_param_->deep_16_;
-  int32_t *hw_input_sum = filter_peroc_ ? input_sum_ + task_id * thread_stride_hw_ * C4NUM * matmul_param_->col_2_
-                                        : input_sum_ + task_id * thread_stride_hw_ * C4NUM;
-
-  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, matmul_param_->deep_);
-
-  if (filter_peroc_) {
-    PackInputSum16x4PerChannelArm32(hw_packed_in, hw_input_sum, filter_zp_ptr_, cur_hw, conv_param_->input_channel_,
-                                    conv_param_->output_channel_);
-  } else {
-    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
-                             UP_ROUND(cur_hw, C4NUM), matmul_param_->deep_16_);
-  }
-
-  Conv1x1Int8Arm32(hw_packed_in, packed_weight_, hw_out, hw_input_sum, reinterpret_cast<int32_t *>(bias_data_), cur_hw,
-                   matmul_param_->col_, matmul_param_->deep_16_, left_shift_, right_shift_, multiplier_, conv_param_);
-
+              matmul_param_->col_, matmul_param_->deep_16_, left_shift_, right_shift_, multiplier_, conv_param_,
+              filter_zp_ptr_);
  return RET_OK;
 }

@ -489,26 +443,6 @@ int Convolution1x1Int8CPUKernel::RunArm64OptHw(int task_id) {
  return RET_OK;
 }

-int Convolution1x1Int8CPUKernel::RunArm32Oc(int task_id) {
-  int stride = thread_stride_oc_ * C2NUM;
-  int cur_stride = task_id * stride;
-  int res_stride = matmul_param_->col_ - cur_stride;
-  int cur_oc = MSMIN(stride, res_stride);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
-
-  int32_t *cur_input_sum = filter_peroc_ ? input_sum_ + cur_stride * matmul_param_->row_4_ : input_sum_;
-  int32_t *cur_left_shift = filter_peroc_ ? left_shift_ + cur_stride : conv_param_->conv_quant_arg_.left_shift_;
-  int32_t *cur_right_shift = filter_peroc_ ? right_shift_ + cur_stride : conv_param_->conv_quant_arg_.right_shift_;
-  int32_t *cur_multiplier = filter_peroc_ ? multiplier_ + cur_stride : conv_param_->conv_quant_arg_.quant_multiplier_;
-
-  Conv1x1Int8Arm32(packed_input_, packed_weight_ + cur_stride * matmul_param_->deep_16_, output_ptr_ + cur_stride,
-                   cur_input_sum, reinterpret_cast<int32_t *>(bias_data_) + cur_stride, matmul_param_->row_, cur_oc,
-                   matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
-  return RET_OK;
-}
-
 int Convolution1x1Int8CPUKernel::RunArm64OptOc(int task_id) {
  int stride = thread_stride_oc_ * C16NUM;
  int cur_stride = task_id * stride;
@ -531,8 +465,13 @@ int Convolution1x1Int8CPUKernel::RunArm64OptOc(int task_id) {
  return RET_OK;
 }

-int Convolution1x1Int8CPUKernel::RunArm64Oc(int task_id) {
-  int stride = thread_stride_oc_ * C4NUM;
+int Convolution1x1Int8CPUKernel::RunArmOc(int task_id) {
+#ifdef ENABLE_ARM32
+  int col_tile = C2NUM;
+#else
+  int col_tile = C4NUM;
+#endif
+  int stride = thread_stride_oc_ * col_tile;
  int cur_stride = task_id * stride;
  int res_stride = matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
@ -540,14 +479,14 @@ int Convolution1x1Int8CPUKernel::RunArm64Oc(int task_id) {
    return RET_OK;
  }

-  int32_t *cur_input_sum = filter_peroc_ ? input_sum_ + cur_stride * matmul_param_->row_4_ : input_sum_;
  int32_t *cur_left_shift = filter_peroc_ ? left_shift_ + cur_stride : conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift = filter_peroc_ ? right_shift_ + cur_stride : conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier = filter_peroc_ ? multiplier_ + cur_stride : conv_param_->conv_quant_arg_.quant_multiplier_;
+  int32_t *cur_zp = filter_peroc_ ? filter_zp_ptr_ + cur_stride : filter_zp_ptr_;

  Conv1x1Int8(packed_input_, packed_weight_ + cur_stride * matmul_param_->deep_16_, output_ptr_ + cur_stride,
-              cur_input_sum, reinterpret_cast<int32_t *>(bias_data_) + cur_stride, matmul_param_->row_, cur_oc,
-              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
+              input_sum_, reinterpret_cast<int32_t *>(bias_data_) + cur_stride, matmul_param_->row_, cur_oc,
+              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_, cur_zp);

  return RET_OK;
 }
@ -592,7 +531,12 @@ int Convolution1x1Int8CPUKernel::Run() {
        ParallelLaunch(this->context_->thread_pool_, Convolution1x1Int8OcOptPre, this, thread_count_hw_);
      } else {
        RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_);
-        PackInputSum16x4Int8(packed_input_, input_sum_, filter_zp_ptr_, conv_param_);
+        if (filter_peroc_) {
+          PackInputSum16x4PerLayer(packed_input_, input_sum_, 1, matmul_param_->row_4_, matmul_param_->deep_16_);
+        } else {
+          PackInputSum16x4PerLayer(packed_input_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
+                                   matmul_param_->row_4_, matmul_param_->deep_16_);
+        }
      }
      /* matmul parallel by oc */
      error_code = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Int8OcRun, this, thread_count_oc_);
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@ -50,11 +50,9 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
  int OcOptPre(int task_id);

 private:
-  int RunArm32Oc(int task_id);
-  int RunArm64Oc(int task_id);
+  int RunArmOc(int task_id);
  int RunArm64OptOc(int task_id);
-  int RunArm32Hw(int task_id);
-  int RunArm64Hw(int task_id);
+  int RunArmHw(int task_id);
  int RunArm64OptHw(int task_id);

 private:
--- a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc
@ -21,11 +21,6 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void IndirectGemmInt8_24x4_dp(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
-                                     size_t ksize, size_t ic4, size_t output_channel, size_t offset,
-                                     const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp,
-                                     int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after,
-                                     size_t asymmetric, size_t per_channel, size_t per_channel_offset);

 extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                  const int *input_sum, const int *bias);
@ -38,15 +33,6 @@ extern void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_
                            int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp);

 #ifdef ENABLE_ARM64
-void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
-                                       size_t ksize, size_t ic4, size_t output_channel, size_t offset,
-                                       const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp,
-                                       int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after,
-                                       size_t asymmetric, size_t per_channel, size_t per_channel_offset) {
-  return IndirectGemmInt8_24x4_dp(dst, src, weight, bias, ksize, ic4, output_channel, offset, input_sum, act_min,
-                                  act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel,
-                                  per_channel_offset);
-}

 void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                   const int *input_sum, const int *bias) {