[MS][LITE] optimize arm cpu int8 op conv depthwise: add common and slide window functions to select

5 years ago · 8d06c2b8be
parent e6112ed1ba
commit 8d06c2b8be
10 changed files with 686 additions and 76 deletions
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
@ -29,7 +29,7 @@ mov x6, x1
 mov x7, x2
 mov x8, x4
-    LoopInputDepth16In:
+    LoopDepth16In:
    cmp x8, #16
    blt L4
    sub x8, x8, #16
@ -39,8 +39,8 @@ mov x8, x4
    ld1 {v16.4s, v17.4s}, [x0], #32
    cmp x8, #16
-    blt LoopInputDepth16Out
+    blt LoopDepth16Out
-    LoopInputDepth16:
+    LoopDepth16:
    fmla v16.4s, v0.4s, v2.4s
    fmla v17.4s, v1.4s, v3.4s
@ -61,9 +61,9 @@ mov x8, x4
    sub x8, x8, #16
    cmp x8, #16
-    bge LoopInputDepth16
+    bge LoopDepth16
-    LoopInputDepth16Out:
+    LoopDepth16Out:
    fmla v16.4s, v0.4s, v2.4s
    fmla v17.4s, v1.4s, v3.4s
    st1 {v16.4s, v17.4s}, [x9], #32
@ -81,7 +81,7 @@ mov x8, x4
    cmp x8, #4
    blt L0
-    LoopInputDepth4:
+    LoopDepth4:
    ld1 {v0.4s}, [x6], #16
    ld1 {v2.4s}, [x7], #16
    ld1 {v16.4s}, [x0], #16
@ -89,13 +89,13 @@ mov x8, x4
    st1 {v16.4s}, [x9], #16
    sub x8, x8, #4
    cmp x8, #4
-    bge LoopInputDepth4
+    bge LoopDepth4
    L0:
    cmp x8, #0
    beq Loop16LineEnd
-    LoopInputDepth0:
+    LoopDepth0:
    ldr s0, [x6], #4
    ldr s1, [x7], #4
    ldr s2, [x0], #4
@ -103,7 +103,7 @@ mov x8, x4
    fadd s2, s2, s0
    str s2, [x9], #4
    subs x8, x8, #1
-    bne LoopInputDepth0
+    bne LoopDepth0
    Loop16LineEnd:
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
@ -0,0 +1,169 @@
 #ifdef __aarch64__
 .text
 .align 5
 .global ConvDwInt8PostAlign4
 #ifndef __APPLE__
 .type ConvDwInt8PostAlign4, %function
 #endif
 // void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
 //                           int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
 //
 // Requantizes int32 accumulators read from buffer into int8 values written to dst.
 // Values are consumed in groups of 16, 8 and 4; a remainder of fewer than four values is left
 // untouched (ConvDwInt8Post finishes that tail in C).
 //
 // Per value:
 //     v = sqshl(v, left_shift)            saturating shift left
 //     v = sqrdmulh(v, out_multiplier)     saturating rounding doubling multiply, high half
 //     v = v + (((right_shift & v) >> 31)) adds -1 when both are negative, before the rounding shift
 //     v = srshl(v, right_shift)           rounding shift; a negative right_shift shifts right
 //     v = min(max(v + output_zp, acc_min), acc_max)
 //     dst = saturating narrow of v to 8 bits
 //
 // x0: dst, x1: buffer, w2: num_pixels, w3: output_zp, w4: out_multiplier,
 // w5: left_shift, w6: right_shift, w7: acc_min, [sp]: acc_max (ninth argument, loaded into w8)
 // Clobbers: x0, x1, w2, w8, v0-v3, v16-v19, v26-v31 and the flags.
 ConvDwInt8PostAlign4:
    // Only caller-saved registers are used (v8 ~ v15 and x19 ~ x29 are never touched), see
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // num_pixels is a 32-bit int: the upper half of x2 is unspecified, so the counter is handled as w2.
    ldr w8, [sp]
    dup v26.4s, w5
    dup v27.4s, w4
    dup v28.4s, w6
    dup v29.4s, w3
    dup v30.4s, w7
    dup v31.4s, w8
    cmp w2, #16
    blt LoopDepth8
    LoopDepth16:
        // w2 >= 16 here: requantize four vectors of four accumulators.
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16
        ld1 {v2.4s}, [x1], #16
        ld1 {v3.4s}, [x1], #16
        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s
        sqshl v2.4s, v2.4s, v26.4s
        sqshl v3.4s, v3.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s
        sqrdmulh v2.4s, v2.4s, v27.4s
        sqrdmulh v3.4s, v3.4s, v27.4s
        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s
        and v18.16b, v28.16b, v2.16b
        sshr v18.4s, v18.4s, #31
        sqadd v2.4s, v2.4s, v18.4s
        srshl v2.4s, v2.4s, v28.4s
        and v19.16b, v28.16b, v3.16b
        sshr v19.4s, v19.4s, #31
        sqadd v3.4s, v3.4s, v19.4s
        srshl v3.4s, v3.4s, v28.4s
        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s
        add v2.4s, v2.4s, v29.4s
        add v3.4s, v3.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s
        smax v2.4s, v2.4s, v30.4s
        smax v3.4s, v3.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s
        smin v2.4s, v2.4s, v31.4s
        smin v3.4s, v3.4s, v31.4s
        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s
        sqxtn v2.4h, v2.4s
        sqxtn v3.4h, v3.4s
        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h
        sqxtn v2.8b, v2.8h
        sqxtn v3.8b, v3.8h
        // Each vector now holds its four int8 results in lane s[0].
        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4
        st1 {v2.s}[0], [x0], #4
        st1 {v3.s}[0], [x0], #4
        sub w2, w2, #16
        cmp w2, #16
        bge LoopDepth16
    LoopDepth8:
        cmp w2, #8
        blt LoopDepth4
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16
        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s
        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s
        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s
        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s
        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h
        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4
        sub w2, w2, #8
        cmp w2, #8
        bge LoopDepth8
    LoopDepth4:
        cmp w2, #4
        blt End
        ld1 {v0.4s}, [x1], #16
        sqshl v0.4s, v0.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s
        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        add v0.4s, v0.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s
        sqxtn v0.4h, v0.4s
        sqxtn v0.8b, v0.8h
        st1 {v0.s}[0], [x0], #4
        sub w2, w2, #4
        // sub does not update the flags; jump back and let the compare at the loop head decide.
        b LoopDepth4
    End:
    ret
 #endif
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
@ -0,0 +1,122 @@
 #ifdef __aarch64__
 .text
 .align 5
 .global ConvDwInt8Row
 #ifndef __APPLE__
 .type ConvDwInt8Row, %function
 #endif
 // void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
 //                    int output_channel, int input_step, int8_t input_zp)
 //
 // For each of num_pixels pixels and each channel c in [0, output_channel):
 //     *output_ptr++ += (int16_t)(input_ptr[c] - input_zp) * weight_ptr[c];
 // After every pixel input_ptr advances by input_step bytes; the weights restart at weight_ptr.
 // Returns immediately when num_pixels <= 0.
 //
 // x0: output_ptr (read cursor), x1: input_ptr, x2: weight_ptr, w3: num_pixels,
 // w4: output_channel, w5: input_step, w6: input_zp
 // Scratch: x7 input cursor, x8 weight cursor, w9 channels left in the pixel, x10 output write cursor,
 //          w14-w16 scalar temporaries, v0-v3, v16-v21, v31.
 ConvDwInt8Row:
    // Only caller-saved registers are used (v8 ~ v15 and x19 ~ x29 are never touched), see
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // int and int8_t arguments only define the low bits of their register, so counters are kept in
    // w registers and input_step / input_zp are sign-extended before being used.
 cmp w3, #0
 ble End
 sxtw x5, w5
 sxtb w6, w6
 mov x10, x0
 dup v31.8b, w6
 LoopOutPixel:
 mov x7, x1
 mov x8, x2
 mov w9, w4
    LoopDepth16In:
    cmp w9, #16
    blt L8
    sub w9, w9, #16
    // Prologue of the software pipeline: load 16 channels, accumulate the first 8.
    ld1 {v0.8b, v1.8b}, [x7], #16
    ld1 {v2.8h, v3.8h}, [x8], #32
    ld1 {v16.4s, v17.4s}, [x0], #32
    ssubl v20.8h, v0.8b, v31.8b
    smlal v16.4s, v20.4h, v2.4h
    smlal2 v17.4s, v20.8h, v2.8h
    cmp w9, #16
    blt LoopDepth16Out
    LoopDepth16:
    // Finish the previous 16 channels, then start the next 16.
    st1 {v16.4s, v17.4s}, [x10], #32
    ld1 {v18.4s, v19.4s}, [x0], #32
    ssubl v21.8h, v1.8b, v31.8b
    smlal v18.4s, v21.4h, v3.4h
    smlal2 v19.4s, v21.8h, v3.8h
    st1 {v18.4s, v19.4s}, [x10], #32
    ld1 {v0.8b, v1.8b}, [x7], #16
    ld1 {v2.8h, v3.8h}, [x8], #32
    ld1 {v16.4s, v17.4s}, [x0], #32
    ssubl v20.8h, v0.8b, v31.8b
    smlal v16.4s, v20.4h, v2.4h
    smlal2 v17.4s, v20.8h, v2.8h
    sub w9, w9, #16
    cmp w9, #16
    bge LoopDepth16
    LoopDepth16Out:
    // Epilogue: store the pending first half and process the second half.
    st1 {v16.4s, v17.4s}, [x10], #32
    ld1 {v18.4s, v19.4s}, [x0], #32
    ssubl v21.8h, v1.8b, v31.8b
    smlal v18.4s, v21.4h, v3.4h
    smlal2 v19.4s, v21.8h, v3.8h
    st1 {v18.4s, v19.4s}, [x10], #32
    L8:
    cmp w9, #8
    blt L0
    LoopDepth8:
    ld1 {v0.8b}, [x7], #8
    ld1 {v2.8h}, [x8], #16
    ld1 {v16.4s, v17.4s}, [x0], #32
    ssubl v20.8h, v0.8b, v31.8b
    smlal v16.4s, v20.4h, v2.4h
    smlal2 v17.4s, v20.8h, v2.8h
    st1 {v16.4s, v17.4s}, [x10], #32
    sub w9, w9, #8
    cmp w9, #8
    bge LoopDepth8
    L0:
    cmp w9, #0
    beq Loop16LineEnd
    LoopDepth0:
    // Scalar tail, one channel at a time: acc += (int16_t)(input - input_zp) * weight.
    ldrsb w14, [x7], #1
    ldrsh w15, [x8], #2
    ldr w16, [x0], #4
    // The zero point is subtracted, matching ssubl in the vector paths.
    sub w14, w14, w6
    sxth w14, w14
    madd w14, w14, w15, w16
    str w14, [x10], #4
    subs w9, w9, #1
    bne LoopDepth0
    Loop16LineEnd:
 // add leaves the flags from subs intact, so bne still tests the pixel counter.
 subs w3, w3, #1
 add x1, x1, x5
 bne LoopOutPixel
 End:
 ret
 #endif
--- a/mindspore/lite/nnacl/int8/common_func.h
+++ b/mindspore/lite/nnacl/int8/common_func.h
@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co
                      size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                      size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
                      int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
 void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
                   int output_channel, int input_step, int8_t input_zp);
 void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
                          int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
 #endif
 #ifdef __cplusplus
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
@ -20,6 +20,99 @@
 #include "nnacl/int8/common_func.h"
 /*conv depthwise int8 begin*/
 // only support perlayer
 #ifndef ENABLE_ARM64
 // Portable reference for one kernel tap of the depthwise convolution.
 // For every pixel and channel it adds (input - input_zp) * weight to the next accumulator in
 // output_ptr; the input moves on by input_step elements per pixel while the weights are reused.
 void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
                    int output_channel, int input_step, int8_t input_zp) {
  int32_t *acc = output_ptr;
  const int8_t *pixel = input_ptr;
  for (int pixel_idx = 0; pixel_idx < num_pixels; ++pixel_idx, pixel += input_step) {
    for (int ch = 0; ch < output_channel; ++ch, ++acc) {
      // Remove the input zero point before multiplying with the weight.
      const int16_t centered = pixel[ch] - input_zp;
      *acc += centered * weight_ptr[ch];
    }
  }
 }
 #endif
 // Requantizes num_pixels int32 accumulators from buffer into int8 values in dst.
 // On ARM64 the largest multiple of four is handled by ConvDwInt8PostAlign4 and only the
 // remaining values go through the scalar loop; elsewhere the scalar loop handles everything.
 // The scalar loop also leaves the clamped result in buffer.
 void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
                     int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
  int first_scalar = 0;
 #ifdef ENABLE_ARM64
  first_scalar = num_pixels / 4 * 4;
  ConvDwInt8PostAlign4(dst, buffer, first_scalar, output_zp, out_multiplier, left_shift, right_shift, acc_min,
                       acc_max);
 #endif
  for (int idx = first_scalar; idx < num_pixels; ++idx) {
    // Scale: left shift, fixed-point multiply, then rounding shift by -right_shift.
    int32_t value = RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(buffer[idx] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
    value += output_zp;
    value = MSMAX(value, acc_min);
    value = MSMIN(value, acc_max);
    buffer[idx] = value;
    dst[idx] = (value);
  }
 }
 void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data,
                const int32_t *bias_data, const ConvParameter *conv_param, int task_id) {
  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
  int h_start = h_step * task_id;
  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
  int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0];
  int left_shift = conv_param->conv_quant_arg_.left_shift_[0];
  int right_shift = conv_param->conv_quant_arg_.right_shift_[0];
  int intput_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
  int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int acc_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int acc_max = conv_param->conv_quant_arg_.out_act_max_[0];
  for (int b = 0; b < conv_param->output_batch_; b++) {
    const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
    int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
    for (int oh = h_start; oh < h_end; oh++) {
      int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
      // init acc
      for (int ow = 0; ow < conv_param->output_w_; ow++) {
        memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t));
      }
      for (int kh = start_kh; kh < end_kh; kh++) {
        int ih = ih_origin + conv_param->dilation_w_ * kh;
        const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
        const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
          int out_w_start = MSMAX(
            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                         conv_param->stride_w_);
          int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
          const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
          int num_pixels = out_w_end - out_w_start;
          ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, intput_zp);
          weight_kh += conv_param->output_channel_;
        }
      }
      // post func, acc int32 -> dst int8
      ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp,
                     out_multiplier, left_shift, right_shift, acc_min, acc_max);
    }
  }
 }
 /*conv depthwise int8 end*/
 /*conv depthwise sliding window int8 begin*/
 void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
                              int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier,
                              int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max,
@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
 }
 #endif
-void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
+void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
-                const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
+                  const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
  const int16_t *src = input_data;
  int8_t *dst = output_data;
  bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
  }  // batch loop
  // output nhwc4
 }
-/*conv depthwise int8 end*/
+/*conv depthwise sliding window int8 end*/
 /*deconv depthwise int8 begin*/
 void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.h
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.h
@ -23,8 +23,12 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
+
-                const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
+void ConvDwInt8(int8_t *output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data,
                const int32_t *bias_data, const ConvParameter *conv_param, int task_id);
 void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
                  const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
 void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data,
                  const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@ -15,6 +15,7 @@
 */
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::kernel {
 ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
  if (sliding != nullptr) {
    delete sliding;
    sliding = nullptr;
  }
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
 int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
  // init weight, int8 -> int16
  // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
  auto weight_tensor = in_tensors_[kWeightIndex];
  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
-  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int channel = weight_tensor->Batch();
-  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
+  int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
  auto tmp_weight = reinterpret_cast<int8_t *>(malloc(pack_weight_size * sizeof(int8_t)));
  if (tmp_weight == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
                     weight_tensor->Batch());
  int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_;
  packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
-  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
+  for (int i = 0; i < weight_tensor->ElementsNum(); i++) {
-                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
+    packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp);
  }
-  bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
+  bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
-  memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
+  memset(bias_data_, 0, channel * sizeof(int32_t));
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
  }
  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
  return RET_OK;
 }
 int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                        UP_DIV(conv_param_->input_channel_, 4);
  packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  if (conv_param_->input_channel_ % C4NUM != 0) {
    need_align_ = true;
    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
                           UP_DIV(conv_param_->output_channel_, C4NUM);
    packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
    if (packed_input_ == nullptr) {
      MS_LOG(ERROR) << "Malloc buffer failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
 }
 int ConvolutionDepthwiseInt8CPUKernel::Init() {
  sliding = new (std::nothrow) SlidingWindowParam;
  if (sliding == nullptr) {
    MS_LOG(ERROR) << "new sliding window param.";
    return RET_ERROR;
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
 int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
  ConvolutionBaseCPUKernel::Init();
  InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
  auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Set quant param failed.";
    return ret;
  }
  conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
  ret = InitWeightBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
 }
 int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
-  ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
+  auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id;
-             sliding, task_id);
+  ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
             task_id);
  return RET_OK;
 }
@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) {
  return RET_OK;
 }
 // Allocates row_buffer_, the int32 accumulation scratch shared by all worker threads:
 // thread_num_ rows of output_w_ * output_channel_ elements each.
 // Returns RET_ERROR when the allocation fails, RET_OK otherwise.
 int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
  int output_row_size = conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_;
  // Size the allocation by the element type that is actually stored in the buffer.
  row_buffer_ = reinterpret_cast<int32_t *>(context_->allocator->Malloc(output_row_size * sizeof(int32_t)));
  if (row_buffer_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
 }
 int ConvolutionDepthwiseInt8CPUKernel::Run() {
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
  }
  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
+  input_ptr_ = reinterpret_cast<int8_t *>(input_tensor->Data());
  PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
-  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
+  auto output_tensor = out_tensors_.at(kOutputIndex);
-  if (!need_align_) {
+  output_ptr_ = reinterpret_cast<int8_t *>(output_tensor->Data());
    packed_output_ = output_addr;
  }
  ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
    return RET_ERROR;
  }
-  if (need_align_) {
+  context_->allocator->Free(row_buffer_);
    PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
    context_->allocator->Free(packed_output_);
  }
  context_->allocator->Free(packed_input_);
  return RET_OK;
 }
@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::T
                                               const mindspore::lite::PrimitiveC *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D);
-  auto kernel =
+  kernel::LiteKernel *kernel;
-    new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
  if (filter_quant_size == 1) {  // per tensor
    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  } else {  // per channel
    kernel =
      new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  }
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
  int Run() override;
  int InitWeightBias();
  int InitBuffer();
  int Execute(int task_id);
 private:
-  SlidingWindowParam *sliding = nullptr;
+  int InitBuffer();
  int16_t *packed_weight_ = nullptr;
-  int16_t *packed_input_ = nullptr;
+  int8_t *input_ptr_ = nullptr;
-  int8_t *packed_output_ = nullptr;
+  int8_t *output_ptr_ = nullptr;
-  bool need_align_ = false;
+  int32_t *row_buffer_ = nullptr;
 };
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc
@ -0,0 +1,182 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 #include "nnacl/int8/conv_depthwise_int8.h"
 #include "src/runtime/runtime_api.h"
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::kernel {
 // Releases the sliding-window parameters, the packed weights and the quantization parameters.
 ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() {
  // delete and free both accept a null pointer, so no explicit checks are required.
  delete sliding;
  sliding = nullptr;
  free(packed_weight_);
  packed_weight_ = nullptr;
  FreeQuantParam();
 }
 int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
  // init weight, int8 -> int16
  // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
  auto weight_tensor = in_tensors_[kWeightIndex];
  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
  packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
  bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
  }
  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
  return RET_OK;
 }
 // Allocates the per-run scratch buffers from the context allocator:
 //  - packed_input_: int16 copy of the input with the channel count rounded up to a multiple of C4NUM;
 //  - packed_output_: int8 output with padded channels, only when input_channel_ is not a
 //    multiple of C4NUM (need_align_ is set in that case).
 // Returns RET_ERROR when an allocation fails (nothing stays allocated then), RET_OK otherwise.
 int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() {
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                        UP_DIV(conv_param_->input_channel_, 4);
  packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  if (conv_param_->input_channel_ % C4NUM != 0) {
    need_align_ = true;
    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
                           UP_DIV(conv_param_->output_channel_, C4NUM);
    packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
    // Check the buffer that was just requested, and release the input buffer on failure so the
    // error path does not leak it.
    if (packed_output_ == nullptr) {
      MS_LOG(ERROR) << "Malloc buffer failed.";
      context_->allocator->Free(packed_input_);
      packed_input_ = nullptr;
      return RET_ERROR;
    }
  }
  return RET_OK;
 }
 // Creates the sliding-window parameter object and, once shapes are known, runs ReSize().
 // Returns RET_ERROR when the allocation fails, RET_OK when shape inference is still pending,
 // and the result of ReSize() otherwise.
 int ConvolutionDepthwiseSWInt8CPUKernel::Init() {
  sliding = new (std::nothrow) SlidingWindowParam;
  if (sliding == nullptr) {
    MS_LOG(ERROR) << "new sliding window param.";
    return RET_ERROR;
  }
  return InferShapeDone() ? ReSize() : RET_OK;
 }
 // Re-derives everything that depends on tensor shapes: base-kernel state, sliding-window
 // parameters (4-channel blocks), quantization parameters, then packed weights and bias.
 // Returns the first failing status, or RET_OK.
 int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() {
  ConvolutionBaseCPUKernel::Init();
  InitSlidingParamConvDw(sliding, conv_param_, C4NUM);

  auto status = ConvolutionBaseCPUKernel::SetQuantParam();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Set quant param failed.";
    return status;
  }

  status = InitWeightBias();
  if (status == RET_OK) {
    return RET_OK;
  }
  MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
  return status;
 }
 // Runs the sliding-window depthwise convolution for the slice identified by task_id,
 // reading the packed input/weights and writing into packed_output_. Always returns RET_OK.
 int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) {
  auto bias = reinterpret_cast<int32_t *>(bias_data_);
  ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, bias, conv_param_, sliding, task_id);
  return RET_OK;
 }
 int ConvDwSWInt8Run(void *cdata, int task_id) {
  auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseSWInt8CPUKernel *>(cdata);
  auto ret = conv_dw_int8->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
 }
 int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
    return RET_ERROR;
  }
  auto ret = Prepare();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare failed.";
    return RET_ERROR;
  }
  ret = InitBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
    return ret;
  }
  auto input_tensor = in_tensors_.at(kInputIndex);
  auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
  PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
  if (!need_align_) {
    packed_output_ = output_addr;
  }
  ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }
  if (need_align_) {
    PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
    context_->allocator->Free(packed_output_);
  }
  context_->allocator->Free(packed_input_);
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h
@ -0,0 +1,51 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "nnacl/fp32/conv_depthwise.h"
 namespace mindspore::kernel {
 // Int8 depthwise convolution kernel that uses the sliding-window implementation (ConvDwSWInt8).
 // It works on int16 packed input and weights whose channel count is padded to a multiple of C4NUM.
 class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  // Forwards all arguments to ConvolutionBaseCPUKernel; no other work is done here.
  ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                      const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  // Frees sliding, packed_weight_ and the quantization parameters.
  ~ConvolutionDepthwiseSWInt8CPUKernel() override;
  // Allocates sliding and calls ReSize() once shape inference is done.
  int Init() override;
  // Recomputes sliding-window and quantization parameters and repacks weights and bias.
  int ReSize() override;
  // Packs the input, launches the worker tasks and releases the per-run buffers.
  int Run() override;
  // Packs the weights to int16 and copies the bias into a zero-padded buffer.
  int InitWeightBias();
  // Allocates packed_input_ and, when channel padding is needed, packed_output_.
  int InitBuffer();
  // Computes the slice of the convolution selected by task_id.
  int Execute(int task_id);
 private:
  // Sliding-window parameters; allocated in Init(), deleted in the destructor.
  SlidingWindowParam *sliding = nullptr;
  // Packed int16 weights; malloc'ed in InitWeightBias(), freed in the destructor.
  int16_t *packed_weight_ = nullptr;
  // Packed int16 input; allocated per Run() from the context allocator.
  int16_t *packed_input_ = nullptr;
  // Output written by the workers: a padded scratch buffer when need_align_ is true,
  // otherwise the output tensor's own data.
  int8_t *packed_output_ = nullptr;
  // Set by InitBuffer() when input_channel_ is not a multiple of C4NUM.
  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_