optimization for int8 matmul on arm64

pull/9048/head
lixian 4 years ago
parent 5ac54edfee
commit 7126bea143

@@ -3,9 +3,9 @@
 .text
 .align 5
-.global MatmulInt8Neon32Opt
+.global MatmulInt8Opt
 #ifndef __APPLE__
-.type MatmulInt8Neon32Opt, %function
+.type MatmulInt8Opt, %function
 #endif
 //void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
@@ -16,7 +16,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp
-MatmulInt8Neon32Opt:
+MatmulInt8Opt:
     push {r0-r11, lr}
     vpush {q4-q7}
     add sp, sp, #116
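For orientation in this AArch32 prologue: under the AAPCS the first four arguments (a, b, dst, row) arrive in r0-r3 and everything else is passed on the stack. `push {r0-r11, lr}` saves 13 four-byte core registers (52 bytes) and `vpush {q4-q7}` saves four 16-byte NEON registers (64 bytes), 116 bytes in total, so `add sp, sp, #116` winds sp back to the caller's first stack argument. That is why the offset table in the comments above starts at #0: col.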

File diff suppressed because it is too large

@@ -815,22 +815,10 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t
                  const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
                  int32_t *multiplier, ConvParameter *conv_param, int32_t *filter_zp) {
   int is_per_oc = (int)conv_param->conv_quant_arg_.filter_arg_num_ != 1;
-#ifdef ENABLE_ARM32
-  MatmulInt8Neon32Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
-                      conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
-                      conv_param->output_channel_, is_per_oc, filter_zp);
-#elif ENABLE_ARM64
-  MatmulInt8Neon64Opt(packed_input, packed_weight, dst, UP_ROUND(row, C4NUM), UP_ROUND(col, C4NUM), deep16, input_sum,
-                      bias, conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift, row,
-                      col, conv_param->output_channel_, is_per_oc, filter_zp);
-#else
   MatmulInt8Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
                 conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
                 conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
                 conv_param->output_channel_, is_per_oc, filter_zp);
-#endif
   return;
 }
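For context on the long argument lists above: multiplier, left_shift, right_shift, out_zp and the activation bounds implement the usual gemmlowp-style fixed-point requantization that every variant of this kernel applies to its int32 accumulators. A minimal C sketch of the per-element step, with illustrative helper names (not the project's own) and assuming non-negative shift exponents:

```c
#include <stdint.h>

/* gemmlowp-style saturating rounding doubling high multiply (sketch) */
static int32_t SatRoundDoublingHighMul(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; /* the one overflow case */
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = (ab >= 0) ? (1LL << 30) : (1 - (1LL << 30));
  return (int32_t)((ab + nudge) >> 31);
}

/* rounding (round-half-away-from-zero) division by 2^exponent */
static int32_t RoundDivByPOT(int32_t x, int exponent) {
  if (exponent <= 0) return x;
  int32_t mask = (1 << exponent) - 1;
  int32_t rem = x & mask;
  int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((rem > threshold) ? 1 : 0);
}

/* acc: one raw int32 dot product; values are assumed pre-scaled so that
 * acc << left_shift stays in int32 range, as the quantizer arranges in practice */
static int8_t RequantizeOne(int32_t acc, int32_t bias, int32_t multiplier,
                            int32_t left_shift, int32_t right_shift, int32_t out_zp,
                            int32_t act_min, int32_t act_max) {
  acc += bias;
  acc = SatRoundDoublingHighMul(acc * (1 << left_shift), multiplier);
  acc = RoundDivByPOT(acc, right_shift);
  acc += out_zp;                        /* output zero point */
  if (acc < act_min) acc = act_min;     /* fused activation clamp */
  if (acc > act_max) acc = act_max;
  return (int8_t)acc;
}
```

In per-channel mode (is_per_oc set) the multiplier and shift arrays are expected to be indexed by output channel; in per-layer mode only element 0 is used.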

@@ -253,7 +253,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 #ifndef ENABLE_ARM
 void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
                    const int *bias, int mini, int maxi, int out_zp, int32_t *multiplier, int32_t *left_shift,
-                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp) {
+                   int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp) {
   int col_tile = C4NUM;
   /* support per-layer && weight per-channel */
   /* row4x16-major * row16x2-major => (int8)row-major*/
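The two comments above ("per-layer && weight per-channel", "row4x16-major * row16x2-major") come down to a weight-zero-point correction applied to the raw dot products via a_sums and filter_zp. A plain, untiled reference of that accumulation stage, ignoring the row4x16 packing and treating the a_sums precomputation convention as an assumption:

```c
#include <stdint.h>

/* Reference accumulation with weight-zero-point correction (sketch only).
 * Assumes a: row-major [row][deep16], b: [col][deep16], both already zero-padded.
 * Assumption: in per-layer mode, a_sums[r] is precomputed as row_sum(a, r) * filter_zp,
 * so the same subtraction works for both modes. */
void MatmulInt8AccRef(const int8_t *a, const int8_t *b, int32_t *acc, int row, int col,
                      int deep16, const int *a_sums, int filter_peroc,
                      const int32_t *filter_zp) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      int32_t sum = 0;
      for (int d = 0; d < deep16; ++d) {
        sum += (int32_t)a[r * deep16 + d] * (int32_t)b[c * deep16 + d];
      }
      /* remove the weight zero point's contribution: zp_w * sum(a row) */
      sum -= filter_peroc ? a_sums[r] * filter_zp[c] : a_sums[r];
      acc[r * col + c] = sum;
    }
  }
}
```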

@@ -62,7 +62,7 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
                        size_t per_channel, int32_t *filter_zp);
 void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
                    const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
-                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);
+                   int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp);
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
@@ -71,18 +71,11 @@ void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, i
 void MatMulR4Int8Neon64(const int8_t *a, const int8_t *b, int32_t *dst, int row4, int col4, int deep16,
                         const int *input_sum, const int *bias);
-void MatmulInt8Neon64Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16,
-                         const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
-                         int32_t *left_shift, int32_t *right_shift, int row, int col, int stride, int filter_peroc,
-                         int32_t *filter_zp);
 #endif
 #ifdef ENABLE_ARM32
 void MatmulInt8Neon32(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
                       const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
                       int *multiplier, int *left_shift, int *right_shift, int stride, int per_channel);
-void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
-                         const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
-                         int32_t *left_shift, int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);
 #endif
 #ifdef __cplusplus
 }
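With the per-ISA Opt declarations removed, every build, C fallback or NEON assembly, exports the single MatmulInt8Opt symbol with the same parameter list, which is what lets Conv1x1Int8 above drop its #ifdef dispatch. A hypothetical call site, with packing and sum/bias preparation omitted and all names illustrative:

```c
/* inputs must already be packed into the row4x16 / col-tiled layouts the kernels expect */
MatmulInt8Opt(packed_a, packed_b, dst, row, col, deep16, a_sums, bias,
              act_min, act_max, out_zp, multiplier, left_shift, right_shift,
              /*stride=*/output_channel, /*filter_peroc=*/1, filter_zp);
```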
