optimization for int8 matmul on arm64

pull/9048/head
lixian 4 years ago
parent 5ac54edfee
commit 7126bea143

@@ -3,9 +3,9 @@
 .text
 .align 5
-.global MatmulInt8Neon32Opt
+.global MatmulInt8Opt
 #ifndef __APPLE__
-.type MatmulInt8Neon32Opt, %function
+.type MatmulInt8Opt, %function
 #endif
 //void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
@@ -16,7 +16,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp
-MatmulInt8Neon32Opt:
+MatmulInt8Opt:
     push {r0-r11, lr}
     vpush {q4-q7}
     add sp, sp, #116
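For orientation in this AArch32 prologue: under the AAPCS the first four arguments (a, b, dst, row) arrive in r0-r3 and everything else is passed on the stack. `push {r0-r11, lr}` saves 13 four-byte core registers (52 bytes) and `vpush {q4-q7}` saves four 16-byte NEON registers (64 bytes), 116 bytes in total, so `add sp, sp, #116` winds sp back to the caller's first stack argument. That is why the offset table in the comments above starts at #0: col.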

File diff suppressed because it is too large

@@ -815,22 +815,10 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t
                  const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
                  int32_t *multiplier, ConvParameter *conv_param, int32_t *filter_zp) {
   int is_per_oc = (int)conv_param->conv_quant_arg_.filter_arg_num_ != 1;
-#ifdef ENABLE_ARM32
-  MatmulInt8Neon32Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
-                      conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
-                      conv_param->output_channel_, is_per_oc, filter_zp);
-#elif ENABLE_ARM64
-  MatmulInt8Neon64Opt(packed_input, packed_weight, dst, UP_ROUND(row, C4NUM), UP_ROUND(col, C4NUM), deep16, input_sum,
-                      bias, conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
-                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift, row,
-                      col, conv_param->output_channel_, is_per_oc, filter_zp);
-#else
   MatmulInt8Opt(packed_input, packed_weight, dst, row, col, deep16, input_sum, bias,
                 conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
                 conv_param->conv_quant_arg_.output_quant_args_[0].zp_, multiplier, left_shift, right_shift,
                 conv_param->output_channel_, is_per_oc, filter_zp);
-#endif
   return;
 }
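For context on the long argument lists above: multiplier, left_shift, right_shift, out_zp and the activation bounds implement the usual gemmlowp-style fixed-point requantization that every variant of this kernel applies to its int32 accumulators. A minimal C sketch of the per-element step, with illustrative helper names (not the project's own) and assuming non-negative shift exponents:

```c
#include <stdint.h>

/* gemmlowp-style saturating rounding doubling high multiply (sketch) */
static int32_t SatRoundDoublingHighMul(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; /* the one overflow case */
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = (ab >= 0) ? (1LL << 30) : (1 - (1LL << 30));
  return (int32_t)((ab + nudge) >> 31);
}

/* rounding (round-half-away-from-zero) division by 2^exponent */
static int32_t RoundDivByPOT(int32_t x, int exponent) {
  if (exponent <= 0) return x;
  int32_t mask = (1 << exponent) - 1;
  int32_t rem = x & mask;
  int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((rem > threshold) ? 1 : 0);
}

/* acc: one raw int32 dot product; values are assumed pre-scaled so that
 * acc << left_shift stays in int32 range, as the quantizer arranges in practice */
static int8_t RequantizeOne(int32_t acc, int32_t bias, int32_t multiplier,
                            int32_t left_shift, int32_t right_shift, int32_t out_zp,
                            int32_t act_min, int32_t act_max) {
  acc += bias;
  acc = SatRoundDoublingHighMul(acc * (1 << left_shift), multiplier);
  acc = RoundDivByPOT(acc, right_shift);
  acc += out_zp;                        /* output zero point */
  if (acc < act_min) acc = act_min;     /* fused activation clamp */
  if (acc > act_max) acc = act_max;
  return (int8_t)acc;
}
```

In per-channel mode (is_per_oc set) the multiplier and shift arrays are expected to be indexed by output channel; in per-layer mode only element 0 is used.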

@@ -253,7 +253,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 #ifndef ENABLE_ARM
 void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
                    const int *bias, int mini, int maxi, int out_zp, int32_t *multiplier, int32_t *left_shift,
-                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp) {
+                   int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp) {
   int col_tile = C4NUM;
   /* support per-layer && weight per-channel */
   /* row4x16-major * row16x2-major => (int8)row-major*/
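The two comments above ("per-layer && weight per-channel", "row4x16-major * row16x2-major") come down to a weight-zero-point correction applied to the raw dot products via a_sums and filter_zp. A plain, untiled reference of that accumulation stage, ignoring the row4x16 packing and treating the a_sums precomputation convention as an assumption:

```c
#include <stdint.h>

/* Reference accumulation with weight-zero-point correction (sketch only).
 * Assumes a: row-major [row][deep16], b: [col][deep16], both already zero-padded.
 * Assumption: in per-layer mode, a_sums[r] is precomputed as row_sum(a, r) * filter_zp,
 * so the same subtraction works for both modes. */
void MatmulInt8AccRef(const int8_t *a, const int8_t *b, int32_t *acc, int row, int col,
                      int deep16, const int *a_sums, int filter_peroc,
                      const int32_t *filter_zp) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      int32_t sum = 0;
      for (int d = 0; d < deep16; ++d) {
        sum += (int32_t)a[r * deep16 + d] * (int32_t)b[c * deep16 + d];
      }
      /* remove the weight zero point's contribution: zp_w * sum(a row) */
      sum -= filter_peroc ? a_sums[r] * filter_zp[c] : a_sums[r];
      acc[r * col + c] = sum;
    }
  }
}
```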

@@ -62,7 +62,7 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
                        size_t per_channel, int32_t *filter_zp);
 void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
                    const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
-                   int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);
+                   int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp);
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
@@ -71,18 +71,11 @@ void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, i
 void MatMulR4Int8Neon64(const int8_t *a, const int8_t *b, int32_t *dst, int row4, int col4, int deep16,
                         const int *input_sum, const int *bias);
-void MatmulInt8Neon64Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16,
-                         const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
-                         int32_t *left_shift, int32_t *right_shift, int row, int col, int stride, int filter_peroc,
-                         int32_t *filter_zp);
 #endif
 #ifdef ENABLE_ARM32
 void MatmulInt8Neon32(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
                       const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
                       int *multiplier, int *left_shift, int *right_shift, int stride, int per_channel);
-void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
-                         const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier,
-                         int32_t *left_shift, int32_t *right_shift, int stride, int filter_peroc, int32_t *filter_zp);
 #endif
 #ifdef __cplusplus
 }
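With the per-ISA Opt declarations removed, every build, C fallback or NEON assembly, exports the single MatmulInt8Opt symbol with the same parameter list, which is what lets Conv1x1Int8 above drop its #ifdef dispatch. A hypothetical call site, with packing and sum/bias preparation omitted and all names illustrative:

```c
/* inputs must already be packed into the row4x16 / col-tiled layouts the kernels expect */
MatmulInt8Opt(packed_a, packed_b, dst, row, col, deep16, a_sums, bias,
              act_min, act_max, out_zp, multiplier, left_shift, right_shift,
              /*stride=*/output_channel, /*filter_peroc=*/1, filter_zp);
```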
