From 236c8de5dad06daa23526dd190885ed4d2a37d0f Mon Sep 17 00:00:00 2001
From: ling
Date: Sat, 12 Sep 2020 15:56:31 +0800
Subject: [PATCH] [MSLITE][Develop] conv1x1 arm32 filter per-channel quantization

---
 mindspore/lite/nnacl/int8/conv_int8.c         | 10 +++++
 mindspore/lite/nnacl/int8/conv_int8.h         |  3 ++
 mindspore/lite/nnacl/int8/matmul_int8.c       | 40 +++++++++++++++++--
 mindspore/lite/nnacl/int8/matmul_int8.h       |  4 ++
 mindspore/lite/nnacl/pack.c                   | 28 +++++++++++++
 .../kernel/arm/int8/convolution_1x1_int8.cc   | 26 ++++++------
 .../kernel/arm/int8/convolution_1x1_int8.h    |  2 +-
 7 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c
index bc6f3c3143..33e88fe5dc 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_int8.c
@@ -735,6 +735,16 @@ void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int
   return;
 }
 
+void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
+                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
+                      int32_t *multiplier, ConvParameter *conv_param) {
+  MatMulInt8_4x2_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
+                   left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
+                   conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
+                   conv_param->conv_quant_arg_.filter_arg_num_ != 1);
+  return;
+}
+
 void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                  const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
                  int32_t *multiplier, ConvParameter *conv_param) {
diff --git a/mindspore/lite/nnacl/int8/conv_int8.h b/mindspore/lite/nnacl/int8/conv_int8.h
index 60d84e27f7..f5a1944fb1 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.h
+++ b/mindspore/lite/nnacl/int8/conv_int8.h
@@ -64,6 +64,9 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t
 void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                     const int32_t *bias, int row, int col, int deep4, int32_t *left_shift, int32_t *right_shift,
                     int32_t *multiplier, ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func);
+void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
+                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
+                      int32_t *multiplier, ConvParameter *conv_param);
 
 // int8 convolution 3x3
 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c
index 33da1b4ed7..488bc9693e 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@@ -46,12 +46,12 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int col16 = UP_ROUND(col, C16NUM);
   for (int r = 0; r < row; r++) {
-    int rd4 = r / C2NUM;
-    int rm4 = r % C2NUM;
+    int rd2 = r / C2NUM;
+    int rm2 = r % C2NUM;
     for (int c = 0; c < col; c++) {
       int cd16 = c / C16NUM;
       int cm16 = c % C16NUM;
-      int dst_index = rd4 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm4 * C16NUM + cm16;
+      int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16;
       int src_index = r * col + c;
       dst_ptr[dst_index] = src_ptr[src_index];
     }
@@ -232,6 +232,40 @@ void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
   return;
 }
 
+void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
+                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
+                      bool peroc) {
+  /* supports both per-layer and per-channel weight quantization */
+  /* row4x16-major * row16x2-major => (int8)row-major */
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r4div = r / C4NUM, r4mod = r % C4NUM;
+      int c2div = c / C2NUM, c2mod = c % C2NUM;
+      size_t ci = r * stride + c;
+      int32_t value = 0;
+      for (int d = 0; d < deep_16; d++) {
+        int d16div = d / C16NUM, d16mod = d % C16NUM;
+        size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
+        size_t bi = c2div * deep_16 * C2NUM + d16div * C2NUM * C16NUM + c2mod * C16NUM + d16mod;
+        value = value + a[ai] * b[bi];
+      }
+      int32_t cur_input_sum =
+        peroc ? input_sum[c2div * UP_ROUND(row, C4NUM) * C2NUM + r * C2NUM + c2mod] : input_sum[r];
+      value -= cur_input_sum;
+      value += bias[c];
+      int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0];
+      int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0];
+      int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0];
+      value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
+      value = MSMIN(maxi, value);
+      value = MSMAX(mini, value);
+      dst[ci] = (int8_t)value;
+    }
+  }
+  return;
+}
+
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
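MatMulInt8_4x2_r requantizes each int32 accumulator through MultiplyByQuantizedMultiplier before clamping to [mini, maxi]. That helper is not shown in this patch, so the following is a self-contained scalar sketch of the usual gemmlowp-style fixed-point step it is expected to perform, assuming the convention that left_shift is non-negative and right_shift is non-positive; all names below are local to the sketch, not the nnacl API:

  #include <stdint.h>

  /* Saturating rounded doubling high multiply: the high 32 bits of
   * a * b * 2, with rounding; the single overflowing case
   * INT32_MIN * INT32_MIN saturates to INT32_MAX. */
  static int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b) {
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
    int64_t ab = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return (int32_t)((ab + nudge) / (1ll << 31));
  }

  /* Divide by 2^exponent (exponent >= 0), rounding half away from zero;
   * assumes an arithmetic right shift for negative x. */
  static int32_t RoundingDivideByPOT(int32_t x, int exponent) {
    int32_t mask = (int32_t)((1ll << exponent) - 1);
    int32_t remainder = x & mask;
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
  }

  /* Model of the requantization call: value is pre-scaled by 2^left_shift,
   * multiplied by the Q31 multiplier, then scaled down by 2^(-right_shift). */
  static int32_t RequantizeSketch(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift) {
    return RoundingDivideByPOT(SatRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift);
  }

Adding output_zp to the returned value and applying the MSMIN/MSMAX clamp reproduces the per-element epilogue of MatMulInt8_4x2_r above.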
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h
index 11dd3a66c0..d2fd4c87f6 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@@ -52,6 +52,10 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums
                 int stride);
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 
+void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
+                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
+                      bool peroc);
 
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index a6e187c85d..583207cf83 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -404,14 +404,42 @@ void PackInputSum16x4PerChannel(const int8_t *input_value, int32_t *input_sum, i
   return;
 }
 
+void PackInputSum16x4PerChannelArm32(const int8_t *input_value, int32_t *input_sum, int32_t *filter_zp_ptr,
+                                     size_t plane_size, size_t input_channel, size_t output_channel) {
+  size_t hw4 = UP_ROUND(plane_size, C4NUM);
+  size_t ic16 = UP_ROUND(input_channel, C16NUM);
+
+  for (int ri = 0; ri < plane_size; ri++) {
+    int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
+    for (int ci = 0; ci < output_channel; ci++) {
+      int32_t tmp_sum_value = 0;
+      int ci2div = ci / C2NUM, ci2mod = ci % C2NUM;
+      int32_t filter_zp = filter_zp_ptr[ci];
+      for (int di = 0; di < input_channel; di++) {
+        size_t di16div = di / C16NUM, di16mod = di % C16NUM;
+        int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod;
+        tmp_sum_value += input_value[src_index];
+      }
+      int dst_index = ci2div * C2NUM * hw4 + ri * C2NUM + ci2mod;
+      input_sum[dst_index] = tmp_sum_value * filter_zp;
+    }
+  }
+  return;
+}
+
 void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, int32_t *filter_zp, ConvParameter *conv_param) {
   size_t hw4 = UP_ROUND(conv_param->input_h_ * conv_param->input_w_, C4NUM);
   size_t ic16 = UP_ROUND(conv_param->input_channel_, C16NUM);
   if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) {
     PackInputSum16x4PerLayer(input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16);
   } else {
+#ifdef ENABLE_ARM32
+    PackInputSum16x4PerChannelArm32(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
+                                    conv_param->input_channel_, conv_param->output_channel_);
+#else
     PackInputSum16x4PerChannel(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
                                conv_param->input_channel_, conv_param->output_channel_);
+#endif
   }
   return;
 }
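For per-channel quantization the ARM32 path stores one int32 per (pixel, output channel) pair, grouped in two-channel columns, so the dst_index written here matches the index MatMulInt8_4x2_r reads (hw4 equals UP_ROUND(row, C4NUM) there). A throwaway consistency check such as the one below, with toy shapes and locally redefined macros (not part of the patch), exercises that indexing against plain row-major sums:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define C2NUM 2
  #define C4NUM 4
  #define C16NUM 16
  #define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y))

  int main(void) {
    enum { plane = 5, ic = 3, oc = 3 };  /* toy 1x1-conv shapes */
    int8_t src[plane][ic];
    int32_t zp[oc] = {1, -2, 3};         /* per-channel filter zero points */
    int8_t packed[UP_ROUND(plane, C4NUM) * UP_ROUND(ic, C16NUM)];
    int32_t sum[UP_ROUND(oc, C2NUM) * UP_ROUND(plane, C4NUM)];
    const int hw4 = UP_ROUND(plane, C4NUM), ic16 = UP_ROUND(ic, C16NUM);

    memset(packed, 0, sizeof(packed));
    for (int r = 0; r < plane; r++) {
      for (int d = 0; d < ic; d++) {
        src[r][d] = (int8_t)(r * 7 + d - 3);
        /* same 16x4 input tiling the kernel consumes */
        packed[(r / C4NUM) * C4NUM * ic16 + (d / C16NUM) * C16NUM * C4NUM +
               (r % C4NUM) * C16NUM + d % C16NUM] = src[r][d];
      }
    }
    /* build the table exactly as PackInputSum16x4PerChannelArm32 does */
    for (int ri = 0; ri < plane; ri++) {
      for (int ci = 0; ci < oc; ci++) {
        int32_t s = 0;
        for (int di = 0; di < ic; di++) {
          s += packed[(ri / C4NUM) * C4NUM * ic16 + (di / C16NUM) * C16NUM * C4NUM +
                      (ri % C4NUM) * C16NUM + di % C16NUM];
        }
        sum[(ci / C2NUM) * C2NUM * hw4 + ri * C2NUM + ci % C2NUM] = s * zp[ci];
      }
    }
    /* read back with the MatMulInt8_4x2_r index and compare to row-major math */
    for (int r = 0; r < plane; r++) {
      for (int c = 0; c < oc; c++) {
        int32_t want = 0;
        for (int d = 0; d < ic; d++) want += src[r][d];
        want *= zp[c];
        int32_t got = sum[(c / C2NUM) * UP_ROUND(plane, C4NUM) * C2NUM + r * C2NUM + c % C2NUM];
        if (got != want) printf("mismatch r=%d c=%d: %d != %d\n", r, c, got, want);
      }
    }
    printf("input-sum layout check done\n");
    return 0;
  }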
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index 0acd324d2a..80147822b2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -175,7 +175,7 @@ int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
     MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, size);
+  memset(bias_data_, 0, col2 * sizeof(int32_t));
   if (in_tensors_.size() == 3) {
     memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
   }
@@ -249,16 +249,16 @@ int Convolution1x1Int8CPUKernel::InitParam() {
 
   /* init input sum size */
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    input_sum_size = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
+    input_sum_size_ = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
   } else {
-    input_sum_size = UP_ROUND(matmul_param_->row_, row_pack_count);
+    input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
   }
 
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, row_pack_count));
-  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, row_pack_count), thread_count_);
+  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_pack_count));
+  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_pack_count), thread_count_);
 
-  thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, col_pack_count));
-  thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, col_pack_count), thread_count_hw_);
+  thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_pack_count));
+  thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, row_pack_count), thread_count_hw_);
 
   if (pre_trans_input_) {
     input_ptr_ = reinterpret_cast<int8_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
@@ -269,7 +269,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
     memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t));
   }
   return RET_OK;
-}  // namespace mindspore::kernel
+}
 
 int Convolution1x1Int8CPUKernel::ReSize() {
   FreeResizeBuf();
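The InitParam hunk above fixes a real swap: the oc-dimension split must step in col_pack_count (C2NUM) blocks and the hw-dimension split in row_pack_count (C4NUM) blocks, not the other way around. Worked numbers for the corrected oc split, with illustrative values:

  #define MSMIN(a, b) ((a) < (b) ? (a) : (b))
  #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

  /* e.g. col_ = 21 output channels, col_pack_count = C2NUM = 2, thread_num_ = 4 */
  int oc_blocks = UP_DIV(21, 2);                        /* 11 two-channel blocks */
  int thread_count = MSMIN(4, oc_blocks);               /* 4 workers             */
  int thread_stride = UP_DIV(oc_blocks, thread_count);  /* 3 blocks per worker   */
  /* worker t covers channels [t * 3 * 2, min((t + 1) * 3 * 2, 21)):
     t=0 -> [0, 6), t=1 -> [6, 12), t=2 -> [12, 18), t=3 -> [18, 21) */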
@@ -314,10 +314,10 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
-              output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
-              reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
-              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
+  Conv1x1Int8Arm32(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
+                   output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
+                   reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_,
+                   cur_oc, matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
 #else
   if (support_optimize_) {
     int cur_stride = thread_stride_ * C8NUM;
@@ -392,7 +392,7 @@ int Convolution1x1Int8Impl(void *cdata, int task_id) {
 }
 
 int Convolution1x1Int8CPUKernel::InitRunBuf() {
-  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size * sizeof(int32_t)));
+  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size_ * sizeof(int32_t)));
   if (input_sum_ == nullptr) {
     MS_LOG(ERROR) << "malloc input_sum_ failed.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
index d8a57f2439..42ca9b972c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@@ -69,7 +69,7 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
   size_t thread_count_hw_ = 1;
   size_t thread_stride_hw_ = 0;
   bool pre_trans_input_ = false;
-  size_t input_sum_size = 0;
+  size_t input_sum_size_ = 0;
   MatMulParameter *matmul_param_ = nullptr;
   MATMUL_OPT_R_FUNC matmul_func_ = nullptr;
   bool support_optimize_ = false;
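In the ARM32 branch of RunImpl, each task offsets the packed weights, bias, and output by thread_stride_ * C2NUM output channels. The sketch below walks those offsets with plain variables standing in for the kernel fields; the cur_oc clamp is an assumption carried over from the surrounding kernel code rather than a line shown in this patch:

  #include <stdio.h>

  #define C2NUM 2
  #define MSMIN(a, b) ((a) < (b) ? (a) : (b))

  int main(void) {
    int col = 21, deep16 = 32, thread_stride = 3;  /* illustrative values */
    for (int task_id = 0; task_id < 4; task_id++) {
      int oc_offset = task_id * thread_stride * C2NUM;
      int cur_oc = MSMIN(thread_stride * C2NUM, col - oc_offset);
      if (cur_oc <= 0) continue;  /* the kernel returns RET_OK for empty slices */
      /* packed_weight_ advances by whole deep16-deep column strips */
      int weight_offset = oc_offset * deep16;
      printf("task %d: channels [%d, %d), weight offset %d\n",
             task_id, oc_offset, oc_offset + cur_oc, weight_offset);
    }
    return 0;
  }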