From 236c8de5dad06daa23526dd190885ed4d2a37d0f Mon Sep 17 00:00:00 2001
From: ling
Date: Sat, 12 Sep 2020 15:56:31 +0800
Subject: [PATCH] [MSLITE][Develop] conv1x1 arm32 filter per-channel quantization

---
 mindspore/lite/nnacl/int8/conv_int8.c         | 10 +++++
 mindspore/lite/nnacl/int8/conv_int8.h         |  3 ++
 mindspore/lite/nnacl/int8/matmul_int8.c       | 40 +++++++++++++++++--
 mindspore/lite/nnacl/int8/matmul_int8.h       |  4 ++
 mindspore/lite/nnacl/pack.c                   | 28 +++++++++++++
 .../kernel/arm/int8/convolution_1x1_int8.cc   | 26 ++++++------
 .../kernel/arm/int8/convolution_1x1_int8.h    |  2 +-
 7 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c
index bc6f3c3143..33e88fe5dc 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_int8.c
@@ -735,6 +735,16 @@ void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int
   return;
 }
 
+void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
+                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
+                      int32_t *multiplier, ConvParameter *conv_param) {
+  MatMulInt8_4x2_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
+                   left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
+                   conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
+                   conv_param->conv_quant_arg_.filter_arg_num_ != 1);
+  return;
+}
+
 void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                  const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
                  int32_t *multiplier, ConvParameter *conv_param) {
diff --git a/mindspore/lite/nnacl/int8/conv_int8.h b/mindspore/lite/nnacl/int8/conv_int8.h
index 60d84e27f7..f5a1944fb1 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.h
+++ b/mindspore/lite/nnacl/int8/conv_int8.h
@@ -64,6 +64,9 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t
 void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                     const int32_t *bias, int row, int col, int deep4, int32_t *left_shift, int32_t *right_shift,
                     int32_t *multiplier, ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func);
+void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
+                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
+                      int32_t *multiplier, ConvParameter *conv_param);
 
 // int8 convolution 3x3
 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c
index 33da1b4ed7..488bc9693e 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@@ -46,12 +46,12 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int col16 = UP_ROUND(col, C16NUM);
   for (int r = 0; r < row; r++) {
-    int rd4 = r / C2NUM;
-    int rm4 = r % C2NUM;
+    int rd2 = r / C2NUM;
+    int rm2 = r % C2NUM;
     for (int c = 0; c < col; c++) {
       int cd16 = c / C16NUM;
       int cm16 = c % C16NUM;
-      int dst_index = rd4 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm4 * C16NUM + cm16;
+      int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16;
       int src_index = r * col + c;
       dst_ptr[dst_index] = src_ptr[src_index];
     }
@@ -232,6 +232,40 @@ void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
   return;
 }
 
+void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
+                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
+                      bool peroc) {
+  /* supports both per-layer and per-channel weight quantization */
+  /* row4x16-major * row16x2-major => (int8)row-major */
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r4div = r / C4NUM, r4mod = r % C4NUM;
+      int c2div = c / C2NUM, c2mod = c % C2NUM;
+      size_t ci = r * stride + c;
+      int32_t value = 0;
+      for (int d = 0; d < deep_16; d++) {
+        int d16div = d / C16NUM, d16mod = d % C16NUM;
+        size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
+        size_t bi = c2div * deep_16 * C2NUM + d16div * C2NUM * C16NUM + c2mod * C16NUM + d16mod;
+        value = value + a[ai] * b[bi];
+      }
+      int32_t cur_input_sum =
+        peroc ? input_sum[c2div * UP_ROUND(row, C4NUM) * C2NUM + r * C2NUM + c2mod] : input_sum[r];
+      value -= cur_input_sum;
+      value += bias[c];
+      int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0];
+      int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0];
+      int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0];
+      value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
+      value = MSMIN(maxi, value);
+      value = MSMAX(mini, value);
+      dst[ci] = (int8_t)value;
+    }
+  }
+  return;
+}
+
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
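MatMulInt8_4x2_r requantizes each int32 accumulator through MultiplyByQuantizedMultiplier before clamping to [mini, maxi]. That helper is not shown in this patch, so the following is a self-contained scalar sketch of the usual gemmlowp-style fixed-point step it is expected to perform, assuming the convention that left_shift is non-negative and right_shift is non-positive; all names below are local to the sketch, not the nnacl API:

  #include <stdint.h>

  /* Saturating rounded doubling high multiply: the high 32 bits of
   * a * b * 2, with rounding; the single overflowing case
   * INT32_MIN * INT32_MIN saturates to INT32_MAX. */
  static int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b) {
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
    int64_t ab = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return (int32_t)((ab + nudge) / (1ll << 31));
  }

  /* Divide by 2^exponent (exponent >= 0), rounding half away from zero;
   * assumes an arithmetic right shift for negative x. */
  static int32_t RoundingDivideByPOT(int32_t x, int exponent) {
    int32_t mask = (int32_t)((1ll << exponent) - 1);
    int32_t remainder = x & mask;
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
  }

  /* Model of the requantization call: value is pre-scaled by 2^left_shift,
   * multiplied by the Q31 multiplier, then scaled down by 2^(-right_shift). */
  static int32_t RequantizeSketch(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift) {
    return RoundingDivideByPOT(SatRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift);
  }

Adding output_zp to the returned value and applying the MSMIN/MSMAX clamp reproduces the per-element epilogue of MatMulInt8_4x2_r above.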
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h
index 11dd3a66c0..d2fd4c87f6 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@@ -52,6 +52,10 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums
                 int stride);
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 
+void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
+                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
+                      bool peroc);
 
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index a6e187c85d..583207cf83 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -404,14 +404,42 @@ void PackInputSum16x4PerChannel(const int8_t *input_value, int32_t *input_sum, i
   return;
 }
 
+void PackInputSum16x4PerChannelArm32(const int8_t *input_value, int32_t *input_sum, int32_t *filter_zp_ptr,
+                                     size_t plane_size, size_t input_channel, size_t output_channel) {
+  size_t hw4 = UP_ROUND(plane_size, C4NUM);
+  size_t ic16 = UP_ROUND(input_channel, C16NUM);
+
+  for (int ri = 0; ri < plane_size; ri++) {
+    int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
+    for (int ci = 0; ci < output_channel; ci++) {
+      int32_t tmp_sum_value = 0;
+      int ci2div = ci / C2NUM, ci2mod = ci % C2NUM;
+      int32_t filter_zp = filter_zp_ptr[ci];
+      for (int di = 0; di < input_channel; di++) {
+        size_t di16div = di / C16NUM, di16mod = di % C16NUM;
+        int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod;
+        tmp_sum_value += input_value[src_index];
+      }
+      int dst_index = ci2div * C2NUM * hw4 + ri * C2NUM + ci2mod;
+      input_sum[dst_index] = tmp_sum_value * filter_zp;
+    }
+  }
+  return;
+}
+
 void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, int32_t *filter_zp, ConvParameter *conv_param) {
   size_t hw4 = UP_ROUND(conv_param->input_h_ * conv_param->input_w_, C4NUM);
   size_t ic16 = UP_ROUND(conv_param->input_channel_, C16NUM);
   if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) {
     PackInputSum16x4PerLayer(input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16);
   } else {
+#ifdef ENABLE_ARM32
+    PackInputSum16x4PerChannelArm32(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
+                                    conv_param->input_channel_, conv_param->output_channel_);
+#else
     PackInputSum16x4PerChannel(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
                                conv_param->input_channel_, conv_param->output_channel_);
+#endif
   }
   return;
 }
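For per-channel quantization the ARM32 path stores one int32 per (pixel, output channel) pair, grouped in two-channel columns, so the dst_index written here matches the index MatMulInt8_4x2_r reads (hw4 equals UP_ROUND(row, C4NUM) there). A throwaway consistency check such as the one below, with toy shapes and locally redefined macros (not part of the patch), exercises that indexing against plain row-major sums:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define C2NUM 2
  #define C4NUM 4
  #define C16NUM 16
  #define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y))

  int main(void) {
    enum { plane = 5, ic = 3, oc = 3 };  /* toy 1x1-conv shapes */
    int8_t src[plane][ic];
    int32_t zp[oc] = {1, -2, 3};         /* per-channel filter zero points */
    int8_t packed[UP_ROUND(plane, C4NUM) * UP_ROUND(ic, C16NUM)];
    int32_t sum[UP_ROUND(oc, C2NUM) * UP_ROUND(plane, C4NUM)];
    const int hw4 = UP_ROUND(plane, C4NUM), ic16 = UP_ROUND(ic, C16NUM);

    memset(packed, 0, sizeof(packed));
    for (int r = 0; r < plane; r++) {
      for (int d = 0; d < ic; d++) {
        src[r][d] = (int8_t)(r * 7 + d - 3);
        /* same 16x4 input tiling the kernel consumes */
        packed[(r / C4NUM) * C4NUM * ic16 + (d / C16NUM) * C16NUM * C4NUM +
               (r % C4NUM) * C16NUM + d % C16NUM] = src[r][d];
      }
    }
    /* build the table exactly as PackInputSum16x4PerChannelArm32 does */
    for (int ri = 0; ri < plane; ri++) {
      for (int ci = 0; ci < oc; ci++) {
        int32_t s = 0;
        for (int di = 0; di < ic; di++) {
          s += packed[(ri / C4NUM) * C4NUM * ic16 + (di / C16NUM) * C16NUM * C4NUM +
                      (ri % C4NUM) * C16NUM + di % C16NUM];
        }
        sum[(ci / C2NUM) * C2NUM * hw4 + ri * C2NUM + ci % C2NUM] = s * zp[ci];
      }
    }
    /* read back with the MatMulInt8_4x2_r index and compare to row-major math */
    for (int r = 0; r < plane; r++) {
      for (int c = 0; c < oc; c++) {
        int32_t want = 0;
        for (int d = 0; d < ic; d++) want += src[r][d];
        want *= zp[c];
        int32_t got = sum[(c / C2NUM) * UP_ROUND(plane, C4NUM) * C2NUM + r * C2NUM + c % C2NUM];
        if (got != want) printf("mismatch r=%d c=%d: %d != %d\n", r, c, got, want);
      }
    }
    printf("input-sum layout check done\n");
    return 0;
  }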
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index 0acd324d2a..80147822b2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -175,7 +175,7 @@ int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
     MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, size);
+  memset(bias_data_, 0, col2 * sizeof(int32_t));
   if (in_tensors_.size() == 3) {
     memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
   }
@@ -249,16 +249,16 @@ int Convolution1x1Int8CPUKernel::InitParam() {
 
   /* init input sum size */
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    input_sum_size = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
+    input_sum_size_ = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
   } else {
-    input_sum_size = UP_ROUND(matmul_param_->row_, row_pack_count);
+    input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
   }
 
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, row_pack_count));
-  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, row_pack_count), thread_count_);
+  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_pack_count));
+  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_pack_count), thread_count_);
 
-  thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, col_pack_count));
-  thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, col_pack_count), thread_count_hw_);
+  thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_pack_count));
+  thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, row_pack_count), thread_count_hw_);
 
   if (pre_trans_input_) {
     input_ptr_ = reinterpret_cast<int8_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
@@ -269,7 +269,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
     memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t));
   }
   return RET_OK;
-}  // namespace mindspore::kernel
+}
 
 int Convolution1x1Int8CPUKernel::ReSize() {
   FreeResizeBuf();
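The InitParam hunk above fixes a real swap: the oc-dimension split must step in col_pack_count (C2NUM) blocks and the hw-dimension split in row_pack_count (C4NUM) blocks, not the other way around. Worked numbers for the corrected oc split, with illustrative values:

  #define MSMIN(a, b) ((a) < (b) ? (a) : (b))
  #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

  /* e.g. col_ = 21 output channels, col_pack_count = C2NUM = 2, thread_num_ = 4 */
  int oc_blocks = UP_DIV(21, 2);                        /* 11 two-channel blocks */
  int thread_count = MSMIN(4, oc_blocks);               /* 4 workers             */
  int thread_stride = UP_DIV(oc_blocks, thread_count);  /* 3 blocks per worker   */
  /* worker t covers channels [t * 3 * 2, min((t + 1) * 3 * 2, 21)):
     t=0 -> [0, 6), t=1 -> [6, 12), t=2 -> [12, 18), t=3 -> [18, 21) */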
@@ -314,10 +314,10 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
-              output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
-              reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
-              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
+  Conv1x1Int8Arm32(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
+                   output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
+                   reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_,
+                   cur_oc, matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
 #else
   if (support_optimize_) {
     int cur_stride = thread_stride_ * C8NUM;
@@ -392,7 +392,7 @@ int Convolution1x1Int8Impl(void *cdata, int task_id) {
 }
 
 int Convolution1x1Int8CPUKernel::InitRunBuf() {
-  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size * sizeof(int32_t)));
+  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size_ * sizeof(int32_t)));
   if (input_sum_ == nullptr) {
     MS_LOG(ERROR) << "malloc input_sum_ failed.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
index d8a57f2439..42ca9b972c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@@ -69,7 +69,7 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
   size_t thread_count_hw_ = 1;
   size_t thread_stride_hw_ = 0;
   bool pre_trans_input_ = false;
-  size_t input_sum_size = 0;
+  size_t input_sum_size_ = 0;
   MatMulParameter *matmul_param_ = nullptr;
   MATMUL_OPT_R_FUNC matmul_func_ = nullptr;
   bool support_optimize_ = false;
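In the ARM32 branch of RunImpl, each task offsets the packed weights, bias, and output by thread_stride_ * C2NUM output channels. The sketch below walks those offsets with plain variables standing in for the kernel fields; the cur_oc clamp is an assumption carried over from the surrounding kernel code rather than a line shown in this patch:

  #include <stdio.h>

  #define C2NUM 2
  #define MSMIN(a, b) ((a) < (b) ? (a) : (b))

  int main(void) {
    int col = 21, deep16 = 32, thread_stride = 3;  /* illustrative values */
    for (int task_id = 0; task_id < 4; task_id++) {
      int oc_offset = task_id * thread_stride * C2NUM;
      int cur_oc = MSMIN(thread_stride * C2NUM, col - oc_offset);
      if (cur_oc <= 0) continue;  /* the kernel returns RET_OK for empty slices */
      /* packed_weight_ advances by whole deep16-deep column strips */
      int weight_offset = oc_offset * deep16;
      printf("task %d: channels [%d, %d), weight offset %d\n",
             task_id, oc_offset, oc_offset + cur_oc, weight_offset);
    }
    return 0;
  }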