From 7fd260c0ac4943d254577b662b51e1e710b56b6f Mon Sep 17 00:00:00 2001
From: zhanyuan <zhanyuan1@huawei.com>
Date: Thu, 17 Dec 2020 16:30:05 +0800
Subject: [PATCH] [MSLITE] Optimize fp32 conv 1x1 for arm v7a

---
 .../nnacl/assembly/arm32/MatmulFp32Opt12x4.S  | 160 +++++++++++++++++-
 .../kernel/arm/fp32/convolution_1x1_fp32.cc   | 101 +++++------
 .../kernel/arm/fp32/convolution_1x1_fp32.h    |   3 +
 3 files changed, 208 insertions(+), 56 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
index 03f0832923..bb765a7534 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
@@ -35,6 +35,12 @@ MatmulFloatNeon32Opt12x4:
     mov lr, #4
     mul r8, r8, lr // stride * sizeof(float)
 
+LoopRowStart: // choose a row kernel from r6, the lhs rows still to be processed
+    cmp r6, #4
+    ble LoopRow4 // r6 <= 4: 4-row kernel
+    cmp r6, #8
+    ble LoopRow8 // 4 < r6 <= 8: 8-row kernel, otherwise fall through to LoopRow
+
 LoopRow:
     ldr r1, [sp, #-44] // reload rhs ptr
     ldr r7, [sp, #12] // reload rhs col
@@ -142,6 +148,158 @@ LoopRow:
             vmax.f32 q13, q13, q3
             vmax.f32 q14, q14, q3
             vmax.f32 q15, q15, q3
+            b Write
+
+LoopRow8: // entered from LoopRowStart when 4 < r6 (remaining lhs rows) <= 8
+    ldr r1, [sp, #-44] // reload rhs ptr
+    ldr r7, [sp, #12] // reload rhs col
+    ldr r3, [sp, #-36] // reload bias ptr
+
+    LoopCol_R8: // builds one tile of 8 accumulators (q4-q11), 4 floats each
+        ldr r2, [sp, #-40] // reload dst ptr
+        ldr r0, [sp, #-48] // reload lhs ptr
+        ldr r5, [sp, #4] // reload depth
+        vld1.32 {q3}, [r1]! // q3 = 4 rhs floats, r1 += 16 bytes
+        vld1.32 {q0, q1}, [r0]! // q0,q1 = 8 lhs floats, r0 += 32 bytes
+        vmul.f32 q4, q3, d0[0] // first depth step uses vmul, so the accumulators need no zeroing
+        vmul.f32 q5, q3, d0[1]
+        vmul.f32 q6, q3, d1[0]
+        vld1.32 {q2}, [r0]! // r0 += 16 bytes (12 lhs floats per depth step); q2 is not read by the multiplies
+        vmul.f32 q7, q3, d1[1]
+
+        vmul.f32 q8, q3, d2[0] // q8-q11 use the lanes of q1 (d2, d3)
+        vmul.f32 q9, q3, d2[1]
+        vmul.f32 q10, q3, d3[0]
+        vmul.f32 q11, q3, d3[1]
+
+        subs r5, r5, #1 // one depth step consumed
+        beq Bias_R8 // depth was 1: skip the accumulate loop
+
+        LoopDepth_R8: // same load pattern as above, accumulating with vmla
+            vld1.32 {q3}, [r1]!
+            vld1.32 {q0, q1}, [r0]!
+            vmla.f32 q4, q3, d0[0]
+            vmla.f32 q5, q3, d0[1]
+            vmla.f32 q6, q3, d1[0]
+            vld1.32 {q2}, [r0]! // advances r0 only; q2 is not read by the multiplies
+            vmla.f32 q7, q3, d1[1]
+
+            vmla.f32 q8, q3, d2[0]
+            vmla.f32 q9, q3, d2[1]
+            vmla.f32 q10, q3, d3[0]
+            vmla.f32 q11, q3, d3[1]
+
+            subs r5, r5, #1
+            bne LoopDepth_R8 // loop until the depth counter reaches 0
+
+        Bias_R8:
+            cmp r3, #0 // null bias ptr means no bias add
+            beq Activation_R8
+            vld1.32 {q0}, [r3]! // q0 = 4 bias floats, r3 += 16 bytes
+            vadd.f32 q4, q4, q0 // the same 4 bias floats are added to every accumulator
+            vadd.f32 q5, q5, q0
+            vadd.f32 q6, q6, q0
+            vadd.f32 q7, q7, q0
+            vadd.f32 q8, q8, q0
+            vadd.f32 q9, q9, q0
+            vadd.f32 q10, q10, q0
+            vadd.f32 q11, q11, q0
+
+        Activation_R8:
+            ldr lr, [sp] // activation selector: 3 -> Relu6, 1 -> Relu, anything else -> none
+            cmp lr, #3
+            beq Relu6_R8
+            cmp lr, #1
+            beq Relu_R8
+            b Write // no activation requested
+
+        Relu6_R8:
+            vmov.i32 q2, #6 // integer 6 in every lane
+            vcvt.f32.s32 q2, q2 // q2 = 6.0f in every lane
+            vmin.f32 q4, q4, q2 // upper clamp at 6
+            vmin.f32 q5, q5, q2
+            vmin.f32 q6, q6, q2
+            vmin.f32 q7, q7, q2
+            vmin.f32 q8, q8, q2
+            vmin.f32 q9, q9, q2
+            vmin.f32 q10, q10, q2
+            vmin.f32 q11, q11, q2
+
+        Relu_R8: // Relu6_R8 falls through to here for the lower clamp
+            veor q3, q3, q3 // q3 = 0
+            vmax.f32 q4, q4, q3 // lower clamp at 0
+            vmax.f32 q5, q5, q3
+            vmax.f32 q6, q6, q3
+            vmax.f32 q7, q7, q3
+            vmax.f32 q8, q8, q3
+            vmax.f32 q9, q9, q3
+            vmax.f32 q10, q10, q3
+            vmax.f32 q11, q11, q3
+            b Write // store code at Write is shared with the other row kernels
+
+LoopRow4: // entered from LoopRowStart when r6 (remaining lhs rows) <= 4
+    ldr r1, [sp, #-44] // reload rhs ptr
+    ldr r7, [sp, #12] // reload rhs col
+    ldr r3, [sp, #-36] // reload bias ptr
+
+    LoopCol_R4: // builds one tile of 4 accumulators (q4-q7), 4 floats each
+        ldr r2, [sp, #-40] // reload dst ptr
+        ldr r0, [sp, #-48] // reload lhs ptr
+        ldr r5, [sp, #4] // reload depth
+        vld1.32 {q3}, [r1]! // q3 = 4 rhs floats, r1 += 16 bytes
+        vld1.32 {q0, q1}, [r0]! // r0 += 32 bytes; only the q0 lanes (d0, d1) are multiplied below
+        vmul.f32 q4, q3, d0[0] // first depth step uses vmul, so the accumulators need no zeroing
+        vmul.f32 q5, q3, d0[1]
+        vmul.f32 q6, q3, d1[0]
+        vld1.32 {q2}, [r0]! // r0 += 16 bytes (12 lhs floats per depth step); q2 is not read by the multiplies
+        vmul.f32 q7, q3, d1[1]
+
+        subs r5, r5, #1 // one depth step consumed
+        beq Bias_R4 // depth was 1: skip the accumulate loop
+
+        LoopDepth_R4: // same load pattern as above, accumulating with vmla
+            vld1.32 {q3}, [r1]!
+            vld1.32 {q0, q1}, [r0]!
+            vmla.f32 q4, q3, d0[0]
+            vmla.f32 q5, q3, d0[1]
+            vmla.f32 q6, q3, d1[0]
+            vld1.32 {q2}, [r0]! // advances r0 only; q2 is not read by the multiplies
+            vmla.f32 q7, q3, d1[1]
+
+            subs r5, r5, #1
+            bne LoopDepth_R4 // loop until the depth counter reaches 0
+
+        Bias_R4:
+            cmp r3, #0 // null bias ptr means no bias add
+            beq Activation_R4
+            vld1.32 {q0}, [r3]! // q0 = 4 bias floats, r3 += 16 bytes
+            vadd.f32 q4, q4, q0 // the same 4 bias floats are added to every accumulator
+            vadd.f32 q5, q5, q0
+            vadd.f32 q6, q6, q0
+            vadd.f32 q7, q7, q0
+
+        Activation_R4:
+            ldr lr, [sp] // activation selector: 3 -> Relu6, 1 -> Relu, anything else -> none
+            cmp lr, #3
+            beq Relu6_R4
+            cmp lr, #1
+            beq Relu_R4
+            b Write // no activation requested
+
+        Relu6_R4:
+            vmov.i32 q2, #6 // integer 6 in every lane
+            vcvt.f32.s32 q2, q2 // q2 = 6.0f in every lane
+            vmin.f32 q4, q4, q2 // upper clamp at 6
+            vmin.f32 q5, q5, q2
+            vmin.f32 q6, q6, q2
+            vmin.f32 q7, q7, q2
+
+        Relu_R4: // Relu6_R4 falls through to here for the lower clamp
+            veor q3, q3, q3 // q3 = 0
+            vmax.f32 q4, q4, q3 // lower clamp at 0
+            vmax.f32 q5, q5, q3
+            vmax.f32 q6, q6, q3
+            vmax.f32 q7, q7, q3 // execution falls through to Write
 
         Write:
             cmp r7, #1
@@ -398,7 +556,7 @@ LoopRow:
         cmp r6, #12
         ble LoopRowEnd
         sub r6, r6, #12 // lhs row - 12
-        b LoopRow
+        b LoopRowStart
 
 LoopRowEnd:
     sub sp, sp, #112
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
index 400c225bc3..e6a6b66db4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
@@ -62,6 +62,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
   matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
   matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
   matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
+  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
   matmul_param_->act_type_ = conv_param_->act_type_;
   return;
@@ -73,14 +74,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
   auto output_channel = filter_tensor->Batch();
 
 #ifdef ENABLE_AVX
-  int col_tile = C16NUM;
+  row_tile_ = C6NUM;
+  col_tile_ = C16NUM;
+#elif defined(ENABLE_SSE)
+  row_tile_ = C4NUM;
+  col_tile_ = C8NUM;
 #elif defined(ENABLE_ARM32)
-  int col_tile = C4NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C4NUM;
 #else
-  int col_tile = C8NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C8NUM;
 #endif
+
   if (in_tensors_.size() == 3) {
-    int size = UP_ROUND(output_channel, col_tile) * sizeof(float);
+    int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
     int weight_size = output_channel * sizeof(float);
     bias_data_ = malloc(size);
     if (bias_data_ == nullptr) {
@@ -91,8 +99,8 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
     memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  int size = input_channel * UP_ROUND(output_channel, col_tile) * sizeof(float);
-  int down_size = input_channel * DOWN_DIV(output_channel, col_tile) * col_tile * sizeof(float);
+  int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+  int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
@@ -113,27 +121,14 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
 }
 
 int Convolution1x1CPUKernel::InitConv1x1Param() {
-  int hw_tile = C12NUM;
-#ifdef ENABLE_AVX
-  hw_tile = C6NUM;
-#elif defined(ENABLE_SSE)
-  hw_tile = C4NUM;
-#endif
-  if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+  if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
     multi_thread_by_hw_ = true;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
   } else {
-#ifdef ENABLE_AVX
-    int col_tile = C16NUM;
-#elif defined(ENABLE_ARM32)
-    int col_tile = C4NUM;
-#else
-    int col_tile = C8NUM;
-#endif
     multi_thread_by_hw_ = false;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile), thread_count_) * col_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
   }
 
   pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
@@ -167,6 +162,16 @@ int Convolution1x1CPUKernel::Init() {
   return ReSize();
 }
 
+void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
+#ifdef ENABLE_AVX  // Packs src into dst with the layout paired with row_tile_; same test as the tile selection.
+  RowMajor2Col6Major(src_ptr, dst_ptr, row, col);  // row_tile_ == C6NUM
+#elif defined(ENABLE_SSE)
+  RowMajor2Col4Major(src_ptr, dst_ptr, row, col);  // row_tile_ == C4NUM
+#else
+  RowMajor2Col12Major(src_ptr, dst_ptr, row, col);  // row_tile_ == C12NUM (ARM32 and the default build)
+#endif
+}
+
 int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
   int res_stride = matmul_param_->col_ - task_id * thread_stride_;
   int cur_oc = MSMIN(thread_stride_, res_stride);
@@ -198,20 +203,20 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
   }
 
   float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
-  float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
-
-#if ENABLE_AVX
-  RowMajor2Col6Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-  RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#else
-  RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#endif
-
+  float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
   float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
-  MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
-            matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
-            OutType_Nhwc);
+  float *cur_input = thread_input_ptr;
+  float *cur_output = thread_output_ptr;
+  for (int i = 0; i < cur_hw_; i += row_tile_) {  // one slab of at most row_tile_ rows per pass
+    int cur_rows = MSMIN(row_tile_, cur_hw_ - i);  // the last slab may be shorter than row_tile_
+    PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul_param_->deep_);  // scratch tile is reused
+    MatMulOpt(thread_pack_input, weight_ptr_, cur_output, reinterpret_cast<float *>(bias_data_),
+              matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_,
+              OutType_Nhwc);
+    cur_input += row_tile_ * matmul_param_->deep_;  // next slab of input rows
+    cur_output += row_tile_ * matmul_param_->col_;  // matching slab of output rows
+  }
+
   return RET_OK;
 }
 
@@ -228,17 +233,9 @@ int Convolution1x1RunHw(void *cdata, int task_id) {
 int Convolution1x1CPUKernel::Run() {
   auto src_in = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
   auto src_out = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
-
-#ifdef ENABLE_AVX
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_6_ * matmul_param_->deep_ * sizeof(float)));
-#elif defined(ENABLE_SSE)
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_4_ * matmul_param_->deep_ * sizeof(float)));
-#else
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
-#endif
+  int pack_input_size = multi_thread_by_hw_ ? (thread_count_ * row_tile_ * matmul_param_->deep_)  // 1 tile/thread
+                                            : (matmul_param_->row_align_ * matmul_param_->deep_);  // all rows, padded
+  pack_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_input_size * sizeof(float)));
   if (pack_input_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
     return RET_MEMORY_FAILED;
@@ -256,13 +253,7 @@ int Convolution1x1CPUKernel::Run() {
     if (multi_thread_by_hw_) {
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
     } else {
-#ifdef ENABLE_AVX
-      RowMajor2Col6Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-      RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#else
-      RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#endif
+      PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
     }
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
index 342ca4c5b2..8594784fb8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
@@ -51,6 +51,7 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
   int InitConv1x1BiasWeight();
   void InitConv1x1MatmulParam();
   void FreeTmpBuffer();
+  void PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col);
 
  private:
   MatMulParameter *matmul_param_ = nullptr;
@@ -62,6 +63,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
   float *pack_input_ = nullptr;
   float *input_ptr_ = nullptr;
   float *output_ptr_ = nullptr;
+  int row_tile_ = 0;  // rows per packed input tile; assigned in InitConv1x1BiasWeight()
+  int col_tile_ = 0;  // output-channel tile used to pad weight and bias; assigned in InitConv1x1BiasWeight()
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_