!10216 [MSLITE] Optimize fp32 convolution 1x1

From: @zhanyuan1
Reviewed-by: @zhang_xue_tong, @hangangqiang
Signed-off-by: @zhang_xue_tong
pull/10216/MERGE
commit a1eee2a15d, committed by mindspore-ci-bot via Gitee

@@ -35,6 +35,12 @@ MatmulFloatNeon32Opt12x4:
mov lr, #4
mul r8, r8, lr // stride * sizeof(float)
+LoopRowStart:
+cmp r6, #4
+ble LoopRow4
+cmp r6, #8
+ble LoopRow8
LoopRow:
ldr r1, [sp, #-44] // reload rhs ptr
ldr r7, [sp, #12] // reload rhs col
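
The added LoopRowStart block dispatches on the number of output rows still to be processed (register r6): at most 4 rows branch to the new 4x4 micro-kernel, at most 8 to the new 8x4 one, and larger strips fall through to the original 12x4 LoopRow. A minimal C++ sketch of that control flow (function and variable names are illustrative, not from the source):

    // remaining_rows plays the role of register r6.
    void MatmulRowDispatch(int remaining_rows) {
      if (remaining_rows <= 4) {
        // LoopRow4: 4x4 output tile
      } else if (remaining_rows <= 8) {
        // LoopRow8: 8x4 output tile
      } else {
        // LoopRow: original 12x4 output tile
      }
    }
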
@@ -142,6 +148,158 @@ LoopRow:
vmax.f32 q13, q13, q3
vmax.f32 q14, q14, q3
vmax.f32 q15, q15, q3
+b Write
+LoopRow8:
+ldr r1, [sp, #-44] // reload rhs ptr
+ldr r7, [sp, #12] // reload rhs col
+ldr r3, [sp, #-36] // reload bias ptr
+LoopCol_R8:
+ldr r2, [sp, #-40] // reload dst ptr
+ldr r0, [sp, #-48] // reload lhs ptr
+ldr r5, [sp, #4] // reload depth
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmul.f32 q4, q3, d0[0]
+vmul.f32 q5, q3, d0[1]
+vmul.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmul.f32 q7, q3, d1[1]
+vmul.f32 q8, q3, d2[0]
+vmul.f32 q9, q3, d2[1]
+vmul.f32 q10, q3, d3[0]
+vmul.f32 q11, q3, d3[1]
+subs r5, r5, #1
+beq Bias_R8
+LoopDepth_R8:
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmla.f32 q4, q3, d0[0]
+vmla.f32 q5, q3, d0[1]
+vmla.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmla.f32 q7, q3, d1[1]
+vmla.f32 q8, q3, d2[0]
+vmla.f32 q9, q3, d2[1]
+vmla.f32 q10, q3, d3[0]
+vmla.f32 q11, q3, d3[1]
+subs r5, r5, #1
+bne LoopDepth_R8
+Bias_R8:
+cmp r3, #0
+beq Activation_R8
+vld1.32 {q0}, [r3]!
+vadd.f32 q4, q4, q0
+vadd.f32 q5, q5, q0
+vadd.f32 q6, q6, q0
+vadd.f32 q7, q7, q0
+vadd.f32 q8, q8, q0
+vadd.f32 q9, q9, q0
+vadd.f32 q10, q10, q0
+vadd.f32 q11, q11, q0
+Activation_R8:
+ldr lr, [sp]
+cmp lr, #3
+beq Relu6_R8
+cmp lr, #1
+beq Relu_R8
+b Write
+Relu6_R8:
+vmov.i32 q2, #6
+vcvt.f32.s32 q2, q2
+vmin.f32 q4, q4, q2
+vmin.f32 q5, q5, q2
+vmin.f32 q6, q6, q2
+vmin.f32 q7, q7, q2
+vmin.f32 q8, q8, q2
+vmin.f32 q9, q9, q2
+vmin.f32 q10, q10, q2
+vmin.f32 q11, q11, q2
+Relu_R8:
+veor q3, q3, q3
+vmax.f32 q4, q4, q3
+vmax.f32 q5, q5, q3
+vmax.f32 q6, q6, q3
+vmax.f32 q7, q7, q3
+vmax.f32 q8, q8, q3
+vmax.f32 q9, q9, q3
+vmax.f32 q10, q10, q3
+vmax.f32 q11, q11, q3
+b Write
+LoopRow4:
+ldr r1, [sp, #-44] // reload rhs ptr
+ldr r7, [sp, #12] // reload rhs col
+ldr r3, [sp, #-36] // reload bias ptr
+LoopCol_R4:
+ldr r2, [sp, #-40] // reload dst ptr
+ldr r0, [sp, #-48] // reload lhs ptr
+ldr r5, [sp, #4] // reload depth
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmul.f32 q4, q3, d0[0]
+vmul.f32 q5, q3, d0[1]
+vmul.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmul.f32 q7, q3, d1[1]
+subs r5, r5, #1
+beq Bias_R4
+LoopDepth_R4:
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmla.f32 q4, q3, d0[0]
+vmla.f32 q5, q3, d0[1]
+vmla.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmla.f32 q7, q3, d1[1]
+subs r5, r5, #1
+bne LoopDepth_R4
+Bias_R4:
+cmp r3, #0
+beq Activation_R4
+vld1.32 {q0}, [r3]!
+vadd.f32 q4, q4, q0
+vadd.f32 q5, q5, q0
+vadd.f32 q6, q6, q0
+vadd.f32 q7, q7, q0
+Activation_R4:
+ldr lr, [sp]
+cmp lr, #3
+beq Relu6_R4
+cmp lr, #1
+beq Relu_R4
+b Write
+Relu6_R4:
+vmov.i32 q2, #6
+vcvt.f32.s32 q2, q2
+vmin.f32 q4, q4, q2
+vmin.f32 q5, q5, q2
+vmin.f32 q6, q6, q2
+vmin.f32 q7, q7, q2
+Relu_R4:
+veor q3, q3, q3
+vmax.f32 q4, q4, q3
+vmax.f32 q5, q5, q3
+vmax.f32 q6, q6, q3
+vmax.f32 q7, q7, q3
Write:
cmp r7, #1
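
For reference, LoopRow8 computes an 8x4 output tile. Each depth step loads one rhs vector (q3) and eight lhs lanes (q0/q1, addressed as d0[0]..d3[1]), then multiply-accumulates into q4..q11 with vmla.f32; the q2 load only advances the lhs pointer past the four unused lanes, since the input stays packed with the 12-row stride of the C12 layout. Bias add and Relu/Relu6 clamping follow. A scalar C++ sketch of the same computation (a reading aid under those assumptions, not the shipped kernel; names are illustrative):

    #include <cmath>
    // dst tile = lhs * rhs for 8 rows x 4 cols; lhs is C12-packed, i.e.
    // 12 floats per depth step, of which this kernel consumes the first 8.
    void Tile8x4Ref(const float *lhs, const float *rhs, const float *bias,
                    float *dst, int depth, int act_type, int dst_stride) {
      float acc[8][4] = {};
      for (int d = 0; d < depth; ++d) {                               // LoopDepth_R8
        for (int r = 0; r < 8; ++r) {
          for (int c = 0; c < 4; ++c) {
            acc[r][c] += lhs[d * 12 + r] * rhs[d * 4 + c];            // vmla.f32
          }
        }
      }
      for (int r = 0; r < 8; ++r) {
        for (int c = 0; c < 4; ++c) {
          float v = acc[r][c] + (bias != nullptr ? bias[c] : 0.0f);   // Bias_R8
          if (act_type == 3) v = std::fmin(v, 6.0f);                  // Relu6_R8
          if (act_type == 1 || act_type == 3) v = std::fmax(v, 0.0f); // Relu_R8
          dst[r * dst_stride + c] = v;                                // simplified Write
        }
      }
    }

LoopRow4 is the same pattern truncated to four rows (accumulators q4..q7).
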
@@ -398,7 +556,7 @@ LoopRow:
cmp r6, #12
ble LoopRowEnd
sub r6, r6, #12 // lhs row - 12
-b LoopRow
+b LoopRowStart
LoopRowEnd:
sub sp, sp, #112
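
Retargeting the tail branch from LoopRow to LoopRowStart means every 12-row strip re-dispatches, so a final strip of 8 or fewer rows now takes the narrower kernels instead of the full 12x4 path. In C++ terms, reusing the hypothetical MatmulRowDispatch sketched above:

    int rows = row_count;        // register r6
    while (true) {
      MatmulRowDispatch(rows);   // LoopRowStart: pick the 4-, 8- or 12-row kernel
      if (rows <= 12) break;     // cmp r6, #12 / ble LoopRowEnd
      rows -= 12;                // sub r6, r6, #12
    }
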

@@ -62,6 +62,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
+matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->act_type_ = conv_param_->act_type_;
return;
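
The new row_align_ rounds the row count up to the ISA-specific row tile picked in InitConv1x1BiasWeight below, replacing the fixed row_4_/row_6_/row_12_ choice at the allocation site. UP_ROUND is MindSpore Lite's round-up-to-multiple macro; an equivalent C++ helper:

    // UP_ROUND(x, y): round x up to the nearest multiple of y.
    constexpr int UpRound(int x, int y) { return (x + y - 1) / y * y; }
    // e.g. row_ = 50, row_tile_ = 12 gives row_align_ = 60.
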
@@ -73,14 +74,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
-int col_tile = C16NUM;
+row_tile_ = C6NUM;
+col_tile_ = C16NUM;
+#elif defined(ENABLE_SSE)
+row_tile_ = C4NUM;
+col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
-int col_tile = C4NUM;
+row_tile_ = C12NUM;
+col_tile_ = C4NUM;
#else
-int col_tile = C8NUM;
+row_tile_ = C12NUM;
+col_tile_ = C8NUM;
#endif
if (in_tensors_.size() == 3) {
-int size = UP_ROUND(output_channel, col_tile) * sizeof(float);
+int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
int weight_size = output_channel * sizeof(float);
bias_data_ = malloc(size);
if (bias_data_ == nullptr) {
@@ -91,8 +99,8 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}
-int size = input_channel * UP_ROUND(output_channel, col_tile) * sizeof(float);
-int down_size = input_channel * DOWN_DIV(output_channel, col_tile) * col_tile * sizeof(float);
+int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
weight_ptr_ = reinterpret_cast<float *>(malloc(size));
if (weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
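
Here size is the fully padded packed-weight size, while down_size covers only whole column tiles (DOWN_DIV is floor division), so the two likely differ by exactly the partial tail tile that has to be zero-filled before packing. A sketch of the arithmetic, reusing the UpRound helper above with hypothetical values:

    // Worked example: input_channel = 16, output_channel = 10, col_tile_ = 8
    // (the trailing 4 is sizeof(float)).
    int size = 16 * UpRound(10, 8) * 4;     // UP_ROUND path: 16 * 16 * 4 = 1024
    int down_size = 16 * (10 / 8) * 8 * 4;  // DOWN_DIV path: 16 *  8 * 4 =  512
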
@@ -113,27 +121,14 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
}
int Convolution1x1CPUKernel::InitConv1x1Param() {
-int hw_tile = C12NUM;
-#ifdef ENABLE_AVX
-hw_tile = C6NUM;
-#elif defined(ENABLE_SSE)
-hw_tile = C4NUM;
-#endif
-if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
multi_thread_by_hw_ = true;
-thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile));
-thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile;
+thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
+thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
} else {
-#ifdef ENABLE_AVX
-int col_tile = C16NUM;
-#elif defined(ENABLE_ARM32)
-int col_tile = C4NUM;
-#else
-int col_tile = C8NUM;
-#endif
multi_thread_by_hw_ = false;
-thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile));
-thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile), thread_count_) * col_tile;
+thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
+thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
}
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
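
The hw/oc work split now reads the member tiles instead of recomputing local hw_tile/col_tile constants; the arithmetic itself is unchanged: round the dimension up into tiles, cap the thread count by the tile count, then give every thread a whole number of tiles. A sketch with hypothetical helper names:

    #include <algorithm>
    int UpDiv(int x, int y) { return (x + y - 1) / y; }  // UP_DIV

    void SplitWork(int dim, int tile, int max_threads, int *count, int *stride) {
      int tiles = UpDiv(dim, tile);
      *count = std::min(max_threads, tiles);  // MSMIN
      *stride = UpDiv(tiles, *count) * tile;  // per-thread span, in elements
    }
    // e.g. dim = 100 rows, tile = 12, 4 threads:
    // tiles = 9, count = 4, stride = 3 * 12 = 36 rows per thread.
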
@@ -167,6 +162,16 @@ int Convolution1x1CPUKernel::Init() {
return ReSize();
}
+void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
+#if ENABLE_AVX
+RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
+#elif defined(ENABLE_SSE)
+RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
+#else
+RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
+#endif
+}
int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
int res_stride = matmul_param_->col_ - task_id * thread_stride_;
int cur_oc = MSMIN(thread_stride_, res_stride);
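
The new PackMatmulInput wraps the per-ISA input packing that used to be repeated at every call site. On the default ARM branch, RowMajor2Col12Major stores the input in blocks of 12 rows with the rows of each block interleaved per column, which produces the 12-float depth stride the assembly kernels above consume. A scalar sketch of that layout (the shipped routine is vectorized; the zero-padding of the tail block here is for illustration):

    // Reference C12 packing: src is a row x col row-major matrix.
    void RowMajor2Col12MajorRef(const float *src, float *dst, int row, int col) {
      int row_up = (row + 11) / 12 * 12;  // UP_ROUND(row, 12)
      for (int r = 0; r < row_up; ++r) {
        for (int c = 0; c < col; ++c) {
          // 12-row blocks, column-major within each block
          dst[(r / 12) * col * 12 + c * 12 + (r % 12)] =
              r < row ? src[r * col + c] : 0.0f;
        }
      }
    }
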
@@ -198,20 +203,20 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
}
float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
-float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
-#if ENABLE_AVX
-RowMajor2Col6Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#else
-RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#endif
+float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
-MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
-matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+float *cur_input = thread_input_ptr;
+float *cur_output = thread_output_ptr;
+for (int i = 0; i < cur_hw_; i += row_tile_) {
+int cur_rows = (cur_hw_ - i >= row_tile_) ? row_tile_ : (cur_hw_ - i);
+PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul_param_->deep_);
+MatMulOpt(thread_pack_input, weight_ptr_, cur_output, reinterpret_cast<float *>(bias_data_),
+matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_,
OutType_Nhwc);
+cur_input += row_tile_ * matmul_param_->deep_;
+cur_output += row_tile_ * matmul_param_->col_;
+}
return RET_OK;
}
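
DoConv1x1Hw used to pack its whole row strip before a single MatMulOpt call; it now packs and multiplies one row tile at a time, so the per-thread pack buffer only needs row_tile_ * deep_ floats. A runnable check of the loop bounds, with hypothetical values cur_hw_ = 30 and row_tile_ = 12:

    #include <cstdio>
    int main() {
      int cur_hw = 30, row_tile = 12;
      for (int i = 0; i < cur_hw; i += row_tile) {
        int cur_rows = (cur_hw - i >= row_tile) ? row_tile : (cur_hw - i);
        std::printf("i=%d cur_rows=%d\n", i, cur_rows);  // tiles of 12, 12, 6
      }
      return 0;
    }
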
@@ -228,17 +233,9 @@ int Convolution1x1RunHw(void *cdata, int task_id) {
int Convolution1x1CPUKernel::Run() {
auto src_in = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
auto src_out = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
-#ifdef ENABLE_AVX
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_6_ * matmul_param_->deep_ * sizeof(float)));
-#elif defined(ENABLE_SSE)
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_4_ * matmul_param_->deep_ * sizeof(float)));
-#else
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
-#endif
+int pack_input_size = multi_thread_by_hw_ ? (thread_count_ * row_tile_ * matmul_param_->deep_)
+: (matmul_param_->row_align_ * matmul_param_->deep_);
+pack_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_input_size * sizeof(float)));
if (pack_input_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
return RET_MEMORY_FAILED;
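
The allocation no longer hard-codes row_6_/row_4_/row_12_ per ISA: in hw-parallel mode each thread packs just one tile, otherwise the whole row-aligned input is packed once. The two cases, as a sketch:

    // Element count for pack_input_ (multiplied by sizeof(float) at the call).
    int PackInputFloats(bool by_hw, int thread_count, int row_tile,
                        int row_align, int deep) {
      return by_hw ? thread_count * row_tile * deep  // one tile per thread
                   : row_align * deep;               // full aligned input
    }
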
@@ -256,13 +253,7 @@ int Convolution1x1CPUKernel::Run() {
if (multi_thread_by_hw_) {
ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
} else {
-#ifdef ENABLE_AVX
-RowMajor2Col6Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#else
-RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#endif
+PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
}
}

@@ -51,6 +51,7 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
int InitConv1x1BiasWeight();
void InitConv1x1MatmulParam();
void FreeTmpBuffer();
+void PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col);
private:
MatMulParameter *matmul_param_ = nullptr;
@@ -62,6 +63,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
float *pack_input_ = nullptr;
float *input_ptr_ = nullptr;
float *output_ptr_ = nullptr;
+int row_tile_ = 0;
+int col_tile_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_
