Optimize fullconnection kernel for vector input

pull/7393/head
zhanyuan 4 years ago
parent 550c3fe0d2
commit 2635dc0f97

@ -0,0 +1,178 @@
#ifdef __aarch64__
.text
.align 5
.global MatmulFp16Neon64_1xN
#ifndef __APPLE__
.type MatmulFp16Neon64_1xN, %function
#endif
// void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col)
// x0: a
// x1: b
// x2: c
// x3: bias
// w4: act_type
// w5: depth
// w6: col
MatmulFp16Neon64_1xN:
sub sp, sp, #128
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
mov w14, #2 // sizeof(float16)
mul w8, w14, w5 // rhs depthx1 block stride
mov w14, #4
mul w13, w8, w14 // rhs depthx4 block stride
Loop:
mov x15, x0 // reload a ptr
mov x7, x1 // reload b ptr
mov w9, w5 // reload depth
cmp w6, #4
blt Loop1x1
Loop1x4:
dup v5.8h, wzr // v5-v8: 8-lane partial-sum accumulators for rhs columns 0-3
dup v6.8h, wzr
dup v7.8h, wzr
dup v8.8h, wzr
dup v9.8h, wzr // v9 lanes 0-3: accumulator for the depth tail of columns 0-3
dup v10.8h, wzr
dup v11.8h, wzr
dup v12.8h, wzr
dup v13.8h, wzr
add x10, x7, x8 // x7, x10, x11, x12: pointers to rhs columns 0-3
add x11, x10, x8
add x12, x11, x8
Depth8_1x4: // main loop: 8 depth elements per iteration
cmp w9, #8
blt Depth1_1x4
ld1 {v0.8h}, [x15], #16
ld1 {v1.8h}, [x7], #16
ld1 {v2.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v4.8h}, [x12], #16
fmla v5.8h, v1.8h, v0.8h
fmla v6.8h, v2.8h, v0.8h
fmla v7.8h, v3.8h, v0.8h
fmla v8.8h, v4.8h, v0.8h
sub w9, w9, #8
cbz w9, End1x4
b Depth8_1x4
Depth1_1x4: // depth tail: 1 element per iteration
ld1 {v0.h}[0], [x15], #2
ld1 {v1.h}[0], [x7], #2
ld1 {v1.h}[1], [x10], #2
ld1 {v1.h}[2], [x11], #2
ld1 {v1.h}[3], [x12], #2
fmla v9.8h, v1.8h, v0.h[0]
sub w9, w9, #1
cbz w9, End1x4
b Depth1_1x4
End1x4:
faddp v10.8h, v5.8h, v6.8h // pairwise-add the four 8-lane sums
faddp v11.8h, v7.8h, v8.8h // down to one value per column ...
faddp v12.8h, v10.8h, v11.8h
faddp v13.8h, v12.8h, v12.8h // ... v13.4h = sums for columns 0-3
fadd v13.8h, v13.8h, v9.8h // add the depth-tail sums
cbz x3, Act1x4
ld1 {v14.4h}, [x3], #8
fadd v13.8h, v13.8h, v14.8h
Act1x4:
cmp w4, #3
beq Relu6_1x4
cmp w4, #1
beq Relu1x4
b Write1x4
Relu6_1x4:
movi v14.8h, #0x46, lsl #8 // 0x4600 = 6.0 in fp16
fmin v13.8h, v13.8h, v14.8h // clamp to 6.0, then fall through to the zero clamp
Relu1x4:
dup v14.8h, wzr
fmax v13.8h, v13.8h, v14.8h
Write1x4:
st1 {v13.4h}, [x2], #8
sub w6, w6, #4
cbz w6, End
add x1, x1, x13
b Loop
Loop1x1:
dup v2.8h, wzr
dup v3.8h, wzr
dup v4.8h, wzr
dup v5.8h, wzr
dup v6.8h, wzr
Depth8_1x1:
cmp w9, #8
blt Depth1_1x1
ld1 {v0.8h}, [x15], #16
ld1 {v1.8h}, [x7], #16
fmla v2.8h, v1.8h, v0.8h
sub w9, w9, #8
cbz w9, End1x1
b Depth8_1x1
Depth1_1x1:
ld1 {v0.h}[0], [x15], #2
ld1 {v1.h}[0], [x7], #2
fmla v3.8h, v1.8h, v0.h[0]
sub w9, w9, #1
cbz w9, End1x1
b Depth1_1x1
End1x1:
faddp v4.8h, v2.8h, v2.8h // reduce the 8 partial sums in v2 ...
faddp v5.8h, v4.8h, v4.8h
faddp v6.8h, v5.8h, v5.8h // ... to a single sum (lane 0 of v6)
fadd v6.8h, v6.8h, v3.8h // add the depth-tail sum (lane 0 of v3)
cbz x3, Act1x1
ld1 {v7.h}[0], [x3], #2
fadd v6.8h, v6.8h, v7.8h
Act1x1:
cmp w4, #3
beq Relu6_1x1
cmp w4, #1
beq Relu1x1
b Write1x1
Relu6_1x1:
movi v7.8h, #0x46, lsl #8 // 0x4600 = 6.0 in fp16
fmin v6.8h, v6.8h, v7.8h // clamp to 6.0, then fall through to the zero clamp
Relu1x1:
dup v7.8h, wzr
fmax v6.8h, v6.8h, v7.8h
Write1x1:
st1 {v6.h}[0], [x2], #2
sub w6, w6, #1
cbz w6, End
add x1, x1, x8
b Loop
End:
sub sp, sp, #128
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
ret
#endif
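For reference, the following scalar C++ sketch spells out what the kernel above computes: a 1 x depth lhs row times a rhs stored as col contiguous depth-length blocks (matching the "rhs depthx1 block stride" above), plus an optional bias and the Relu/Relu6 clamps selected by act_type 1/3. It is only an illustration of the semantics; unlike the assembly it accumulates in float rather than fp16, and the float16_t typedef is an assumption standing in for the nnacl definition.

// Scalar reference for MatmulFp16Neon64_1xN (illustration only, not the optimized path).
typedef __fp16 float16_t;  // assumption: AArch64 half-precision scalar type

void MatVecMulFp16Ref(const float16_t *a, const float16_t *b, float16_t *c,
                      const float16_t *bias, int act_type, int depth, int col) {
  for (int j = 0; j < col; ++j) {
    float sum = (bias != nullptr) ? (float)bias[j] : 0.0f;
    const float16_t *bj = b + j * depth;  // rhs column j: depth contiguous fp16 values
    for (int k = 0; k < depth; ++k) {
      sum += (float)a[k] * (float)bj[k];
    }
    if (act_type == 3) sum = (sum > 6.0f) ? 6.0f : sum;                   // Relu6 clamps at 6 ...
    if (act_type == 1 || act_type == 3) sum = (sum < 0.0f) ? 0.0f : sum;  // ... and both clamp at 0
    c[j] = (float16_t)sum;
  }
}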

@ -289,6 +289,11 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
return;
}
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+int col) {
+MatmulFp16Neon64_1xN(a, b, c, bias, act_type, depth, col);
+}
void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
size_t row_up_16 = UP_ROUND(row, C16NUM);
size_t row16 = row / C16NUM * C16NUM;
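A minimal call sketch for the new wrapper, with hypothetical sizes and buffer names (nothing here comes from the patch except the MatVecMulFp16 signature); act_type follows the kernel's branch values, 1 for Relu, 3 for Relu6, anything else for no activation:

// Hypothetical single-row fullconnection y = Relu(W * x + bias), W stored row-major as col x depth.
constexpr int kDepth = 256;  // illustrative input size
constexpr int kCol = 1000;   // illustrative output size
float16_t x[kDepth], w[kCol * kDepth], bias[kCol], y[kCol];
// ... fill x, w, bias with fp16 data ...
MatVecMulFp16(x, w, y, bias, 1 /* Relu */, kDepth, kCol);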

@ -35,6 +35,9 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
int depth, int row, int col, int stride, int out_type);
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+int col);
void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
@ -45,6 +48,9 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
+void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
+int depth, int col);
void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
void RowMajor2Row16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);

@ -62,38 +62,59 @@ int FullconnectionFP16CPUKernel::ReSize() {
thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+if (row == 1) is_vector_input_ = true;
+int a_pack_row = 0;
+int b_pack_col = 0;
+if (is_vector_input_) {
+a_pack_row = 1;
+b_pack_col = fc_param_->col_;
+} else {
+a_pack_row = fc_param_->row_16_;
+b_pack_col = fc_param_->col_8_;
+}
a_pack_ptr_ =
-reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
+reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
if (a_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
+memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
b_pack_ptr_ =
-reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
+reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
if (b_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
+memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
if (fc_param_->b_const_) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+fc_param_->col_ * fc_param_->deep_);
+} else {
+InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
} else {
-InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()),
+fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
+} else {
+InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
}
+b_ptr_ = b_pack_ptr_;
}
if (in_tensors_.size() == 3) {
-bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
+bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
if (bias_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
-memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
+memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, fc_param_->col_);
}
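As a rough illustration of what the vector branch changes in ReSize, take an assumed shape of row = 1, deep = 256, col = 1000, and assume row_16_ / col_8_ are row and col rounded up to multiples of 16 and 8, as the names suggest:

packed path : a_pack_ptr_ holds row_16_ * deep = 16 * 256 = 4096 fp16 values (15/16 of them zero padding)
              and A is repacked Row-major to Col16-major; b_pack_ptr_ holds col_8_ * deep values in the
              8-column-tiled layout produced by InitMatrixB.
vector path : a_pack_ptr_ holds 1 * deep = 256 fp16 values (an fp32 input is only converted, an fp16 input
              is later used in place, see the Run() hunk below); b_pack_ptr_ keeps the original col x deep
              layout, so a const fp16 weight is copied verbatim instead of being repacked.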
@ -102,7 +123,7 @@ int FullconnectionFP16CPUKernel::ReSize() {
reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
}
return RET_OK;
}
} // namespace mindspore::kernel
void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
@ -133,11 +154,16 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
if (cur_oc <= 0) {
return RET_OK;
}
-auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
+auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
auto c = output_ptr_ + task_id * thread_stride_;
-MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-OutType_Nhwc);
+if (is_vector_input_) {
+MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
+} else {
+MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
+OutType_Nhwc);
+}
return RET_OK;
}
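For the threaded column split, a worked example with assumed numbers (col = 1000, thread_count_ = 4, C8NUM = 8), using the thread_stride_ formula from the ReSize hunk above; cur_oc is computed outside the shown lines and is presumably the remaining columns capped at thread_stride_:

thread_stride_ = UP_DIV(UP_DIV(1000, 8), 4) * 8 = UP_DIV(125, 4) * 8 = 32 * 8 = 256
tasks 0-2 : columns 0-255, 256-511, 512-767 (cur_oc = 256 each)
task 3    : columns 768-999 (cur_oc = 232)

Each task's rhs slice starts at b_ptr_ + task_id * thread_stride_ * deep_, which works unchanged in the vector path because B keeps its contiguous col x deep layout.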
@ -163,16 +189,39 @@ int FullconnectionFP16CPUKernel::Run() {
} else {
output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
}
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_, fc_param_->deep_);
+} else {
+InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+}
+a_ptr_ = a_pack_ptr_;
} else {
-InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+if (is_vector_input_) {
+a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
+} else {
+InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+a_ptr_ = a_pack_ptr_;
+}
}
if (!fc_param_->b_const_) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+fc_param_->col_ * fc_param_->deep_);
+} else {
+InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+}
+b_ptr_ = b_pack_ptr_;
} else {
-InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+if (is_vector_input_) {
+b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
+} else {
+InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+b_ptr_ = b_pack_ptr_;
+}
}
}
ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);

@ -51,6 +51,9 @@ class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel {
float16_t *bias_ptr_ = nullptr;
float16_t *output_fp16_ = nullptr;
float16_t *output_ptr_ = nullptr;
+float16_t *a_ptr_ = nullptr;
+float16_t *b_ptr_ = nullptr;
+bool is_vector_input_ = false;
};
} // namespace mindspore::kernel
