diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S
new file mode 100644
index 0000000000..48fc0661c0
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S
@@ -0,0 +1,178 @@
+#ifdef __aarch64__
+    .text
+    .align 5
+    .global MatmulFp16Neon64_1xN
+#ifndef __APPLE__
+    .type MatmulFp16Neon64_1xN, %function
+#endif
+
+// void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col)
+// x0: a
+// x1: b
+// x2: c
+// x3: bias
+// w4: act_type
+// w5: depth
+// w6: col
+
+MatmulFp16Neon64_1xN:
+    sub sp, sp, #128
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
+
+    mov w14, #2         // sizeof(float16)
+    mul w8, w14, w5     // rhs depthx1 block stride
+    mov w14, #4
+    mul w13, w8, w14    // rhs depthx4 block stride
+
+Loop:
+    mov x15, x0         // reload a ptr
+    mov x7, x1          // reload b ptr
+    mov w9, w5          // reload depth
+    cmp w6, #4
+    blt Loop1x1
+
+Loop1x4:
+    dup v5.8h, wzr
+    dup v6.8h, wzr
+    dup v7.8h, wzr
+    dup v8.8h, wzr
+    dup v9.8h, wzr
+    dup v10.8h, wzr
+    dup v11.8h, wzr
+    dup v12.8h, wzr
+    dup v13.8h, wzr
+
+    add x10, x7, x8
+    add x11, x10, x8
+    add x12, x11, x8
+
+Depth8_1x4:
+    cmp w9, #8
+    blt Depth1_1x4
+
+    ld1 {v0.8h}, [x15], #16
+    ld1 {v1.8h}, [x7], #16
+    ld1 {v2.8h}, [x10], #16
+    ld1 {v3.8h}, [x11], #16
+    ld1 {v4.8h}, [x12], #16
+
+    fmla v5.8h, v1.8h, v0.8h
+    fmla v6.8h, v2.8h, v0.8h
+    fmla v7.8h, v3.8h, v0.8h
+    fmla v8.8h, v4.8h, v0.8h
+    sub w9, w9, #8
+    cbz w9, End1x4
+    b Depth8_1x4
+
+Depth1_1x4:
+    ld1 {v0.h}[0], [x15], #2
+    ld1 {v1.h}[0], [x7], #2
+    ld1 {v1.h}[1], [x10], #2
+    ld1 {v1.h}[2], [x11], #2
+    ld1 {v1.h}[3], [x12], #2
+
+    fmla v9.8h, v1.8h, v0.h[0]
+    sub w9, w9, #1
+    cbz w9, End1x4
+    b Depth1_1x4
+
+End1x4:
+    faddp v10.8h, v5.8h, v6.8h
+    faddp v11.8h, v7.8h, v8.8h
+    faddp v12.8h, v10.8h, v11.8h
+    faddp v13.8h, v12.8h, v12.8h
+    fadd v13.8h, v13.8h, v9.8h
+
+    cbz x3, Act1x4
+    ld1 {v14.4h}, [x3], #8
+    fadd v13.8h, v13.8h, v14.8h
+
+Act1x4:
+    cmp w4, #3
+    beq Relu6_1x4
+    cmp w4, #1
+    beq Relu1x4
+    b Write1x4
+
+Relu6_1x4:
+    movi v14.8h, #0x46, lsl #8
+    fmin v13.8h, v13.8h, v14.8h
+
+Relu1x4:
+    dup v14.8h, wzr
+    fmax v13.8h, v13.8h, v14.8h
+
+Write1x4:
+    st1 {v13.4h}, [x2], #8
+    sub w6, w6, #4
+    cbz w6, End
+    add x1, x1, x13
+    b Loop
+
+Loop1x1:
+    dup v2.8h, wzr
+    dup v3.8h, wzr
+    dup v4.8h, wzr
+    dup v5.8h, wzr
+    dup v6.8h, wzr
+
+Depth8_1x1:
+    cmp w9, #8
+    blt Depth1_1x1
+
+    ld1 {v0.8h}, [x15], #16
+    ld1 {v1.8h}, [x7], #16
+
+    fmla v2.8h, v1.8h, v0.8h
+    sub w9, w9, #8
+    cbz w9, End1x1
+    b Depth8_1x1
+
+Depth1_1x1:
+    ld1 {v0.h}[0], [x15], #2
+    ld1 {v1.h}[0], [x7], #2
+
+    fmla v3.8h, v1.8h, v0.h[0]
+    sub w9, w9, #1
+    cbz w9, End1x1
+    b Depth1_1x1
+
+End1x1:
+    faddp v4.8h, v2.8h, v2.8h
+    faddp v5.8h, v4.8h, v4.8h
+    faddp v6.8h, v5.8h, v5.8h
+    fadd v6.8h, v6.8h, v3.8h
+
+    cbz x3, Act1x1
+    ld1 {v7.h}[0], [x3], #2
+    fadd v6.8h, v6.8h, v7.8h
+
+Act1x1:
+    cmp w4, #3
+    beq Relu6_1x1
+    cmp w4, #1
+    beq Relu1x1
+    b Write1x1
+
+Relu6_1x1:
+    movi v7.8h, #0x46, lsl #8
+    fmin v6.8h, v6.8h, v7.8h
+
+Relu1x1:
+    dup v7.8h, wzr
+    fmax v6.8h, v6.8h, v7.8h
+
+Write1x1:
+    st1 {v6.h}[0], [x2], #2
+    sub w6, w6, #1
+    cbz w6, End
+    add x1, x1, x8
+    b Loop
+
+End:
+    sub sp, sp, #128
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
+    ret
+#endif
\ No newline at end of file
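Note: the 1xN kernel above handles four output columns per pass (falling back to one at a time for the tail), accumulates fp16 products with `fmla`, and reduces the accumulators with `faddp`. As a reading aid only, not part of the patch, a scalar C sketch of the same computation follows. It assumes B is stored with each output column's `depth` values contiguous (the layout implied by the per-column stride `w8 = 2 * depth` in the assembly) and accumulates in float for clarity, whereas the NEON code accumulates in fp16. The Relu6 bound comes from `movi v.8h, #0x46, lsl #8`, i.e. 0x4600, which is 6.0 in IEEE half precision.

```c
// Scalar reference for MatmulFp16Neon64_1xN: an illustrative sketch, not the patch's code.
// a: 1 x depth, b: col x depth (each column's depth values contiguous), bias: col values or NULL.
// act_type follows the values the assembly branches on: 1 = Relu, 3 = Relu6.
#include <stddef.h>
#include <arm_neon.h>  // provides float16_t on aarch64, as used by nnacl

static void MatVecMulFp16Ref(const float16_t *a, const float16_t *b, float16_t *c,
                             const float16_t *bias, int act_type, int depth, int col) {
  for (int j = 0; j < col; ++j) {
    float acc = 0.0f;  // fp32 accumulator for clarity; the NEON kernel accumulates in fp16
    for (int d = 0; d < depth; ++d) {
      acc += (float)a[d] * (float)b[j * depth + d];
    }
    if (bias != NULL) {
      acc += (float)bias[j];
    }
    if (act_type == 3) {
      acc = acc < 6.0f ? acc : 6.0f;  // Relu6 upper bound (0x4600 == 6.0 in fp16)
    }
    if (act_type == 1 || act_type == 3) {
      acc = acc > 0.0f ? acc : 0.0f;  // Relu lower bound
    }
    c[j] = (float16_t)acc;
  }
}
```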
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index eb9bef6cbf..00909fe4d1 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -289,6 +289,11 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
   return;
 }
 
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+                   int col) {
+  MatmulFp16Neon64_1xN(a, b, c, bias, act_type, depth, col);
+}
+
 void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
   size_t row_up_16 = UP_ROUND(row, C16NUM);
   size_t row16 = row / C16NUM * C16NUM;
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index 306098096e..5bf2293ab9 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -35,6 +35,9 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                 int depth, int row, int col, int stride, int out_type);
 
+void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth,
+                   int col);
+
 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
 
 void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
@@ -45,6 +48,9 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                          size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
 
+void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
+                          int depth, int col);
+
 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
 
 void RowMajor2Row16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
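The wrapper and declarations above make the assembly routine callable from C. A hypothetical call site is sketched below to make the expected layouts explicit (the function and variable names are illustrative, not from the patch; the include path follows the tree layout shown above): `input_row` is the single unpacked row of `deep` fp16 values, `weights` holds `col * deep` fp16 values with each output column's `deep` values contiguous, which is why the fullconnection kernel below can copy or convert the weight tensor directly instead of repacking it with InitMatrixB, and `output` receives `col` values.

```c
// Hypothetical caller of MatVecMulFp16 (names are illustrative, not from the patch).
#include "nnacl/fp16/matmul_fp16.h"

void DenseRowTimesWeightsFp16(const float16_t *input_row,  // deep values
                              const float16_t *weights,    // col * deep values, column-contiguous
                              const float16_t *bias,       // col values, or NULL for no bias
                              float16_t *output,           // col values
                              int deep, int col) {
  // act_type = 1 selects Relu, matching the value the assembly compares against; 0 means no activation.
  MatVecMulFp16(input_row, weights, output, bias, 1, deep, col);
}
```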
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
index 0a53a76faf..4bd23b7b75 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -62,38 +62,59 @@ int FullconnectionFP16CPUKernel::ReSize() {
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
 
+  if (row == 1) is_vector_input_ = true;
+  int a_pack_row = 0;
+  int b_pack_col = 0;
+  if (is_vector_input_) {
+    a_pack_row = 1;
+    b_pack_col = fc_param_->col_;
+  } else {
+    a_pack_row = fc_param_->row_16_;
+    b_pack_col = fc_param_->col_8_;
+  }
   a_pack_ptr_ =
-    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
   if (a_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
+  memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
 
   b_pack_ptr_ =
-    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
   if (b_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
+  memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
 
   fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
   if (fc_param_->b_const_) {
     if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+                         fc_param_->col_ * fc_param_->deep_);
+      } else {
+        InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
     } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()),
+               fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
+      } else {
+        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
     }
+    b_ptr_ = b_pack_ptr_;
   }
 
   if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
+    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
     if (bias_ptr_ == nullptr) {
       FreeTmpBuffer();
       return RET_MEMORY_FAILED;
     }
-    memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
+    memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
     Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, fc_param_->col_);
   }
 
@@ -102,7 +123,7 @@ int FullconnectionFP16CPUKernel::ReSize() {
       reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
   }
   return RET_OK;
-}
+}  // namespace mindspore::kernel
 
 void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
   RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
@@ -133,11 +154,16 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
+  auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = output_ptr_ + task_id * thread_stride_;
-  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-             OutType_Nhwc);
+  if (is_vector_input_) {
+    MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
+  } else {
+    MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
+               OutType_Nhwc);
+  }
+
   return RET_OK;
 }
 
@@ -163,16 +189,39 @@ int FullconnectionFP16CPUKernel::Run() {
   } else {
     output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
   }
+
   if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    if (is_vector_input_) {
+      Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_, fc_param_->deep_);
+    } else {
+      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    }
+    a_ptr_ = a_pack_ptr_;
   } else {
-    InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    if (is_vector_input_) {
+      a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
+    } else {
+      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+      a_ptr_ = a_pack_ptr_;
+    }
   }
+
   if (!fc_param_->b_const_) {
     if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+                         fc_param_->col_ * fc_param_->deep_);
+      } else {
+        InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
+      b_ptr_ = b_pack_ptr_;
     } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
+      } else {
+        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+        b_ptr_ = b_pack_ptr_;
+      }
     }
   }
   ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
index ea4cdd7717..08142172c6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
@@ -51,6 +51,9 @@ class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel {
   float16_t *bias_ptr_ = nullptr;
   float16_t *output_fp16_ = nullptr;
   float16_t *output_ptr_ = nullptr;
+  float16_t *a_ptr_ = nullptr;
+  float16_t *b_ptr_ = nullptr;
+  bool is_vector_input_ = false;
 };
 }  // namespace mindspore::kernel
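Taken together, the fullconnection changes pick the new vector path whenever the effective row count is 1: packing is skipped, the fp16 input row and weights are used (or converted) in place, and MatVecMulFp16 is called instead of the packed 16x8 GEMM. A condensed sketch of the resulting per-task dispatch is given below; it is a simplified restatement of FullconnectionFP16CPUKernel::RunImpl above, with abbreviated free-function parameters rather than the kernel's members.

```c
// Condensed sketch of the fp16 fullconnection dispatch after this patch (simplified, not the real code).
#include "nnacl/fp16/matmul_fp16.h"

void FcFp16Compute(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias,
                   ActType act_type, int row, int deep, int cur_oc, int col_stride,
                   int is_vector_input) {
  if (is_vector_input) {
    // row == 1: A is the raw input row and B the raw weights, no 16x8 packing needed.
    MatVecMulFp16(a, b, c, bias, act_type, deep, cur_oc);
  } else {
    // General case: A packed to 16-row blocks, B packed to 8-column blocks, NHWC output.
    MatMulFp16(a, b, c, bias, act_type, deep, row, cur_oc, col_stride, OutType_Nhwc);
  }
}
```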