diff --git a/mindspore/lite/internal/CMakeLists.txt b/mindspore/lite/internal/CMakeLists.txt index 65fef46415..c799c6816c 100644 --- a/mindspore/lite/internal/CMakeLists.txt +++ b/mindspore/lite/internal/CMakeLists.txt @@ -39,6 +39,7 @@ if (PLATFORM_ARM64) # assembly file(GLOB ASSEMBLY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S + ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatVecMulFp32.S ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32.S) set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) set(KERNEL_SRC ${KERNEL_SRC} ${ASSEMBLY_SRC}) diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S new file mode 100644 index 0000000000..09de97596b --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S @@ -0,0 +1,203 @@ +#ifdef __aarch64__ + .text + .align 5 + .global MatVecMulFp32Neon64 +#ifndef __APPLE__ + .type MatVecMulFp32Neon64, %function +#endif + +// void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col) +// x0: a +// x1: b +// x2: c +// x3: bias +// w4: act_type +// w5: depth +// w6: col + +MatVecMulFp32Neon64: + sub sp, sp, #128 + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + + mov w14, #4 // sizeof(float) + mul w8, w14, w5 // rhs depthx1 block stride + mov w14, #4 + mul w13, w8, w14 // rhs depthx4 block stride + +Loop: + mov x15, x0 // reload a ptr + mov x7, x1 // reload b ptr + mov w9, w5 // reload depth + cmp w6, #4 + blt Loop1x1 + +Loop1x4: + dup v10.8h, wzr + dup v11.8h, wzr + dup v12.8h, wzr + dup v13.8h, wzr + dup v14.8h, wzr + + add x10, x7, x8 + add x11, x10, x8 + add x12, x11, x8 + +Depth8_1x4: + cmp w9, #8 + blt Depth4_1x4 + + ld1 {v0.4s, v1.4s}, [x15], #32 + ld1 {v2.4s, v3.4s}, [x7], #32 + ld1 {v4.4s, v5.4s}, [x10], #32 + fmla v10.4s, v0.4s, v2.4s + fmla v10.4s, v1.4s, v3.4s + fmla v11.4s, v0.4s, v4.4s + 
fmla v11.4s, v1.4s, v5.4s + ld1 {v6.4s, v7.4s}, [x11], #32 + ld1 {v8.4s, v9.4s}, [x12], #32 + fmla v12.4s, v0.4s, v6.4s + fmla v12.4s, v1.4s, v7.4s + fmla v13.4s, v0.4s, v8.4s + fmla v13.4s, v1.4s, v9.4s + sub w9, w9, #8 + cbz w9, End1x4 + b Depth8_1x4 + +Depth4_1x4: + cmp w9, #4 + blt Depth1_1x4 + + ld1 {v0.4s}, [x15], #16 + ld1 {v1.4s}, [x7], #16 + ld1 {v2.4s}, [x10], #16 + ld1 {v3.4s}, [x11], #16 + ld1 {v4.4s}, [x12], #16 + fmla v10.4s, v1.4s, v0.4s + fmla v11.4s, v2.4s, v0.4s + fmla v12.4s, v3.4s, v0.4s + fmla v13.4s, v4.4s, v0.4s + sub w9, w9, #4 + cbz w9, End1x4 + b Depth8_1x4 + +Depth1_1x4: + ld1 {v0.s}[0], [x15], #4 + ld1 {v1.s}[0], [x7], #4 + ld1 {v1.s}[1], [x10], #4 + ld1 {v1.s}[2], [x11], #4 + ld1 {v1.s}[3], [x12], #4 + + fmla v14.4s, v1.4s, v0.s[0] + sub w9, w9, #1 + cbz w9, End1x4 + b Depth1_1x4 + +End1x4: + faddp v15.4s, v10.4s, v11.4s + faddp v16.4s, v12.4s, v13.4s + faddp v17.4s, v15.4s, v16.4s + fadd v14.4s, v14.4s, v17.4s + + cbz x3, Act1x4 + ld1 {v15.4s}, [x3], #16 + fadd v14.4s, v14.4s, v15.4s // add bias + +Act1x4: + cmp w4, #3 + beq Relu6_1x4 + cmp w4, #1 + beq Relu1x4 + b Write1x4 + +Relu6_1x4: + fmov v15.4s, #6.0 + fmin v14.4s, v14.4s, v15.4s + +Relu1x4: + dup v15.4s, wzr + fmax v14.4s, v14.4s, v15.4s + +Write1x4: + st1 {v14.4s}, [x2], #16 + sub w6, w6, #4 + cbz w6, End + add x1, x1, x13 + b Loop + + +Loop1x1: + dup v4.4s, wzr + dup v5.4s, wzr + +Depth8_1x1: + cmp w9, #8 + blt Depth4_1x1 + + ld1 {v0.4s, v1.4s}, [x15], #32 + ld1 {v2.4s, v3.4s}, [x7], #32 + + fmla v4.4s, v2.4s, v0.4s + fmla v4.4s, v3.4s, v1.4s + sub w9, w9, #8 + cbz w9, End1x1 + b Depth8_1x1 + +Depth4_1x1: + cmp w9, #4 + blt Depth1_1x1 + + ld1 {v0.4s}, [x15], #16 + ld1 {v1.4s}, [x7], #16 + + fmla v4.4s, v1.4s, v0.4s + sub w9, w9, #4 + cbz w9, End1x1 + b Depth8_1x1 + +Depth1_1x1: + ld1 {v0.s}[0], [x15], #4 + ld1 {v1.s}[0], [x7], #4 + + fmla v5.4s, v1.4s, v0.s[0] + sub w9, w9, #1 + cbz w9, End1x1 + b Depth1_1x1 + +End1x1: + faddp v6.4s, v4.4s, v4.4s + faddp v7.4s, 
v6.4s, v6.4s + fadd v7.4s, v7.4s, v5.4s + + cbz x3, Act1x1 + ld1 {v8.s}[0], [x3], #4 + fadd v7.4s, v7.4s, v8.4s // add bias + +Act1x1: + cmp w4, #3 + beq Relu6_1x1 + cmp w4, #1 + beq Relu1x1 + b Write1x1 + +Relu6_1x1: + fmov v8.4s, #6.0 + fmin v7.4s, v7.4s, v8.4s + +Relu1x1: + dup v8.4s, wzr + fmax v7.4s, v7.4s, v8.4s + +Write1x1: + st1 {v7.s}[0], [x2], #4 + sub w6, w6, #1 + cbz w6, End + add x1, x1, x8 + b Loop + +End: + sub sp, sp, #128 + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + ret +#endif \ No newline at end of file diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S similarity index 92% rename from mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S rename to mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S index 48fc0661c0..9ba601a797 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16_1xN.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S @@ -1,12 +1,12 @@ #ifdef __aarch64__ .text .align 5 - .global MatmulFp16Neon64_1xN + .global MatVecMulFp16Neon64 #ifndef __APPLE__ - .type MatmulFp16Neon64_1xN, %function + .type MatVecMulFp16Neon64, %function #endif -// void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) +// void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) // x0: a // x1: b // x2: c @@ -15,7 +15,7 @@ // w5: depth // w6: col -MatmulFp16Neon64_1xN: +MatVecMulFp16Neon64: sub sp, sp, #128 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c index 00909fe4d1..39ed818c51 100644 --- a/mindspore/lite/nnacl/fp16/matmul_fp16.c +++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c @@ -289,9 +289,9 @@ void MatMulFp16(const 
float16_t *a, const float16_t *b, float16_t *c, const floa return; } -void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, - int col) { - MatmulFp16Neon64_1xN(a, b, c, bias, act_type, depth, col); +void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, + int depth, int col) { + MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col); } void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h index 5bf2293ab9..26d827c621 100644 --- a/mindspore/lite/nnacl/fp16/matmul_fp16.h +++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h @@ -17,14 +17,14 @@ #ifndef MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ #define MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ -#include #include +#include #ifdef ENABLE_NEON #include #endif #include "nnacl/errorcode.h" -#include "nnacl/op_base.h" #include "nnacl/matmul_parameter.h" +#include "nnacl/op_base.h" #ifdef __cplusplus extern "C" { @@ -35,8 +35,8 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, int depth, int row, int col, int stride, int out_type); -void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, - int col); +void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, + int depth, int col); void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16); @@ -48,8 +48,8 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, size_t depth, size_t 
row, size_t col, size_t stride, size_t write_nhwc); -void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, - int depth, int col); +void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, + int depth, int col); void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src); diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c index dd2a7a77ec..9ea98fdbf0 100644 --- a/mindspore/lite/nnacl/fp32/matmul.c +++ b/mindspore/lite/nnacl/fp32/matmul.c @@ -16,6 +16,14 @@ #include "nnacl/fp32/matmul.h" +void RowMajor2ColMajor(float *src_ptr, float *dst_ptr, int row, int col) { + for (int r = 0; r < row; ++r) { + for (int c = 0; c < col; ++c) { + dst_ptr[c * row + r] = src_ptr[r * col + c]; + } + } +} + void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col) { for (int r = 0; r < row; r++) { float *src = src_ptr + r * col; @@ -562,6 +570,12 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT #endif } +void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) { +#ifdef ENABLE_ARM64 + MatVecMulFp32Neon64(a, b, c, bias, (int)act_type, depth, col); +#endif +} + #ifdef ENABLE_NNACL_INFER_SHAPE static void SwapDims(int *dims, int index1, int index2) { int tmp = dims[index1]; diff --git a/mindspore/lite/nnacl/fp32/matmul.h b/mindspore/lite/nnacl/fp32/matmul.h index a258cb5592..5b07f514f4 100644 --- a/mindspore/lite/nnacl/fp32/matmul.h +++ b/mindspore/lite/nnacl/fp32/matmul.h @@ -17,17 +17,19 @@ #ifndef MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ #define MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ -#include #include +#include #include "nnacl/errorcode.h" -#include "nnacl/op_base.h" #include "nnacl/matmul_parameter.h" +#include "nnacl/op_base.h" #ifdef __cplusplus extern "C" { #endif void MatMulOpt(const float *a, 
const float *b, float *c, const float *bias, ActType act_type, int deep, int row, int col, size_t stride, int out_type); +void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col); +void RowMajor2ColMajor(float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Row12Major(float *src_ptr, float *dst_ptr, int row, int col); @@ -39,6 +41,7 @@ void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bi int col, size_t stride, size_t writeNhwc, size_t WriteWino); void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, size_t stride, size_t write_mode); +void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col); #elif ENABLE_ARM32 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, int stride, size_t writeNhwc, size_t WriteWino); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc index 56866cc1cd..53e76b8d15 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc @@ -27,13 +27,13 @@ FullconnectionCPUKernel::~FullconnectionCPUKernel() { } void FullconnectionCPUKernel::FreeBuf() { - if (a_c12_ptr_ != nullptr) { - free(a_c12_ptr_); - a_c12_ptr_ = nullptr; + if (a_pack_ptr_ != nullptr) { + free(a_pack_ptr_); + a_pack_ptr_ = nullptr; } - if (b_r8_ptr_ != nullptr) { - free(b_r8_ptr_); - b_r8_ptr_ = nullptr; + if (b_pack_ptr_ != nullptr) { + free(b_pack_ptr_); + b_pack_ptr_ = nullptr; } if (bias_ptr_ != nullptr) { free(bias_ptr_); @@ -56,37 +56,50 @@ int 
FullconnectionCPUKernel::ReSize() { thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8)); thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_); - bias_ptr_ = reinterpret_cast(malloc(fc_param_->col_8_ * sizeof(float))); - memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float)); +#ifdef ENABLE_ARM64 + if (fc_param_->row_ == 1) { + is_vector_input_ = true; + } +#endif if (in_tensors_.size() == 3) { + int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_; + bias_ptr_ = reinterpret_cast(malloc(col_tmp * sizeof(float))); memcpy(bias_ptr_, in_tensors_[2]->MutableData(), fc_param_->col_ * sizeof(float)); } #ifdef ENABLE_ARM32 - a_c12_ptr_ = reinterpret_cast(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float))); - if (a_c12_ptr_ == nullptr) { + a_pack_ptr_ = reinterpret_cast(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float))); + if (a_pack_ptr_ == nullptr) { return RET_MEMORY_FAILED; } - memset(a_c12_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)); + memset(a_pack_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)); #else - a_c12_ptr_ = reinterpret_cast(malloc(fc_param_->row_12_ * fc_param_->deep_ * sizeof(float))); - if (a_c12_ptr_ == nullptr) { + int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_; + a_pack_ptr_ = reinterpret_cast(malloc(row_tmp * fc_param_->deep_ * sizeof(float))); + if (a_pack_ptr_ == nullptr) { return RET_MEMORY_FAILED; } - memset(a_c12_ptr_, 0, fc_param_->row_12_ * fc_param_->deep_ * sizeof(float)); + memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float)); #endif - b_r8_ptr_ = reinterpret_cast(malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float))); - if (b_r8_ptr_ == nullptr) { + int col_tmp = is_vector_input_ ? 
fc_param_->col_ : fc_param_->col_8_; + b_pack_ptr_ = reinterpret_cast(malloc(col_tmp * fc_param_->deep_ * sizeof(float))); + if (b_pack_ptr_ == nullptr) { FreeBuf(); return RET_MEMORY_FAILED; } - memset(b_r8_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float)); + memset(b_pack_ptr_, 0, col_tmp * fc_param_->deep_ * sizeof(float)); fc_param_->a_const_ = (in_tensors_[0]->data_c() != nullptr); fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr); - if (fc_param_->a_const_) InitMatrixA(reinterpret_cast(in_tensors_[0]->MutableData()), a_c12_ptr_); - if (fc_param_->b_const_) InitMatrixB(reinterpret_cast(in_tensors_[1]->MutableData()), b_r8_ptr_); + if (fc_param_->a_const_) { + InitMatrixA(reinterpret_cast(in_tensors_[0]->MutableData()), a_pack_ptr_); + a_ptr_ = a_pack_ptr_; + } + if (fc_param_->b_const_) { + InitMatrixB(reinterpret_cast(in_tensors_[1]->MutableData()), b_pack_ptr_); + b_ptr_ = b_pack_ptr_; + } return RET_OK; } @@ -98,14 +111,24 @@ int FullconnectionCPUKernel::Init() { } void FullconnectionCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { + if (is_vector_input_) { + memcpy(dst_ptr, src_ptr, fc_param_->deep_ * sizeof(float)); + return; + } + #ifdef ENABLE_ARM32 - RowMajor2Col4Major(src_ptr, a_c12_ptr_, fc_param_->row_, fc_param_->deep_); + RowMajor2Col4Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); #else - RowMajor2Col12Major(src_ptr, a_c12_ptr_, fc_param_->row_, fc_param_->deep_); + RowMajor2Col12Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); #endif } void FullconnectionCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { + if (is_vector_input_) { + memcpy(dst_ptr, src_ptr, fc_param_->col_ * fc_param_->deep_ * sizeof(float)); + return; + } + RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); } @@ -125,9 +148,16 @@ int FullconnectionCPUKernel::DoMatmul(int task_id) { return RET_OK; } - MatMulOpt(a_c12_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_, - 
c_r_ptr + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, - fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, OutType_Nhwc); + auto b = b_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_; + auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * C8NUM; + auto c = c_ptr_ + task_id * thread_stride_ * C8NUM; + if (is_vector_input_) { + MatVecMul(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc); + } else { + MatMulOpt(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, + OutType_Nhwc); + } + return RET_OK; } @@ -139,11 +169,24 @@ int FullconnectionCPUKernel::Run() { } auto a_ptr = reinterpret_cast(in_tensors_.at(0)->data_c()); auto b_ptr = reinterpret_cast(in_tensors_.at(1)->data_c()); - c_r_ptr = reinterpret_cast(out_tensors_.at(0)->data_c()); - - if (!fc_param_->a_const_) InitMatrixA(a_ptr, a_c12_ptr_); - if (!fc_param_->b_const_) InitMatrixB(b_ptr, b_r8_ptr_); - + c_ptr_ = reinterpret_cast(out_tensors_.at(0)->data_c()); + + if (!fc_param_->a_const_) { + if (is_vector_input_) { + a_ptr_ = a_ptr; + } else { + InitMatrixA(a_ptr, a_pack_ptr_); + a_ptr_ = a_pack_ptr_; + } + } + if (!fc_param_->b_const_) { + if (is_vector_input_) { + b_ptr_ = b_ptr; + } else { + InitMatrixB(b_ptr, b_pack_ptr_); + b_ptr_ = b_pack_ptr_; + } + } ParallelLaunch(this->context_->thread_pool_, FcFp32MatmulRun, this, thread_count_); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h index 7f9d4348e7..ee159fcb27 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.h @@ -19,8 +19,8 @@ #include #include "src/runtime/kernel/arm/base/fullconnection_base.h" -#include "include/errorcode.h" #include "include/context.h" +#include "include/errorcode.h" 
#include "nnacl/fp32/matmul.h" using mindspore::lite::InnerContext; @@ -47,10 +47,13 @@ class FullconnectionCPUKernel : public FullconnectionBaseCPUKernel { void InitMatrixB(float *src_ptr, float *dst_ptr); private: - float *a_c12_ptr_ = nullptr; - float *b_r8_ptr_ = nullptr; - float *c_r_ptr = nullptr; + float *a_pack_ptr_ = nullptr; + float *b_pack_ptr_ = nullptr; + float *c_ptr_ = nullptr; float *bias_ptr_ = nullptr; + float *a_ptr_ = nullptr; + float *b_ptr_ = nullptr; + bool is_vector_input_ = false; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc index a50ed7d4f1..23add99a65 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc @@ -15,9 +15,9 @@ */ #include "src/runtime/kernel/arm/fp32/matmul.h" +#include "include/errorcode.h" #include "nnacl/fp32/matmul.h" #include "src/runtime/runtime_api.h" -#include "include/errorcode.h" using mindspore::lite::RET_ERROR; using mindspore::lite::RET_INPUT_TENSOR_ERROR; @@ -28,13 +28,13 @@ namespace mindspore::kernel { MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); } void MatmulCPUKernel::FreeTmpBuffer() { - if (a_c12_ptr_ != nullptr) { - free(a_c12_ptr_); - a_c12_ptr_ = nullptr; + if (a_pack_ptr_ != nullptr) { + free(a_pack_ptr_); + a_pack_ptr_ = nullptr; } - if (b_r8_ptr_ != nullptr) { - free(b_r8_ptr_); - b_r8_ptr_ = nullptr; + if (b_pack_ptr_ != nullptr) { + free(b_pack_ptr_); + b_pack_ptr_ = nullptr; } if (bias_ptr_ != nullptr) { free(bias_ptr_); @@ -50,24 +50,30 @@ int MatmulCPUKernel::MallocMatrixABuffer() { } params_->batch = batch; params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; +#ifdef ENABLE_ARM64 + if (params_->row_ == 1) { + is_vector_a_ = true; + } +#endif params_->deep_ = params_->a_transpose_ ? 
a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; params_->row_4_ = UP_ROUND(params_->row_, C4NUM); params_->row_12_ = UP_ROUND(params_->row_, C12NUM); #ifdef ENABLE_ARM32 - a_c12_ptr_ = reinterpret_cast(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float))); - if (a_c12_ptr_ == nullptr) { + a_pack_ptr_ = reinterpret_cast(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float))); + if (a_pack_ptr_ == nullptr) { FreeTmpBuffer(); return RET_MEMORY_FAILED; } - memset(a_c12_ptr_, 0, params_->row_4_ * params_->deep_ * sizeof(float)); + memset(a_pack_ptr_, 0, params_->row_4_ * params_->deep_ * sizeof(float)); #else - a_c12_ptr_ = reinterpret_cast(malloc(params_->batch * params_->row_12_ * params_->deep_ * sizeof(float))); - if (a_c12_ptr_ == nullptr) { + int row_tmp = is_vector_a_ ? 1 : params_->row_12_; + a_pack_ptr_ = reinterpret_cast(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); + if (a_pack_ptr_ == nullptr) { FreeTmpBuffer(); return RET_MEMORY_FAILED; } - memset(a_c12_ptr_, 0, params_->row_12_ * params_->deep_ * sizeof(float)); + memset(a_pack_ptr_, 0, params_->batch * row_tmp * params_->deep_ * sizeof(float)); #endif return RET_OK; } @@ -86,12 +92,13 @@ int MatmulCPUKernel::MallocMatrixBBuffer() { params_->col_8_ = UP_ROUND(params_->col_, 8); params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; - b_r8_ptr_ = reinterpret_cast(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float))); - if (b_r8_ptr_ == nullptr) { + int col_tmp = is_vector_a_ ? 
params_->col_ : params_->col_8_; + b_pack_ptr_ = reinterpret_cast(malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); + if (b_pack_ptr_ == nullptr) { FreeTmpBuffer(); return RET_MEMORY_FAILED; } - memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float)); + memset(b_pack_ptr_, 0, params_->batch * col_tmp * params_->deep_ * sizeof(float)); thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8)); thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_); @@ -99,24 +106,22 @@ int MatmulCPUKernel::MallocMatrixBBuffer() { } int MatmulCPUKernel::InitBias() { - auto c_shape = out_tensors_[0]->shape(); - if (c_shape.empty()) { - return RET_OK; - } - auto col_8 = UP_ROUND(c_shape[c_shape.size() - 1], 8); - bias_ptr_ = reinterpret_cast(malloc(col_8 * sizeof(float))); - if (bias_ptr_ == nullptr) { - FreeTmpBuffer(); - return RET_MEMORY_FAILED; - } - memset(bias_ptr_, 0, col_8 * sizeof(float)); if (in_tensors_.size() == 3) { + auto c_shape = out_tensors_[0]->shape(); auto bias_shape = in_tensors_[1]->shape(); if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) { MS_LOG(ERROR) << "The bias'dimension is not equal with colum"; FreeTmpBuffer(); return RET_INPUT_TENSOR_ERROR; } + auto col = c_shape[c_shape.size() - 1]; + auto col_8 = UP_ROUND(col, 8); + auto col_tmp = is_vector_a_ ? 
col : col_8; + bias_ptr_ = reinterpret_cast(malloc(col_tmp * sizeof(float))); + if (bias_ptr_ == nullptr) { + FreeTmpBuffer(); + return RET_MEMORY_FAILED; + } memcpy(bias_ptr_, in_tensors_[2]->data_c(), in_tensors_[2]->ElementsNum() * sizeof(float)); } return RET_OK; @@ -124,9 +129,9 @@ int MatmulCPUKernel::InitBias() { int MatmulCPUKernel::ReSize() { if (params_->a_const_ == false || params_->a_has_shape_ == false) { - if (a_c12_ptr_ != nullptr) { - free(a_c12_ptr_); - a_c12_ptr_ = nullptr; + if (a_pack_ptr_ != nullptr) { + free(a_pack_ptr_); + a_pack_ptr_ = nullptr; } auto ret = MallocMatrixABuffer(); if (ret != RET_OK) { @@ -135,9 +140,9 @@ int MatmulCPUKernel::ReSize() { } } if (params_->b_const_ == false || params_->b_has_shape_ == false) { - if (b_r8_ptr_ != nullptr) { - free(b_r8_ptr_); - b_r8_ptr_ = nullptr; + if (b_pack_ptr_ != nullptr) { + free(b_pack_ptr_); + b_pack_ptr_ = nullptr; } auto ret = MallocMatrixBBuffer(); if (ret != RET_OK) { @@ -158,6 +163,11 @@ int MatmulCPUKernel::ReSize() { } void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { + if (is_vector_a_) { + memcpy(dst_ptr, src_ptr, params_->batch * params_->deep_ * sizeof(float)); + return; + } + for (int i = 0; i < params_->batch; i++) { float *src = src_ptr + i * params_->deep_ * params_->row_; #ifdef ENABLE_ARM32 @@ -180,6 +190,19 @@ void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { } void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { + if (is_vector_a_) { + if (params_->b_transpose_) { + memcpy(dst_ptr, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float)); + } else { + for (int i = 0; i < params_->batch; i++) { + float *src = src_ptr + i * params_->deep_ * params_->col_; + float *dst = dst_ptr + i * params_->deep_ * params_->col_; + RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); + } + } + return; + } + for (int i = 0; i < params_->batch; i++) { float *src = src_ptr + i * params_->deep_ * params_->col_; float 
*dst = dst_ptr + i * params_->deep_ * params_->col_8_; @@ -213,10 +236,12 @@ int MatmulCPUKernel::Init() { params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); if (params_->a_const_ == true) { - InitMatrixA(reinterpret_cast(in_tensors_[0]->data_c()), a_c12_ptr_); + InitMatrixA(reinterpret_cast(in_tensors_[0]->data_c()), a_pack_ptr_); + a_ptr_ = a_pack_ptr_; } if (params_->b_const_ == true) { - InitMatrixB(reinterpret_cast(in_tensors_[1]->data_c()), b_r8_ptr_); + InitMatrixB(reinterpret_cast(in_tensors_[1]->data_c()), b_pack_ptr_); + b_ptr_ = b_pack_ptr_; } if (!InferShapeDone()) { return RET_OK; @@ -234,9 +259,14 @@ int MatmulCPUKernel::RunImpl(int task_id) { if (cur_oc <= 0) { return RET_OK; } - MatMulOpt(a_ptr_, b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_, - c_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No, - params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); + auto b = cur_b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_; + auto c = cur_c_ptr_ + task_id * thread_stride_ * C8NUM; + auto bias = bias_ptr_ ? 
bias_ptr_ + task_id * thread_stride_ * C8NUM : NULL; + if (is_vector_a_) { + MatVecMul(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, cur_oc); + } else { + MatMulOpt(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); + } return RET_OK; } @@ -261,16 +291,32 @@ int MatmulCPUKernel::Run() { auto c_src = reinterpret_cast(out_tensors_[0]->data_c()); if (params_->a_const_ == false || is_train()) { - InitMatrixA(a_src, a_c12_ptr_); + if (is_vector_a_) { + a_ptr_ = a_src; + } else { + InitMatrixA(a_src, a_pack_ptr_); + a_ptr_ = a_pack_ptr_; + } } if (params_->b_const_ == false || is_train()) { - InitMatrixB(b_src, b_r8_ptr_); + if (is_vector_a_) { + b_ptr_ = b_src; + } else { + InitMatrixB(b_src, b_pack_ptr_); + b_ptr_ = b_pack_ptr_; + } } for (int i = 0; i < params_->batch; ++i) { - a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_; - b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_; - c_ptr_ = c_src + i * params_->row_ * params_->col_; + if (is_vector_a_) { + cur_a_ptr_ = a_ptr_ + i * params_->deep_; + cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_; + cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; + } else { + cur_a_ptr_ = a_ptr_ + i * params_->row_12_ * params_->deep_; + cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_8_; + cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; + } ParallelLaunch(this->context_->thread_pool_, MatmulFloatRun, this, thread_count_); } return RET_OK; @@ -280,10 +326,10 @@ void MatmulCPUKernel::eval() { // Copy weights after training LiteKernel::eval(); if (params_->a_const_ == true) { - InitMatrixA(reinterpret_cast(in_tensors_[0]->MutableData()), a_c12_ptr_); + InitMatrixA(reinterpret_cast(in_tensors_[0]->MutableData()), a_pack_ptr_); } if (params_->b_const_ == true) { - InitMatrixB(reinterpret_cast(in_tensors_[1]->MutableData()), b_r8_ptr_); + InitMatrixB(reinterpret_cast(in_tensors_[1]->MutableData()), b_pack_ptr_); } } diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h index 1dbd8a01cd..68ca588761 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h @@ -18,8 +18,8 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ #include -#include "src/lite_kernel.h" #include "nnacl/matmul_parameter.h" +#include "src/lite_kernel.h" #include "src/runtime/kernel/arm/base/matmul_base.h" namespace mindspore::kernel { @@ -45,12 +45,15 @@ class MatmulCPUKernel : public MatmulBaseCPUKernel { void FreeTmpBuffer(); private: - float *a_c12_ptr_ = nullptr; - float *b_r8_ptr_ = nullptr; + float *a_pack_ptr_ = nullptr; + float *b_pack_ptr_ = nullptr; float *bias_ptr_ = nullptr; float *a_ptr_ = nullptr; float *b_ptr_ = nullptr; - float *c_ptr_ = nullptr; + float *cur_a_ptr_ = nullptr; + float *cur_b_ptr_ = nullptr; + float *cur_c_ptr_ = nullptr; + bool is_vector_a_ = false; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc index e27c63aa60..4d8a6b119c 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc @@ -17,11 +17,11 @@ #include #include #include -#include "src/common/log_adapter.h" #include "common/common_test.h" +#include "nnacl/fp32/matmul.h" #include "src/common/file_utils.h" +#include "src/common/log_adapter.h" #include "src/runtime/kernel/arm/fp32/fullconnection.h" -#include "nnacl/fp32/matmul.h" namespace mindspore { using mindspore::lite::Tensor; @@ -144,4 +144,59 @@ TEST_F(TestFcFp32, FcTest2) { fc->Run(); CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), correct, total_size, 0.0001); } + +int FcTestInit3(std::vector *inputs_, std::vector *outputs_, + 
MatMulParameter *matmal_param, float **correct) { + Tensor *in_t = new Tensor(kNumberTypeFloat, {1, 1, 1, 20}, schema::Format_NHWC, lite::Tensor::Category::CONST); + in_t->MallocData(); + float in[] = {1, 0, 3, 0, 4, 5, 2, 5, 2, 5, 1, 5, 0, 1, 2, 0, 2, 1, 0, 5}; + memcpy(in_t->MutableData(), in, sizeof(float) * in_t->ElementsNum()); + inputs_->push_back(in_t); + + Tensor *weight_t = new Tensor(kNumberTypeFloat, {16, 20}, schema::Format_NHWC, lite::Tensor::Category::CONST); + weight_t->MallocData(); + float weight[] = {0, 5, 5, 3, 0, 5, 3, 1, 0, 1, 3, 0, 5, 5, 2, 4, 0, 1, 1, 2, 3, 0, 5, 5, 4, 4, 1, 4, 1, 1, 5, 3, + 3, 1, 0, 3, 1, 2, 4, 5, 3, 4, 4, 0, 3, 5, 0, 3, 4, 1, 0, 1, 3, 4, 0, 5, 2, 5, 0, 4, 2, 2, 2, 2, + 4, 4, 5, 2, 1, 1, 5, 1, 4, 4, 5, 1, 2, 4, 0, 3, 1, 1, 0, 2, 1, 5, 2, 0, 1, 1, 5, 5, 4, 0, 0, 4, + 2, 3, 2, 1, 4, 0, 5, 0, 2, 3, 1, 2, 1, 2, 1, 4, 2, 3, 5, 5, 4, 5, 2, 0, 3, 0, 2, 0, 1, 3, 0, 4, + 1, 5, 2, 5, 4, 2, 5, 1, 4, 5, 3, 1, 0, 4, 4, 4, 1, 3, 4, 2, 2, 4, 1, 4, 0, 1, 0, 2, 4, 5, 2, 1, + 0, 3, 5, 2, 4, 2, 1, 4, 2, 0, 1, 0, 2, 3, 0, 3, 2, 5, 5, 4, 3, 0, 0, 2, 0, 1, 5, 2, 2, 1, 3, 0, + 3, 0, 5, 3, 3, 3, 5, 5, 3, 4, 0, 1, 2, 1, 2, 4, 3, 5, 4, 3, 0, 0, 4, 4, 2, 3, 5, 4, 3, 5, 1, 2, + 1, 5, 0, 5, 1, 1, 5, 5, 0, 0, 1, 3, 2, 2, 2, 3, 4, 2, 2, 3, 2, 4, 3, 0, 2, 0, 3, 2, 1, 5, 2, 4, + 4, 5, 2, 5, 0, 5, 3, 3, 0, 3, 2, 5, 5, 1, 1, 0, 2, 3, 0, 1, 1, 2, 4, 1, 3, 3, 5, 5, 0, 1, 0, 0, + 1, 2, 3, 3, 5, 2, 2, 5, 1, 4, 3, 3, 0, 2, 5, 4, 3, 1, 2, 4, 0, 2, 1, 3, 1, 2, 1, 0, 5, 5, 4, 5}; + memcpy(weight_t->MutableData(), weight, sizeof(float) * weight_t->ElementsNum()); + inputs_->push_back(weight_t); + + Tensor *out_t = new Tensor(kNumberTypeFloat, {1, 16}, schema::Format_NHWC, lite::Tensor::Category::CONST); + out_t->MallocData(); + outputs_->push_back(out_t); + + matmal_param->b_transpose_ = true; + matmal_param->a_transpose_ = false; + matmal_param->has_bias_ = false; + matmal_param->act_type_ = ActType_No; + return out_t->ElementsNum(); +} + +TEST_F(TestFcFp32, FcTest3) 
{ + std::vector inputs_; + std::vector outputs_; + auto matmul_param = new MatMulParameter(); + float *correct; + int total_size = FcTestInit3(&inputs_, &outputs_, matmul_param, &correct); + lite::InnerContext *ctx = new lite::InnerContext; + ctx->thread_num_ = 1; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::FullconnectionCPUKernel *fc = + new kernel::FullconnectionCPUKernel(reinterpret_cast(matmul_param), inputs_, outputs_, ctx, nullptr); + + fc->Init(); + struct timeval start, end; + gettimeofday(&start, NULL); + for (int i = 0; i < 100000; ++i) fc->Run(); + gettimeofday(&end, NULL); + // printf("## elapsed: %llu\n", 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec); +} + } // namespace mindspore