From 6cb7bcfa0d0fe971625812d2d65b1e4a4e95ef9f Mon Sep 17 00:00:00 2001 From: z00512249 Date: Tue, 6 Apr 2021 17:34:58 +0800 Subject: [PATCH] fix nnacl cmake && arm neon instructions --- .../nnacl/fp32/arithmetic_fp32_coder.cc | 3 +- .../nnacl/fp32/arithmetic_fp32_coder.h | 2 +- .../opcoders/nnacl/fp32/lstm_fp32_coder.cc | 7 +++- mindspore/lite/nnacl/CMakeLists.txt | 2 +- .../assembly/arm32/IndirectGemmInt8_2x4.S | 22 ++++++------- .../lite/nnacl/assembly/arm32/MatmulFp32.S | 32 +++++++++---------- .../lite/nnacl/assembly/arm32/MatmulFp32Opt.S | 32 +++++++++---------- .../nnacl/assembly/arm32/MatmulFp32Opt12x4.S | 24 +++++++------- .../lite/nnacl/assembly/arm32/MatmulInt8Opt.S | 6 ++-- .../nnacl/assembly/arm32/MatmulWinogradFp32.S | 2 +- .../nnacl/assembly/arm32/PreSum4x16Int8Pert.S | 2 +- 11 files changed, 70 insertions(+), 64 deletions(-) diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc index 54310afdf2..08970f7da5 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc +++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc @@ -265,7 +265,7 @@ void ArithmeticFP32Coder::ComputeInOutStrides() { } } -void ArithmeticFP32Coder::CollectFilesForFnc(CoderContext *const context) { +void ArithmeticFP32Coder::CollectFilesForFunc(CoderContext *const context) { /** * for nnacl's operator combine all arithmetic to nnalc/arithmetic.c * this solution is not suitable for micro, for the size of package. 
@@ -332,6 +332,7 @@ int ArithmeticFP32Coder::DoCode(CoderContext *const context) { int count = MSMIN(stride, element_num - stride * kDefaultTaskId); MS_CHECK_TRUE(!arithmetic_run_.empty(), "arithmetic_run function is nullptr!"); NNaclFp32Serializer code; + CollectFilesForFunc(context); if (arithmetic_parameter_->broadcasting_) { stride = UP_DIV(outside_, thread_num_); out_count_ = MSMIN(stride, outside_ - stride * kDefaultTaskId); diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h index f69dcae820..449fe6668d 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h +++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h @@ -85,7 +85,7 @@ class ArithmeticFP32Coder final : public OperatorCoder { int BroadcastRun(const std::string &input0, const std::string &input1, const std::string &output, int dim, int out_count, int out_thread_stride, NNaclFp32Serializer *const code); - void CollectFilesForFnc(CoderContext *const context); + void CollectFilesForFunc(CoderContext *const context); int break_pos_{0}; diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc index 5c75801db5..af07aed7b8 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc +++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc @@ -170,7 +170,12 @@ int LstmFP32Coder::DoCode(CoderContext *context) { "lstm_fp32.c", "mul_fp32.c", }); - + if (target_ == kARM32A || target_ == kARM64) { + Collect(context, {}, {}, + { + "MatVecMulFp32.S", + }); + } Tensor *hidden_state = input_tensors_.at(kFifthIndex); MS_CHECK_PTR(hidden_state); Tensor *cell_state = input_tensors_.at(kSixthIndex); diff --git a/mindspore/lite/nnacl/CMakeLists.txt b/mindspore/lite/nnacl/CMakeLists.txt index 3aefebb380..ce584c8588 100644 --- 
a/mindspore/lite/nnacl/CMakeLists.txt +++ b/mindspore/lite/nnacl/CMakeLists.txt @@ -1,7 +1,7 @@ project(nnacl) set(NNACL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(NNACL_DIR) +include_directories(${NNACL_DIR}/..) if(PLATFORM_ARM32 OR PLATFORM_ARM64) if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND DEFINED ARCHS) diff --git a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S index caea16f738..ec0475cfea 100644 --- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S +++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S @@ -152,20 +152,20 @@ asm_function IndirectGemmInt8_2x4 cmp lr, #0 beq SymSum ldr lr, [sp, #52] - vld1.32 q0, [r10] + vld1.32 {d0, d1}, [r10] add r10, r10, lr - vld1.32 q1, [r10] + vld1.32 {d2, d3}, [r10] b AddSum SymSum: - vld1.32 q0[], [r10]! - vld1.32 q1[], [r10]! + vld1.32 {d0[], d1[]}, [r10]! + vld1.32 {d2[], d3[]}, [r10]! AddSum: vsub.i32 q8, q8, q0 vsub.i32 q12, q12, q1 NoSum: cmp r3, #0 beq NoBias - vld1.32 q2, [r3] + vld1.32 {d4, d5}, [r3] vadd.i32 q8, q8, q2 vadd.i32 q12, q12, q2 @@ -174,19 +174,19 @@ asm_function IndirectGemmInt8_2x4 cmp lr, #0 bne PerChannel ldr lr, [sp, #36] - vld1.32 q3[], [lr] + vld1.32 {d6[], d7[]}, [lr] ldr lr, [sp, #32] - vld1.32 q4[], [lr] + vld1.32 {d8[], d9[]}, [lr] ldr lr, [sp, #40] - vld1.32 q5[], [lr] + vld1.32 {d10[], d11[]}, [lr] b QuantizeStart PerChannel: ldr lr, [sp, #36] - vld1.32 q3, [lr] + vld1.32 {d6, d7}, [lr] ldr lr, [sp, #32] - vld1.32 q4, [lr] + vld1.32 {d8, d9}, [lr] ldr lr, [sp, #40] - vld1.32 q5, [lr] + vld1.32 {d10, d11}, [lr] QuantizeStart: vshl.s32 q8, q8, q3 vshl.s32 q12, q12, q3 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S index 7ad42d5df8..6c2d551ad0 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S @@ -201,73 +201,73 @@ LoopCol: add r2, r2, r8 b WriteEnd 
Write4: - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] cmp r6, #1 beq WriteEnd add r2, r2, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] cmp r6, #2 beq WriteEnd add r2, r2, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] cmp r6, #3 beq WriteEnd add r2, r2, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] add r2, r2, r8 b WriteEnd Write5: add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18[0], [r4] cmp r6, #1 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22[0], [r4] cmp r6, #2 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26[0], [r4] cmp r6, #3 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30[0], [r4] add r2, r2, r8 b WriteEnd Write6: add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18, [r4] cmp r6, #1 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22, [r4] cmp r6, #2 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26, [r4] cmp r6, #3 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30, [r4] add r2, r2, r8 b WriteEnd Write7: add lr, r2, #24 add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18, [r4] vst1.32 d19[0], [lr] cmp r6, #1 @@ -275,7 +275,7 @@ LoopCol: add r2, r2, r8 add r4, r4, r8 add lr, lr, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22, [r4] vst1.32 d23[0], [lr] cmp r6, #2 @@ -283,7 +283,7 @@ LoopCol: add r2, r2, r8 add r4, r4, r8 add lr, lr, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26, [r4] vst1.32 d27[0], [lr] cmp r6, #3 @@ -291,7 +291,7 @@ LoopCol: add r2, r2, r8 add r4, r4, r8 add lr, lr, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30, [r4] vst1.32 d31[0], [lr] add r2, r2, r8 diff --git 
a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S index 4a13bc92aa..41e515ed4a 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S @@ -226,19 +226,19 @@ LoopRow: Write4: add lr, r2, #16 str lr, [sp, #-40] - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] cmp r6, #1 beq WriteEnd add r2, r2, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] cmp r6, #2 beq WriteEnd add r2, r2, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] cmp r6, #3 beq WriteEnd add r2, r2, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] add r2, r2, r8 add r2, r2, #16 b WriteEnd @@ -246,25 +246,25 @@ LoopRow: add lr, r2, #20 str lr, [sp, #-40] add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18[0], [r4] cmp r6, #1 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22[0], [r4] cmp r6, #2 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26[0], [r4] cmp r6, #3 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30[0], [r4] add r2, r2, r8 add r2, r2, #20 @@ -273,25 +273,25 @@ LoopRow: add lr, r2, #24 str lr, [sp, #-40] add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18, [r4] cmp r6, #1 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22, [r4] cmp r6, #2 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26, [r4] cmp r6, #3 beq WriteEnd add r2, r2, r8 add r4, r4, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30, [r4] add r2, r2, r8 add r2, r2, #24 @@ -301,7 +301,7 @@ LoopRow: str lr, [sp, #-40] add lr, r2, #24 add r4, r2, #16 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] vst1.32 d18, [r4] vst1.32 d19[0], [lr] cmp r6, #1 @@ -309,7 +309,7 @@ LoopRow: add r2, r2, r8 add r4, r4, r8 add 
lr, lr, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] vst1.32 d22, [r4] vst1.32 d23[0], [lr] cmp r6, #2 @@ -317,7 +317,7 @@ LoopRow: add r2, r2, r8 add r4, r4, r8 add lr, lr, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] vst1.32 d26, [r4] vst1.32 d27[0], [lr] cmp r6, #3 @@ -325,7 +325,7 @@ LoopRow: add r2, r2, r8 add r4, r4, r8 add lr, lr, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] vst1.32 d30, [r4] vst1.32 d31[0], [lr] add r2, r2, r8 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S index fc6a2225a1..fce88ad193 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S @@ -491,51 +491,51 @@ LoopRow4: Write4: add lr, r2, #16 str lr, [sp, #-40] - vst1.32 q4, [r2] + vst1.32 {d8, d9}, [r2] cmp r6, #1 beq WriteEnd add r2, r2, r8 - vst1.32 q5, [r2] + vst1.32 {d10, d11}, [r2] cmp r6, #2 beq WriteEnd add r2, r2, r8 - vst1.32 q6, [r2] + vst1.32 {d12, d13}, [r2] cmp r6, #3 beq WriteEnd add r2, r2, r8 - vst1.32 q7, [r2] + vst1.32 {d14, d15}, [r2] cmp r6, #4 beq WriteEnd add r2, r2, r8 - vst1.32 q8, [r2] + vst1.32 {d16, d17}, [r2] cmp r6, #5 beq WriteEnd add r2, r2, r8 - vst1.32 q9, [r2] + vst1.32 {d18, d19}, [r2] cmp r6, #6 beq WriteEnd add r2, r2, r8 - vst1.32 q10, [r2] + vst1.32 {d20, d21}, [r2] cmp r6, #7 beq WriteEnd add r2, r2, r8 - vst1.32 q11, [r2] + vst1.32 {d22, d23}, [r2] cmp r6, #8 beq WriteEnd add r2, r2, r8 - vst1.32 q12, [r2] + vst1.32 {d24, d25}, [r2] cmp r6, #9 beq WriteEnd add r2, r2, r8 - vst1.32 q13, [r2] + vst1.32 {d26, d27}, [r2] cmp r6, #10 beq WriteEnd add r2, r2, r8 - vst1.32 q14, [r2] + vst1.32 {d28, d29}, [r2] cmp r6, #11 beq WriteEnd add r2, r2, r8 - vst1.32 q15, [r2] + vst1.32 {d30, d31}, [r2] add r2, r2, r8 add r2, r2, #16 b WriteEnd diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S index 03c45a17d7..29aeebe52e 100644 
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S @@ -135,17 +135,17 @@ LoopRow: vsub.s32 d31, d31, d23 vmov.32 lr, d4[1] - vld1.32 {q9[]}, [lr] + vld1.32 {d18[], d19[]}, [lr] vshl.s32 q14, q14, q9 vshl.s32 q15, q15, q9 vmov.32 lr, d5[0] - vld1.32 {q8[]}, [lr] + vld1.32 {d16[], d17[]}, [lr] vqrdmulh.s32 q14, q14, q8 vqrdmulh.s32 q15, q15, q8 vmov.32 lr, d5[1] - vld1.32 {q7[]}, [lr] + vld1.32 {d14[], d15[]}, [lr] vand q6, q7, q14 vshr.s32 q6, q6, #31 vqadd.s32 q14, q14, q6 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S index 8bc5533b9e..9dc429975b 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S @@ -143,7 +143,7 @@ asm_function MatrixMultiplyWinograd mov r0, r8 // mat_b1 ldr r12, [sp] // k LoopK: - vld1.32 {s0}, [r9], r5 + vld1.32 d0[0], [r9], r5 vld1.32 d2[0], [r0], r4 vmla.f32 s8, s0, s4 subs r12, r12, #1 diff --git a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S index 15ebaa139d..013d7572e6 100644 --- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S +++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S @@ -73,7 +73,7 @@ CalLoop: Write: vmul.i32 q13, q13, q10 - vst1.32 q13, [r1], r7 + vst1.32 {d26, d27}, [r1], r7 beq RowLoop End: