!14675 fix nnacl cmake && arm neon instructions

From: @zoloft
Reviewed-by: @wangchengyuan,@zhang_xue_tong,@wangchengyuan
Signed-off-by: @zhang_xue_tong
pull/14675/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 4bafc6d94e

@ -265,7 +265,7 @@ void ArithmeticFP32Coder::ComputeInOutStrides() {
} }
} }
void ArithmeticFP32Coder::CollectFilesForFnc(CoderContext *const context) { void ArithmeticFP32Coder::CollectFilesForFunc(CoderContext *const context) {
/** /**
* for nnacl's operator combine all arithmetic to nnalc/arithmetic.c * for nnacl's operator combine all arithmetic to nnalc/arithmetic.c
* this solution is not suitable for micro, for the size of package. * this solution is not suitable for micro, for the size of package.
@ -332,6 +332,7 @@ int ArithmeticFP32Coder::DoCode(CoderContext *const context) {
int count = MSMIN(stride, element_num - stride * kDefaultTaskId); int count = MSMIN(stride, element_num - stride * kDefaultTaskId);
MS_CHECK_TRUE(!arithmetic_run_.empty(), "arithmetic_run function is nullptr!"); MS_CHECK_TRUE(!arithmetic_run_.empty(), "arithmetic_run function is nullptr!");
NNaclFp32Serializer code; NNaclFp32Serializer code;
CollectFilesForFunc(context);
if (arithmetic_parameter_->broadcasting_) { if (arithmetic_parameter_->broadcasting_) {
stride = UP_DIV(outside_, thread_num_); stride = UP_DIV(outside_, thread_num_);
out_count_ = MSMIN(stride, outside_ - stride * kDefaultTaskId); out_count_ = MSMIN(stride, outside_ - stride * kDefaultTaskId);

@ -85,7 +85,7 @@ class ArithmeticFP32Coder final : public OperatorCoder {
int BroadcastRun(const std::string &input0, const std::string &input1, const std::string &output, int dim, int BroadcastRun(const std::string &input0, const std::string &input1, const std::string &output, int dim,
int out_count, int out_thread_stride, NNaclFp32Serializer *const code); int out_count, int out_thread_stride, NNaclFp32Serializer *const code);
void CollectFilesForFnc(CoderContext *const context); void CollectFilesForFunc(CoderContext *const context);
int break_pos_{0}; int break_pos_{0};

@ -170,7 +170,12 @@ int LstmFP32Coder::DoCode(CoderContext *context) {
"lstm_fp32.c", "lstm_fp32.c",
"mul_fp32.c", "mul_fp32.c",
}); });
if (target_ == kARM32A || target_ == kARM64) {
Collect(context, {}, {},
{
"MatVecMulFp32.S",
});
}
Tensor *hidden_state = input_tensors_.at(kFifthIndex); Tensor *hidden_state = input_tensors_.at(kFifthIndex);
MS_CHECK_PTR(hidden_state); MS_CHECK_PTR(hidden_state);
Tensor *cell_state = input_tensors_.at(kSixthIndex); Tensor *cell_state = input_tensors_.at(kSixthIndex);

@ -1,7 +1,7 @@
project(nnacl) project(nnacl)
set(NNACL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(NNACL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
include_directories(NNACL_DIR) include_directories(${NNACL_DIR}/..)
if(PLATFORM_ARM32 OR PLATFORM_ARM64) if(PLATFORM_ARM32 OR PLATFORM_ARM64)
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND DEFINED ARCHS) if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND DEFINED ARCHS)

@ -152,20 +152,20 @@ asm_function IndirectGemmInt8_2x4
cmp lr, #0 cmp lr, #0
beq SymSum beq SymSum
ldr lr, [sp, #52] ldr lr, [sp, #52]
vld1.32 q0, [r10] vld1.32 {d0, d1}, [r10]
add r10, r10, lr add r10, r10, lr
vld1.32 q1, [r10] vld1.32 {d2, d3}, [r10]
b AddSum b AddSum
SymSum: SymSum:
vld1.32 q0[], [r10]! vld1.32 {d0[], d1[]}, [r10]!
vld1.32 q1[], [r10]! vld1.32 {d2[], d3[]}, [r10]!
AddSum: AddSum:
vsub.i32 q8, q8, q0 vsub.i32 q8, q8, q0
vsub.i32 q12, q12, q1 vsub.i32 q12, q12, q1
NoSum: NoSum:
cmp r3, #0 cmp r3, #0
beq NoBias beq NoBias
vld1.32 q2, [r3] vld1.32 {d4, d5}, [r3]
vadd.i32 q8, q8, q2 vadd.i32 q8, q8, q2
vadd.i32 q12, q12, q2 vadd.i32 q12, q12, q2
@ -174,19 +174,19 @@ asm_function IndirectGemmInt8_2x4
cmp lr, #0 cmp lr, #0
bne PerChannel bne PerChannel
ldr lr, [sp, #36] ldr lr, [sp, #36]
vld1.32 q3[], [lr] vld1.32 {d6[], d7[]}, [lr]
ldr lr, [sp, #32] ldr lr, [sp, #32]
vld1.32 q4[], [lr] vld1.32 {d8[], d9[]}, [lr]
ldr lr, [sp, #40] ldr lr, [sp, #40]
vld1.32 q5[], [lr] vld1.32 {d10[], d11[]}, [lr]
b QuantizeStart b QuantizeStart
PerChannel: PerChannel:
ldr lr, [sp, #36] ldr lr, [sp, #36]
vld1.32 q3, [lr] vld1.32 {d6, d7}, [lr]
ldr lr, [sp, #32] ldr lr, [sp, #32]
vld1.32 q4, [lr] vld1.32 {d8, d9}, [lr]
ldr lr, [sp, #40] ldr lr, [sp, #40]
vld1.32 q5, [lr] vld1.32 {d10, d11}, [lr]
QuantizeStart: QuantizeStart:
vshl.s32 q8, q8, q3 vshl.s32 q8, q8, q3
vshl.s32 q12, q12, q3 vshl.s32 q12, q12, q3

@ -201,73 +201,73 @@ LoopCol:
add r2, r2, r8 add r2, r2, r8
b WriteEnd b WriteEnd
Write4: Write4:
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
add r2, r2, r8 add r2, r2, r8
b WriteEnd b WriteEnd
Write5: Write5:
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18[0], [r4] vst1.32 d18[0], [r4]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22[0], [r4] vst1.32 d22[0], [r4]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26[0], [r4] vst1.32 d26[0], [r4]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30[0], [r4] vst1.32 d30[0], [r4]
add r2, r2, r8 add r2, r2, r8
b WriteEnd b WriteEnd
Write6: Write6:
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18, [r4] vst1.32 d18, [r4]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22, [r4] vst1.32 d22, [r4]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26, [r4] vst1.32 d26, [r4]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30, [r4] vst1.32 d30, [r4]
add r2, r2, r8 add r2, r2, r8
b WriteEnd b WriteEnd
Write7: Write7:
add lr, r2, #24 add lr, r2, #24
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18, [r4] vst1.32 d18, [r4]
vst1.32 d19[0], [lr] vst1.32 d19[0], [lr]
cmp r6, #1 cmp r6, #1
@ -275,7 +275,7 @@ LoopCol:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22, [r4] vst1.32 d22, [r4]
vst1.32 d23[0], [lr] vst1.32 d23[0], [lr]
cmp r6, #2 cmp r6, #2
@ -283,7 +283,7 @@ LoopCol:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26, [r4] vst1.32 d26, [r4]
vst1.32 d27[0], [lr] vst1.32 d27[0], [lr]
cmp r6, #3 cmp r6, #3
@ -291,7 +291,7 @@ LoopCol:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30, [r4] vst1.32 d30, [r4]
vst1.32 d31[0], [lr] vst1.32 d31[0], [lr]
add r2, r2, r8 add r2, r2, r8

@ -226,19 +226,19 @@ LoopRow:
Write4: Write4:
add lr, r2, #16 add lr, r2, #16
str lr, [sp, #-40] str lr, [sp, #-40]
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
add r2, r2, r8 add r2, r2, r8
add r2, r2, #16 add r2, r2, #16
b WriteEnd b WriteEnd
@ -246,25 +246,25 @@ LoopRow:
add lr, r2, #20 add lr, r2, #20
str lr, [sp, #-40] str lr, [sp, #-40]
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18[0], [r4] vst1.32 d18[0], [r4]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22[0], [r4] vst1.32 d22[0], [r4]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26[0], [r4] vst1.32 d26[0], [r4]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30[0], [r4] vst1.32 d30[0], [r4]
add r2, r2, r8 add r2, r2, r8
add r2, r2, #20 add r2, r2, #20
@ -273,25 +273,25 @@ LoopRow:
add lr, r2, #24 add lr, r2, #24
str lr, [sp, #-40] str lr, [sp, #-40]
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18, [r4] vst1.32 d18, [r4]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22, [r4] vst1.32 d22, [r4]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26, [r4] vst1.32 d26, [r4]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30, [r4] vst1.32 d30, [r4]
add r2, r2, r8 add r2, r2, r8
add r2, r2, #24 add r2, r2, #24
@ -301,7 +301,7 @@ LoopRow:
str lr, [sp, #-40] str lr, [sp, #-40]
add lr, r2, #24 add lr, r2, #24
add r4, r2, #16 add r4, r2, #16
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
vst1.32 d18, [r4] vst1.32 d18, [r4]
vst1.32 d19[0], [lr] vst1.32 d19[0], [lr]
cmp r6, #1 cmp r6, #1
@ -309,7 +309,7 @@ LoopRow:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
vst1.32 d22, [r4] vst1.32 d22, [r4]
vst1.32 d23[0], [lr] vst1.32 d23[0], [lr]
cmp r6, #2 cmp r6, #2
@ -317,7 +317,7 @@ LoopRow:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
vst1.32 d26, [r4] vst1.32 d26, [r4]
vst1.32 d27[0], [lr] vst1.32 d27[0], [lr]
cmp r6, #3 cmp r6, #3
@ -325,7 +325,7 @@ LoopRow:
add r2, r2, r8 add r2, r2, r8
add r4, r4, r8 add r4, r4, r8
add lr, lr, r8 add lr, lr, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
vst1.32 d30, [r4] vst1.32 d30, [r4]
vst1.32 d31[0], [lr] vst1.32 d31[0], [lr]
add r2, r2, r8 add r2, r2, r8

@ -491,51 +491,51 @@ LoopRow4:
Write4: Write4:
add lr, r2, #16 add lr, r2, #16
str lr, [sp, #-40] str lr, [sp, #-40]
vst1.32 q4, [r2] vst1.32 {d8, d9}, [r2]
cmp r6, #1 cmp r6, #1
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q5, [r2] vst1.32 {d10, d11}, [r2]
cmp r6, #2 cmp r6, #2
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q6, [r2] vst1.32 {d12, d13}, [r2]
cmp r6, #3 cmp r6, #3
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q7, [r2] vst1.32 {d14, d15}, [r2]
cmp r6, #4 cmp r6, #4
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q8, [r2] vst1.32 {d16, d17}, [r2]
cmp r6, #5 cmp r6, #5
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q9, [r2] vst1.32 {d18, d19}, [r2]
cmp r6, #6 cmp r6, #6
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q10, [r2] vst1.32 {d20, d21}, [r2]
cmp r6, #7 cmp r6, #7
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q11, [r2] vst1.32 {d22, d23}, [r2]
cmp r6, #8 cmp r6, #8
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q12, [r2] vst1.32 {d24, d25}, [r2]
cmp r6, #9 cmp r6, #9
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q13, [r2] vst1.32 {d26, d27}, [r2]
cmp r6, #10 cmp r6, #10
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q14, [r2] vst1.32 {d28, d29}, [r2]
cmp r6, #11 cmp r6, #11
beq WriteEnd beq WriteEnd
add r2, r2, r8 add r2, r2, r8
vst1.32 q15, [r2] vst1.32 {d30, d31}, [r2]
add r2, r2, r8 add r2, r2, r8
add r2, r2, #16 add r2, r2, #16
b WriteEnd b WriteEnd

@ -135,17 +135,17 @@ LoopRow:
vsub.s32 d31, d31, d23 vsub.s32 d31, d31, d23
vmov.32 lr, d4[1] vmov.32 lr, d4[1]
vld1.32 {q9[]}, [lr] vld1.32 {d18[], d19[]}, [lr]
vshl.s32 q14, q14, q9 vshl.s32 q14, q14, q9
vshl.s32 q15, q15, q9 vshl.s32 q15, q15, q9
vmov.32 lr, d5[0] vmov.32 lr, d5[0]
vld1.32 {q8[]}, [lr] vld1.32 {d16[], d17[]}, [lr]
vqrdmulh.s32 q14, q14, q8 vqrdmulh.s32 q14, q14, q8
vqrdmulh.s32 q15, q15, q8 vqrdmulh.s32 q15, q15, q8
vmov.32 lr, d5[1] vmov.32 lr, d5[1]
vld1.32 {q7[]}, [lr] vld1.32 {d14[], d15[]}, [lr]
vand q6, q7, q14 vand q6, q7, q14
vshr.s32 q6, q6, #31 vshr.s32 q6, q6, #31
vqadd.s32 q14, q14, q6 vqadd.s32 q14, q14, q6

@ -143,7 +143,7 @@ asm_function MatrixMultiplyWinograd
mov r0, r8 // mat_b1 mov r0, r8 // mat_b1
ldr r12, [sp] // k ldr r12, [sp] // k
LoopK: LoopK:
vld1.32 {s0}, [r9], r5 vld1.32 d0[0], [r9], r5
vld1.32 d2[0], [r0], r4 vld1.32 d2[0], [r0], r4
vmla.f32 s8, s0, s4 vmla.f32 s8, s0, s4
subs r12, r12, #1 subs r12, r12, #1

@ -73,7 +73,7 @@ CalLoop:
Write: Write:
vmul.i32 q13, q13, q10 vmul.i32 q13, q13, q10
vst1.32 q13, [r1], r7 vst1.32 {d26, d27}, [r1], r7
beq RowLoop beq RowLoop
End: End:

Loading…
Cancel
Save