parent
ba6023b87d
commit
aa94e5a91e
@ -0,0 +1,198 @@
|
||||
#ifdef ENABLE_ARM32
.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif

// void TiledC4MatmulFp32(float* dst, const float* src, const float* weight,
//                        size_t cal_num, size_t ic4, size_t oc4)
//
// ARM32 / NEON kernel. Argument locations:
//   r0      : dst
//   r1      : src
//   r2      : weight
//   r3      : cal_num (converted to a byte stride on entry)
//   [sp,#0] : ic4     (read into r4 after the 6-register push)
//   [sp,#4] : oc4     (read into r5 after the 6-register push)
//
// Per oc4 iteration the kernel:
//   * reads src from its start again (r1 is rewound from r6),
//   * consumes 8 q-vectors of src and 4 q-vectors of weight per ic4 step,
//   * accumulates acc[t] = sum_j weight_row[j] * src[t][j] into q8..q15
//     (t = 0..7, j = 0..3 per ic4 step),
//   * stores the 8 accumulators (32 floats) to dst and then moves dst to
//     (start of this iteration + cal_num floats).
// The weight pointer is never rewound; it keeps advancing across oc4 iterations.
//
// Scratch usage: r6 = saved src, r8 = saved dst, r7 = remaining ic4 steps.
// q4-q7 are callee-saved under AAPCS and are preserved via vpush/vpop.
// The loops are do-while shaped, so ic4 and oc4 are expected to be >= 1.
TiledC4MatmulFp32:
    push {r4-r8, lr}
    ldr r4, [sp, #24]                   // ic4
    ldr r5, [sp, #28]                   // oc4
    lsl r3, r3, #2                      // cal_num * sizeof(float)

    vpush {q4-q7}

TiledOcLoop:
    mov r8, r0                          // remember dst of this oc4 iteration
    mov r6, r1                          // remember src start
    subs r7, r4, #1                     // flags stay live until "beq TiledStore"

    // ---- first ic4 step: accumulators are initialised with vmul ----
    vld1.32 {q0, q1}, [r1]!
    vld1.32 {q2, q3}, [r1]!
    vld1.32 {q4, q5}, [r2]!
    vld1.32 {q6, q7}, [r2]!

    vmul.f32 q8, q4, d0[0]
    vmul.f32 q9, q4, d2[0]
    vmul.f32 q10, q4, d4[0]
    vmul.f32 q11, q4, d6[0]

    vmla.f32 q8, q5, d0[1]
    vmla.f32 q9, q5, d2[1]
    vmla.f32 q10, q5, d4[1]
    vmla.f32 q11, q5, d6[1]

    vmla.f32 q8, q6, d1[0]
    vmla.f32 q9, q6, d3[0]
    vmla.f32 q10, q6, d5[0]
    vmla.f32 q11, q6, d7[0]

    vmla.f32 q8, q7, d1[1]
    vmla.f32 q9, q7, d3[1]
    vmla.f32 q10, q7, d5[1]
    vmla.f32 q11, q7, d7[1]

    // second group of four src vectors, same weights
    vld1.32 {q0, q1}, [r1]!
    vld1.32 {q2, q3}, [r1]!

    vmul.f32 q12, q4, d0[0]
    vmul.f32 q13, q4, d2[0]
    vmul.f32 q14, q4, d4[0]
    vmul.f32 q15, q4, d6[0]

    vmla.f32 q12, q5, d0[1]
    vmla.f32 q13, q5, d2[1]
    vmla.f32 q14, q5, d4[1]
    vmla.f32 q15, q5, d6[1]

    vmla.f32 q12, q6, d1[0]
    vmla.f32 q13, q6, d3[0]
    vmla.f32 q14, q6, d5[0]
    vmla.f32 q15, q6, d7[0]

    vmla.f32 q12, q7, d1[1]
    vmla.f32 q13, q7, d3[1]
    vmla.f32 q14, q7, d5[1]
    vmla.f32 q15, q7, d7[1]
    beq TiledStore                      // ic4 == 1: nothing more to accumulate

    subs r7, r7, #1

    // prologue of the pipelined loop: start the next ic4 step
    vld1.32 {q4, q5}, [r2]!
    vld1.32 {q0, q1}, [r1]!
    vld1.32 {q2, q3}, [r1]!

    vmla.f32 q8, q4, d0[0]
    vmla.f32 q9, q4, d2[0]
    beq TiledIcTail                     // exactly one step left: finish it in the tail

TiledIcLoop:
    // finish the step whose first two vmla were already issued
    vmla.f32 q10, q4, d4[0]
    vmla.f32 q11, q4, d6[0]

    vmla.f32 q8, q5, d0[1]
    vmla.f32 q9, q5, d2[1]
    vld1.32 {q6, q7}, [r2]!
    vmla.f32 q10, q5, d4[1]
    vmla.f32 q11, q5, d6[1]

    vmla.f32 q8, q6, d1[0]
    vmla.f32 q9, q6, d3[0]
    vmla.f32 q10, q6, d5[0]
    vmla.f32 q11, q6, d7[0]

    vmla.f32 q8, q7, d1[1]
    vmla.f32 q9, q7, d3[1]
    vmla.f32 q10, q7, d5[1]
    vld1.32 {q0, q1}, [r1]!
    vmla.f32 q11, q7, d7[1]

    vld1.32 {q2, q3}, [r1]!

    vmla.f32 q12, q4, d0[0]
    vmla.f32 q13, q4, d2[0]
    vmla.f32 q14, q4, d4[0]
    vmla.f32 q15, q4, d6[0]

    vmla.f32 q12, q5, d0[1]
    vmla.f32 q13, q5, d2[1]
    vmla.f32 q14, q5, d4[1]
    vmla.f32 q15, q5, d6[1]

    vmla.f32 q12, q6, d1[0]
    vmla.f32 q13, q6, d3[0]
    vmla.f32 q14, q6, d5[0]
    vld1.32 {q4, q5}, [r2]!             // weights of the following step
    vmla.f32 q15, q6, d7[0]

    vmla.f32 q12, q7, d1[1]
    vmla.f32 q13, q7, d3[1]
    vmla.f32 q14, q7, d5[1]
    vld1.32 {q0, q1}, [r1]!
    vmla.f32 q15, q7, d7[1]

    vld1.32 {q2, q3}, [r1]!

    // first two vmla of the following step
    vmla.f32 q8, q4, d0[0]
    vmla.f32 q9, q4, d2[0]

    subs r7, r7, #1
    bne TiledIcLoop

TiledIcTail:
    // last ic4 step: same arithmetic as the loop body, no look-ahead loads
    vmla.f32 q10, q4, d4[0]
    vmla.f32 q11, q4, d6[0]

    vmla.f32 q8, q5, d0[1]
    vmla.f32 q9, q5, d2[1]
    vld1.32 {q6, q7}, [r2]!
    vmla.f32 q10, q5, d4[1]
    vmla.f32 q11, q5, d6[1]

    vmla.f32 q8, q6, d1[0]
    vmla.f32 q9, q6, d3[0]
    vmla.f32 q10, q6, d5[0]
    vmla.f32 q11, q6, d7[0]

    vmla.f32 q8, q7, d1[1]
    vmla.f32 q9, q7, d3[1]
    vmla.f32 q10, q7, d5[1]
    vld1.32 {q0, q1}, [r1]!
    vmla.f32 q11, q7, d7[1]

    vld1.32 {q2, q3}, [r1]!

    vmla.f32 q12, q4, d0[0]
    vmla.f32 q13, q4, d2[0]
    vmla.f32 q14, q4, d4[0]
    vmla.f32 q15, q4, d6[0]

    vmla.f32 q12, q5, d0[1]
    vmla.f32 q13, q5, d2[1]
    vmla.f32 q14, q5, d4[1]
    vmla.f32 q15, q5, d6[1]

    vmla.f32 q12, q6, d1[0]
    vmla.f32 q13, q6, d3[0]
    vmla.f32 q14, q6, d5[0]
    vmla.f32 q15, q6, d7[0]

    vmla.f32 q12, q7, d1[1]
    vmla.f32 q13, q7, d3[1]
    vmla.f32 q14, q7, d5[1]
    vmla.f32 q15, q7, d7[1]

TiledStore:
    vst1.32 {q8, q9}, [r0]!
    vst1.32 {q10, q11}, [r0]!
    vst1.32 {q12, q13}, [r0]!
    vst1.32 {q14, q15}, [r0]!
    mov r1, r6                          // rewind src for the next oc4 iteration

    subs r5, r5, #1
    add r0, r8, r3                      // dst = iteration start + cal_num floats
    bne TiledOcLoop

    vpop {q4-q7}
    pop {r4-r8, pc}

#endif
|
@ -0,0 +1,218 @@
|
||||
#ifdef ENABLE_ARM32

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif

// void WinogradTransLeft(const float* S, const float* B, float* M,
//                        size_t w, size_t h, size_t k, size_t length);
//
// ARM32 / NEON. Argument locations:
//   r0       : S
//   r1       : B
//   r2       : M
//   r3       : w
//   [sp,#0]  : h       (loaded into r4 after the 9-register push)
//   [sp,#4]  : k       (loaded into r5)
//   [sp,#8]  : length  (loaded into r6)
//
// For every (h index, w index) pair one output block of `length` q-vectors
// (4 floats each) is produced, written consecutively to M:
//   M_block = sum over kk in [0, k) of  B[kk * h + hIdx] * S_block(kk, wIdx)
// where consecutive kk rows of S are w * length * 4 floats apart and
// consecutive w blocks are length * 4 floats apart.
//
// Derived strides (bytes):
//   r8  = length * 16          one block
//   r7  = w * r8               S step between consecutive kk rows
//   r9  = r7 - r8              S row step minus the block already walked
//   r10 = h * 4                B step between consecutive kk
// Counters: r11 = remaining q-vectors in a block, r12 = remaining kk.
// d30/d31 are used as scratch to stash core registers (q8-q15 are caller-saved).
// The h, w and length loops are do-while shaped and expect values >= 1; k == 0
// leaves the zero-initialised block in M.
WinogradTransLeft:
    push {r4-r11, lr}
    ldr r4, [sp, #36]
    ldr r5, [sp, #40]
    ldr r6, [sp, #44]

    mov r8, #16 // 4 * sizeof(float)
    mul r8, r6, r8
    mul r9, r3, r8
    sub r9, r9, r8
    add r7, r9, r8 // step for S
    mov r10, #4
    mul r10, r4, r10 // step for B

LoopH:
    push {r0, r3}                       // S base and w are restored per h iteration
LoopW:
    push {r0, r1}                       // S block base and B column base
    vmov.i32 q14, #0
    mov r11, r6
InitZero:
    vst1.32 {q14}, [r2]!
    subs r11, r11, #1
    bne InitZero

    sub r2, r2, r8                      // back to the start of the output block
    mov r12, r5

LoopKStart7:
    cmp r12, #7
    blt LoopKStart4
LoopK7:
    // r3-r7 are reused as row pointers below; r6 (length) and r7 (S row step)
    // must hold their real values again at the top of every pass, so the
    // save/restore is done per pass rather than once around the whole loop.
    push {r3-r7}
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    vld1.32 {d1[1]}, [r1], r10
    vld1.32 {d2[0]}, [r1], r10
    vld1.32 {d2[1]}, [r1], r10
    vld1.32 {d3[0]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1                  // stash the B pointer, r1 becomes a row pointer

    add r1, r0, r7
    add r3, r1, r7
    add r4, r3, r7
    add r5, r4, r7
    add r6, r5, r7
    add r7, r6, r7

LoopLength7:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]
    vld1.32 {q13}, [r4]!
    vmla.f32 q9, q13, d1[1]
    vld1.32 {q12}, [r5]!
    vmla.f32 q8, q12, d2[0]
    vld1.32 {q13}, [r6]!
    vmla.f32 q9, q13, d2[1]
    vld1.32 {q12}, [r7]!
    vmla.f32 q8, q12, d3[0]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength7

    sub r2, r2, r8
    sub r12, r12, #7
    add r0, r7, r9                      // row after the 7th one just consumed
    vmov.32 r1, d30[0]
    pop {r3-r7}                         // restore w, h, k, length, S row step
    cmp r12, #7
    bge LoopK7

LoopKStart4:
    cmp r12, #4
    blt LoopKStart3
    vmov.32 d30[1], r3                  // r3/r4 become row pointers below
    vmov.32 d31[0], r4
LoopK4:
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    vld1.32 {d1[1]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1

    add r1, r0, r7
    add r3, r1, r7
    add r4, r3, r7

LoopLength4:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]
    vld1.32 {q13}, [r4]!
    vmla.f32 q9, q13, d1[1]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength4

    sub r2, r2, r8
    sub r12, r12, #4
    add r0, r4, r9
    vmov.32 r1, d30[0]
    cmp r12, #4
    bge LoopK4

    vmov.32 r3, d30[1]
    vmov.32 r4, d31[0]

LoopKStart3:
    cmp r12, #3
    blt LoopKStart
    vmov.32 d30[1], r3
    vmov.32 d31[0], r4
LoopK3:
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1

    add r1, r0, r7
    add r3, r1, r7

LoopLength3:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength3

    sub r2, r2, r8
    sub r12, r12, #3
    add r0, r3, r9
    vmov.32 r1, d30[0]
    cmp r12, #3
    bge LoopK3

    vmov.32 r3, d30[1]
    vmov.32 r4, d31[0]

LoopKStart:
    cmp r12, #0
    beq LoopKEnd

LoopK:
    vld1.32 {d30[0]}, [r1], r10

    vdup.32 q15, d30[0]                 // broadcast the single B coefficient
    mov r11, r6
LoopLength:
    vld1.32 {q0}, [r2]
    vld1.32 {q1}, [r0]!
    vmla.f32 q0, q1, q15

    vst1.32 {q0}, [r2]!
    subs r11, r11, #1
    bne LoopLength
    subs r12, r12, #1

    sub r2, r2, r8
    add r0, r0, r9
    bne LoopK                           // flags from "subs r12" (sub/add keep them)

LoopKEnd:
    pop {r0, r1}
    subs r3, r3, #1
    add r0, r0, r8                      // next w block of S
    add r2, r2, r8                      // next output block
    bne LoopW

    pop {r0, r3}
    add r1, r1, #4 //sizeof(float)
    subs r4, r4, #1
    bne LoopH

    pop {r4-r11, pc}

#endif
|
@ -0,0 +1,208 @@
|
||||
#ifdef ENABLE_ARM32

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif

// void WinogradTransRight(const float* S, const float* B, float* M,
//                         size_t w, size_t h, size_t k, size_t length);
//
// ARM32 / NEON. Argument locations:
//   r0       : S
//   r1       : B
//   r2       : M
//   r3       : w
//   [sp,#0]  : h       (loaded into r4 after the 9-register push)
//   [sp,#4]  : k       (loaded into r5)
//   [sp,#8]  : length  (loaded into r6)
//
// For every (h index, w index) pair one output block of `length` q-vectors
// (4 floats each) is produced, written consecutively to M:
//   M_block = sum over kk in [0, k) of  B[kk * h + wIdx] * S_block(hIdx, kk)
// where consecutive kk blocks of S are length * 4 floats apart and S advances
// by k such blocks per h iteration.
//
// Derived strides (bytes):
//   r8  = length * 16          one block (also the S step between kk)
//   r9  = k * r8               S step between h iterations
//   r10 = h * 4                B step between consecutive kk
// Counters: r11 = remaining q-vectors in a block, r12 = remaining kk.
// d30 is used as scratch to stash core registers (q8-q15 are caller-saved).
// The h, w and length loops are do-while shaped and expect values >= 1; k == 0
// leaves the zero-initialised block in M.
WinogradTransRight:
    push {r4-r11, lr}
    ldr r4, [sp, #36]
    ldr r5, [sp, #40]
    ldr r6, [sp, #44]

    mov r8, #16 // 4 * sizeof(float)
    mul r8, r6, r8
    mul r9, r5, r8 // step for S
    mov r10, #4
    mul r10, r4, r10 // step for B

LoopH:
    push {r1, r3}                       // B base and w are restored per h iteration
LoopW:
    push {r0, r1}                       // S row base and B column base
    vmov.i32 q14, #0
    mov r11, r6
InitZero:
    vst1.32 {q14}, [r2]!
    subs r11, r11, #1
    bne InitZero

    sub r2, r2, r8                      // back to the start of the output block
    mov r12, r5
LoopKStart7:
    cmp r12, #7
    blt LoopKStart4
LoopK7:
    // r3-r7 are reused as block pointers below; r6 (length) must hold its real
    // value again at the top of every pass, so the save/restore is done per
    // pass rather than once around the whole loop.
    push {r3-r7}
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    vld1.32 {d1[1]}, [r1], r10
    vld1.32 {d2[0]}, [r1], r10
    vld1.32 {d2[1]}, [r1], r10
    vld1.32 {d3[0]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1                  // stash the B pointer, r1 becomes a block pointer

    add r1, r0, r8
    add r3, r1, r8
    add r4, r3, r8
    add r5, r4, r8
    add r6, r5, r8
    add r7, r6, r8
LoopLength7:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]
    vld1.32 {q13}, [r4]!
    vmla.f32 q9, q13, d1[1]
    vld1.32 {q12}, [r5]!
    vmla.f32 q8, q12, d2[0]
    vld1.32 {q13}, [r6]!
    vmla.f32 q9, q13, d2[1]
    vld1.32 {q12}, [r7]!
    vmla.f32 q8, q12, d3[0]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength7

    sub r2, r2, r8
    sub r12, r12, #7
    mov r0, r7                          // block after the 7th one just consumed
    vmov.32 r1, d30[0]
    pop {r3-r7}                         // restore w, h, k, length (and r7)
    cmp r12, #7
    bge LoopK7

LoopKStart4:
    cmp r12, #4
    blt LoopKStart3
    vmov.32 d30[1], r3                  // r3/r4 become block pointers below
    vmov.32 d31[0], r4
LoopK4:
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    vld1.32 {d1[1]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1

    add r1, r0, r8
    add r3, r1, r8
    add r4, r3, r8

LoopLength4:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]
    vld1.32 {q13}, [r4]!
    vmla.f32 q9, q13, d1[1]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength4

    sub r2, r2, r8
    sub r12, r12, #4
    mov r0, r4
    vmov.32 r1, d30[0]
    cmp r12, #4
    bge LoopK4

    vmov.32 r3, d30[1]
    vmov.32 r4, d31[0]

LoopKStart3:
    cmp r12, #3
    blt LoopKStart
    vmov.32 d30[1], r3
LoopK3:
    vld1.32 {d0[0]}, [r1], r10
    vld1.32 {d0[1]}, [r1], r10
    vld1.32 {d1[0]}, [r1], r10
    mov r11, r6
    vmov.32 d30[0], r1

    add r1, r0, r8
    add r3, r1, r8

LoopLength3:
    vld1.32 {q8}, [r2]
    vld1.32 {q12}, [r0]!
    vmla.f32 q8, q12, d0[0]
    vld1.32 {q13}, [r1]!
    vmul.f32 q9, q13, d0[1]
    vld1.32 {q12}, [r3]!
    vmla.f32 q8, q12, d1[0]

    vadd.f32 q9, q8, q9
    vst1.32 {q9}, [r2]!
    subs r11, r11, #1
    bne LoopLength3

    sub r2, r2, r8
    sub r12, r12, #3
    mov r0, r3
    vmov.32 r1, d30[0]
    cmp r12, #3
    bge LoopK3

    vmov.32 r3, d30[1]

LoopKStart:
    cmp r12, #0
    beq LoopKEnd
LoopK:
    vld1.32 {d30[0]}, [r1], r10
    vdup.32 q15, d30[0]                 // broadcast the single B coefficient
    mov r11, r6
LoopLength:
    vld1.32 {q0}, [r2]
    vld1.32 {q1}, [r0]!
    vmla.f32 q0, q1, q15

    vst1.32 {q0}, [r2]!
    subs r11, r11, #1
    bne LoopLength

    subs r12, r12, #1
    sub r2, r2, r8
    bne LoopK                           // flags from "subs r12" (sub keeps them)
LoopKEnd:
    pop {r0, r1}
    subs r3, r3, #1
    add r2, r2, r8                      // next output block
    add r1, r1, #4 //sizeof(float)
    bne LoopW

    pop {r1, r3}
    add r0, r0, r9                      // next h row group of S
    subs r4, r4, #1
    bne LoopH

    pop {r4-r11, pc}

#endif
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,147 @@
|
||||
#ifdef __aarch64__

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif

// void WinogradTransLeft(const float* S, const float* B, float* M,
//                        size_t w, size_t h, size_t k, size_t length);
//
// AArch64 / NEON. Arguments:
//   x0: S   x1: B   x2: M   x3: w   x4: h   x5: k   x6: length
//
// For every (h index, w index) pair one output block of `length` 4-float
// vectors is produced, written consecutively to M:
//   M_block = sum over kk in [0, k) of  B[kk * h + hIdx] * S_block(kk, wIdx)
// where consecutive kk rows of S are w * length * 4 floats apart and
// consecutive w blocks are length * 4 floats apart.
//
// Derived strides (bytes):
//   x8  = length * 16          one block
//   x7  = w * x8               S step between consecutive kk rows
//   x9  = x7 - x8              S row step minus the block already walked
//   x10 = h * 4                B step between consecutive kk
// Working registers: x13 = S block base for the current w, x14/x20/x16/x19 =
// row pointers, x17 = B cursor, x15 = remaining w, x12 = remaining kk,
// x11 = remaining vectors in a block.
// x19/x20 are callee-saved and are kept in a real 16-byte stack frame; x18 is
// deliberately not used because it is reserved on some platforms.
// The h, w and length loops are do-while shaped and expect values >= 1; k == 0
// leaves the zero-initialised block in M.
WinogradTransLeft:
    stp x19, x20, [sp, #-16]!           // sp stays below the saved pair until the epilogue

    mov x8, #16 // 4 * sizeof(float)
    mul x8, x6, x8
    mul x9, x3, x8
    sub x9, x9, x8
    add x7, x9, x8 // step for S
    mov x10, #4
    mul x10, x4, x10 // step for B

LoopH:
    mov x13, x0
    mov x15, x3
LoopW:
    mov x14, x13
    mov x17, x1
    dup v30.4s, wzr
    mov x11, x6
InitZero:
    st1 {v30.4s}, [x2], #16
    subs x11, x11, #1
    bne InitZero

    sub x2, x2, x8                      // back to the start of the output block
    mov x12, x5
LoopKStart4:
    cmp x12, #4
    blt LoopKStart3
LoopK4:
    ld1 {v0.s}[0], [x17], x10
    ld1 {v0.s}[1], [x17], x10
    ld1 {v0.s}[2], [x17], x10
    ld1 {v0.s}[3], [x17], x10
    mov x11, x6
    add x20, x14, x7                    // rows kk+1, kk+2, kk+3
    add x16, x20, x7
    add x19, x16, x7

LoopLength4:
    ld1 {v16.4s}, [x2]
    ld1 {v20.4s}, [x14], #16
    fmla v16.4s, v20.4s, v0.s[0]
    ld1 {v21.4s}, [x20], #16
    fmul v17.4s, v21.4s, v0.s[1]
    ld1 {v20.4s}, [x16], #16
    fmla v16.4s, v20.4s, v0.s[2]
    ld1 {v21.4s}, [x19], #16
    fmla v17.4s, v21.4s, v0.s[3]
    fadd v17.4s, v16.4s, v17.4s
    st1 {v17.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength4

    sub x2, x2, x8
    sub x12, x12, #4
    add x14, x19, x9                    // row after the 4th one just consumed
    cmp x12, #4
    bge LoopK4

LoopKStart3:
    cmp x12, #3
    blt LoopKStart
LoopK3:
    ld1 {v0.s}[0], [x17], x10
    ld1 {v0.s}[1], [x17], x10
    ld1 {v0.s}[2], [x17], x10
    mov x11, x6
    add x20, x14, x7                    // rows kk+1, kk+2
    add x16, x20, x7
LoopLength3:
    ld1 {v16.4s}, [x2]
    ld1 {v20.4s}, [x14], #16
    fmla v16.4s, v20.4s, v0.s[0]
    ld1 {v21.4s}, [x20], #16
    fmul v17.4s, v21.4s, v0.s[1]
    ld1 {v20.4s}, [x16], #16
    fmla v16.4s, v20.4s, v0.s[2]
    fadd v17.4s, v16.4s, v17.4s
    st1 {v17.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength3

    sub x2, x2, x8
    sub x12, x12, #3
    add x14, x16, x9
    cmp x12, #3
    bge LoopK3

LoopKStart:
    cmp x12, #0
    beq LKEnd
LoopK:
    ld1r {v31.4s}, [x17], x10           // broadcast the single B coefficient
    mov x11, x6
LoopLength:
    ld1 {v0.4s}, [x2]
    ld1 {v1.4s}, [x14], #16
    fmla v0.4s, v1.4s, v31.4s
    st1 {v0.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength

    subs x12, x12, #1
    sub x2, x2, x8
    add x14, x14, x9
    bne LoopK                           // flags from "subs x12" (sub/add keep them)

LKEnd:
    subs x15, x15, #1
    add x13, x13, x8                    // next w block of S
    add x2, x2, x8                      // next output block
    bne LoopW

    add x1, x1, #4 //sizeof(float)
    subs x4, x4, #1
    bne LoopH

    ldp x19, x20, [sp], #16
    ret

#endif
|
@ -0,0 +1,144 @@
|
||||
#ifdef __aarch64__

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif

// void WinogradTransRight(const float* S, const float* B, float* M,
//                         size_t w, size_t h, size_t k, size_t length);
//
// AArch64 / NEON. Arguments:
//   x0: S   x1: B   x2: M   x3: w   x4: h   x5: k   x6: length
//
// For every (h index, w index) pair one output block of `length` 4-float
// vectors is produced, written consecutively to M:
//   M_block = sum over kk in [0, k) of  B[kk * h + wIdx] * S_block(hIdx, kk)
// where consecutive kk blocks of S are length * 4 floats apart and S advances
// by k such blocks per h iteration.
//
// Derived strides (bytes):
//   x8  = length * 16          one block (also the S step between kk)
//   x9  = k * x8               S step between h iterations
//   x10 = h * 4                B step between consecutive kk
// Working registers: x7 = B column base for the current w, x13 = B cursor,
// x17/x14/x16/x19 = block pointers, x15 = remaining w, x12 = remaining kk,
// x11 = remaining vectors in a block.
// x19 is callee-saved and is preserved on the stack; x18 is deliberately not
// used because it is reserved on some platforms.
// The h, w and length loops are do-while shaped and expect values >= 1; k == 0
// leaves the zero-initialised block in M.
WinogradTransRight:
    str x19, [sp, #-16]!                // 16-byte slot keeps sp aligned

    mov x8, #16 // 4 * sizeof(float)
    mul x8, x6, x8
    mul x9, x5, x8 // step for S
    mov x10, #4
    mul x10, x4, x10 // step for B

LoopH:
    mov x7, x1
    mov x15, x3
LoopW:
    mov x17, x0
    mov x13, x7
    dup v30.4s, wzr
    mov x11, x6
InitZero:
    st1 {v30.4s}, [x2], #16
    subs x11, x11, #1
    bne InitZero
    sub x2, x2, x8                      // back to the start of the output block
    mov x12, x5

LoopKStart4:
    cmp x12, #4
    blt LoopKStart3
LoopK4:
    ld1 {v0.s}[0], [x13], x10
    ld1 {v0.s}[1], [x13], x10
    ld1 {v0.s}[2], [x13], x10
    ld1 {v0.s}[3], [x13], x10
    mov x11, x6

    add x14, x17, x8                    // blocks kk+1, kk+2, kk+3
    add x16, x14, x8
    add x19, x16, x8

LoopLength4:
    ld1 {v16.4s}, [x2]
    ld1 {v20.4s}, [x17], #16
    fmla v16.4s, v20.4s, v0.s[0]
    ld1 {v21.4s}, [x14], #16
    fmul v17.4s, v21.4s, v0.s[1]
    ld1 {v20.4s}, [x16], #16
    fmla v16.4s, v20.4s, v0.s[2]
    ld1 {v21.4s}, [x19], #16
    fmla v17.4s, v21.4s, v0.s[3]

    fadd v17.4s, v16.4s, v17.4s
    st1 {v17.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength4
    sub x2, x2, x8
    sub x12, x12, #4
    mov x17, x19                        // block after the 4th one just consumed

    cmp x12, #4
    bge LoopK4

LoopKStart3:
    cmp x12, #3
    blt LoopKStart
LoopK3:
    ld1 {v0.s}[0], [x13], x10
    ld1 {v0.s}[1], [x13], x10
    ld1 {v0.s}[2], [x13], x10
    mov x11, x6

    add x14, x17, x8                    // blocks kk+1, kk+2
    add x16, x14, x8

LoopLength3:
    ld1 {v16.4s}, [x2]
    ld1 {v20.4s}, [x17], #16
    fmla v16.4s, v20.4s, v0.s[0]
    ld1 {v21.4s}, [x14], #16
    fmul v17.4s, v21.4s, v0.s[1]
    ld1 {v20.4s}, [x16], #16
    fmla v16.4s, v20.4s, v0.s[2]

    fadd v17.4s, v16.4s, v17.4s
    st1 {v17.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength3
    sub x2, x2, x8
    sub x12, x12, #3
    mov x17, x16                        // third block pointer of this path, already advanced
    cmp x12, #3
    bge LoopK3

LoopKStart:
    cmp x12, #0
    beq LoopKEnd

LoopK:
    ld1r {v31.4s}, [x13], x10           // broadcast the single B coefficient

    mov x11, x6
LoopLength:
    ld1 {v0.4s}, [x2]
    ld1 {v1.4s}, [x17], #16
    fmla v0.4s, v1.4s, v31.4s

    st1 {v0.4s}, [x2], #16
    subs x11, x11, #1
    bne LoopLength
    subs x12, x12, #1

    sub x2, x2, x8
    bne LoopK                           // flags from "subs x12" (sub keeps them)
LoopKEnd:
    subs x15, x15, #1
    add x2, x2, x8                      // next output block
    add x7, x7, #4 //sizeof(float)
    bne LoopW

    add x0, x0, x9                      // next h row group of S
    subs x4, x4, #1
    bne LoopH

    ldr x19, [sp], #16
    ret
#endif
|
Loading…
Reference in new issue