!6182 [MS][LITE][Develop] add arm32 fp32 DwBorder, Row, Center op
Merge pull request !6182 from liuzhongkai/arm32_new1pull/6182/MERGE
commit
6873b53043
@ -0,0 +1,63 @@
|
||||
#ifdef ENABLE_ARM32

.text
.align 5
.global ConvDwFp32Border
#ifndef __APPLE__
.type ConvDwFp32Border, %function
#endif

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height,
//                       size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
//                       size_t relu6)
//
// Accumulates a height x width window of 4-float vectors into one 4-float output:
//   acc = bias[0..3]
//   for each row h, for each column w: acc += src_vec(h, w) * weight_vec(h, w)
//   dst[0..3] = activation(acc)
//
// Register arguments: r0: dst, r1: src, r2: weight, r3: bias
// Stack arguments (loaded into): r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
//                                r8: kernel_w, r9: relu, r10: relu6
//
// in_kh_step, in_kw_step and kernel_w are added directly to the src / weight
// pointers, so the caller must pass them as byte offsets:
//   in_kw_step: src advance per column, in_kh_step: src advance per row,
//   kernel_w:   weight advance per row (within a row, weight advances 16 bytes per column).
// Activation: if relu6 != 0 the result is clamped to [0, 6]; otherwise if
// relu != 0 it is clamped to [0, +inf); otherwise it is stored unchanged.
// If height or width is 0, nothing is accumulated and activation(bias) is stored.
ConvDwFp32Border:
    // Callee-saved registers must be preserved according to
    // https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r4-r12, lr}                   // 10 core registers = 40 bytes
    vpush {q4-q7}                       // 4 quad registers  = 64 bytes

    // The stack arguments start right above the 104-byte save area. They are
    // addressed relative to the current sp; sp is never moved above the save
    // area, so the saved registers always stay inside the valid stack region.
    ldr r4, [sp, #104]                  // height
    ldr r5, [sp, #108]                  // width
    ldr r6, [sp, #112]                  // in_kh_step
    ldr r7, [sp, #116]                  // in_kw_step
    ldr r8, [sp, #120]                  // kernel_w
    ldr r9, [sp, #124]                  // relu
    ldr r10, [sp, #128]                 // relu6

    vld1.32 {q0}, [r3]                  // accumulator starts as bias
    vmov.i32 q1, #6
    vcvt.f32.s32 q1, q1                 // q1 = 6.0f in every lane (relu6 upper bound)
    veor q2, q2, q2                     // q2 = 0.0f in every lane (relu lower bound)

    // The loop counters below are decremented before being tested, so a zero
    // count would wrap around. Skip the accumulation for an empty window.
    cmp r4, #0
    beq Activation
    cmp r5, #0
    beq Activation

LoopH:
    mov r11, r1                         // src cursor for this row
    mov r12, r2                         // weight cursor for this row
    mov r14, r5                         // columns left in this row
LoopW:
    vld1.32 {q3}, [r11], r7             // load 4 src floats, step by in_kw_step
    vld1.32 {q4}, [r12]!                // load 4 weight floats, step by 16 bytes
    vmla.f32 q0, q3, q4
    subs r14, r14, #1
    bne LoopW
    subs r4, r4, #1                     // sets the flags tested by bne LoopH
    add r1, r1, r6                      // next src row (add leaves the flags untouched)
    add r2, r2, r8                      // next weight row
    bne LoopH

Activation:
    cmp r10, #0
    bne Relu6
    cmp r9, #0
    bne Relu
    b Write
Relu6:
    vmin.f32 q0, q0, q1                 // upper clamp, then fall through to the lower clamp
Relu:
    vmax.f32 q0, q0, q2
Write:
    vst1.32 {q0}, [r0]

    vpop {q4-q7}
    pop {r4-r12, pc}
#endif
|
@ -0,0 +1,113 @@
|
||||
#ifdef ENABLE_ARM32

.text
.align 5
.global ConvDwFp32Row
#ifndef __APPLE__
.type ConvDwFp32Row, %function
#endif

// void ConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr,
//                    size_t num_pixels, size_t input_channel, size_t input_step)
//
// For each of num_pixels pixels:
//   output[c] += input[c] * filter[c]   for c in [0, input_channel)
// After each pixel the output pointer continues where it stopped, the input
// pointer advances by input_step 4-byte elements, and the filter pointer
// restarts from filter_ptr.
//
// Register arguments: r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels
// Stack arguments (loaded into): r4: input_channel, r5: input_step
//
// Internal register roles:
//   r0:  output read cursor      r11: output write cursor
//   r6:  input cursor            r8:  filter cursor
//   r10: channels left in the current pixel
ConvDwFp32Row:
    // Callee-saved registers must be preserved according to
    // https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r4-r6, r8, r10, r11}          // 6 core registers = 24 bytes
    vpush {q4-q7}                       // 4 quad registers = 64 bytes

    // The stack arguments start right above the 88-byte save area. They are
    // addressed relative to the current sp; sp is never moved above the save
    // area, so the saved registers always stay inside the valid stack region.
    mov r11, r0
    ldr r4, [sp, #88]                   // input_channel
    ldr r5, [sp, #92]                   // input_step
    lsl r5, r5, #2                      // input_step in bytes (4 bytes per element)
    cmp r3, #0
    beq End

LoopNumPixel:
    mov r6, r1                          // input_ptr
    mov r8, r2                          // filter_ptr
    mov r10, r4                         // input_channel

    // 16-channel path. The first 8 channels of a group are loaded one step
    // ahead of their multiply-accumulate; LoopDepth16Out finishes the group
    // that is still pending when fewer than 16 channels remain.
LoopDepth16In:
    cmp r10, #16
    blt L4
    sub r10, r10, #16

    vld1.32 {q0, q1}, [r6]!
    vld1.32 {q4, q5}, [r8]!
    vld1.32 {q8, q9}, [r0]!

    cmp r10, #16
    blt LoopDepth16Out
LoopDepth16:
    vmla.f32 q8, q0, q4
    vmla.f32 q9, q1, q5
    vst1.32 {q8, q9}, [r11]!

    vld1.32 {q2, q3}, [r6]!
    vld1.32 {q6, q7}, [r8]!
    vld1.32 {q10, q11}, [r0]!
    vmla.f32 q10, q2, q6
    vmla.f32 q11, q3, q7
    vst1.32 {q10, q11}, [r11]!

    vld1.32 {q0, q1}, [r6]!             // preload the next group
    vld1.32 {q4, q5}, [r8]!
    vld1.32 {q8, q9}, [r0]!

    sub r10, r10, #16
    cmp r10, #16
    bge LoopDepth16

LoopDepth16Out:
    vmla.f32 q8, q0, q4
    vmla.f32 q9, q1, q5
    vst1.32 {q8, q9}, [r11]!

    vld1.32 {q2, q3}, [r6]!
    vld1.32 {q6, q7}, [r8]!
    vld1.32 {q10, q11}, [r0]!
    vmla.f32 q10, q2, q6
    vmla.f32 q11, q3, q7
    vst1.32 {q10, q11}, [r11]!

    // 4-channel path for what is left after the 16-channel groups.
L4:
    cmp r10, #4
    blt L0

LoopDepth4:
    vld1.32 {q0}, [r6]!
    vld1.32 {q4}, [r8]!
    vld1.32 {q8}, [r0]!
    vmla.f32 q8, q0, q4
    vst1.32 {q8}, [r11]!
    sub r10, r10, #4
    cmp r10, #4
    bge LoopDepth4

    // Scalar tail: one channel per iteration.
L0:
    cmp r10, #0
    beq Loop16LineEnd

LoopDepth0:
    // VLD1/VST1 operate on D registers, so single floats are moved through
    // D-register lanes: d0[0] is s0, d0[1] is s1 and d1[0] is s2.
    vld1.32 {d0[0]}, [r6]!              // s0 = input
    vld1.32 {d0[1]}, [r8]!              // s1 = filter
    vld1.32 {d1[0]}, [r0]!              // s2 = output
    vmla.f32 s2, s0, s1
    vst1.32 {d1[0]}, [r11]!
    subs r10, r10, #1
    bne LoopDepth0

Loop16LineEnd:
    subs r3, r3, #1                     // sets the flags tested by bne LoopNumPixel
    add r1, r1, r5                      // next pixel (add leaves the flags untouched)
    bne LoopNumPixel

End:
    vpop {q4-q7}
    pop {r4-r6, r8, r10, r11}
    bx lr
#endif
|
Loading…
Reference in new issue