!7643 [MSLITE][Develop] optimize arm cpu int8 op conv dw 3x3: add assembly arm32
Merge pull request !7643 from yangruoqi713/conv_dwpull/7643/MERGE
commit
9d9f98768b
@ -0,0 +1,116 @@
|
||||
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDw3x3BorderPixelInt8
#ifndef __APPLE__
.type ConvDw3x3BorderPixelInt8, %function
#endif

// void ConvDw3x3BorderPixelInt8(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias,
//                               size_t height, size_t width, size_t in_kh_step, size_t in_kw_step,
//                               size_t channel, size_t in_zp, size_t out_zp, size_t out_multiplier,
//                               size_t left_shift, size_t right_shift, size_t acc_min, size_t acc_max)
//
// Target: AArch32 with NEON, AAPCS32 calling convention.
//
// For each block of 8 channels this routine accumulates
//     acc[c] = bias[c] + sum over (h, w) of (src[h][w][c] - in_zp) * weight[h][w][c]
// over a height x width window, requantizes the eight 32-bit accumulators and
// stores eight int8 results to dst.
//
// Arguments:
//   r0            dst     advanced by 8 bytes per channel block
//   r1            src     advanced by 8 bytes per channel block
//   r2            weight  advanced by 16 bytes (8 x int16) per channel block
//   r3            bias    advanced by 32 bytes (8 x int32) per channel block
//   stack +0      height          number of window rows (reloaded into r4 per block)
//   stack +4      width           number of window columns (reloaded into r5 per row)
//   stack +8      in_kh_step      byte step between src rows            -> r6
//   stack +12     in_kw_step      byte step between src columns         -> r7
//   stack +16     channel         remaining channel count               -> r8
//   stack +20     in_zp           only the low byte is read             -> d18 (8 x i8)
//   stack +24     out_zp                                                -> q15
//   stack +28     out_multiplier                                        -> q14
//   stack +32     left_shift                                            -> q13
//   stack +36     right_shift                                           -> q12
//   stack +40     acc_min                                               -> q11
//   stack +44     acc_max                                               -> q10
//
// Working registers:
//   r9 / r10      src / weight pointer at the start of the current window row
//   r11 / r12     src / weight pointer of the current window column
//   lr            channel * 2: byte step between two weight columns
//   q3, q4        32-bit accumulators for channels 0-3 and 4-7 of the block
//
// Layout implied by the strides used below: a weight row is 3 columns wide
// (row step = 3 * channel * 2 bytes) regardless of `width`.
//
// Preconditions (not checked here):
//   - height >= 1 and width >= 1: both loops test their counter after
//     decrementing it, so a zero count would wrap around.
//   - One 8-channel block is always processed; further blocks are processed
//     while at least 8 channels remain.
//     NOTE(review): presumably callers pass a multiple of 8 - confirm.
//   - vrshl shifts right only for negative counts, so right_shift is
//     presumably passed as a value <= 0 - confirm against the caller.
//
// Stack frame: 10 core registers (40 bytes) plus q4-q7 (64 bytes) are saved,
// so the first stack argument is found at [sp, #104] for the whole routine.
// SP is deliberately NOT moved back above the save area: AAPCS32 only permits
// accesses at or above SP, and anything below it may be overwritten
// asynchronously (for example by a signal handler).
ConvDw3x3BorderPixelInt8:
    // At return, clang generates "push {lr}, pop {pc}" while gcc generates "bx lr"
    // (see https://stackoverflow.com/questions/53625807). Saving lr is required
    // here anyway because lr is used as a general purpose register below.
    // AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf) makes r4-r11
    // and q4-q7 callee-saved; r12 is pushed as well, which keeps the frame a
    // multiple of 8 bytes.
    push {r4-r8, r9-r12, lr}
    vpush {q4-q7}

    ldr r6, [sp, #112]              // in_kh_step
    ldr r7, [sp, #116]              // in_kw_step
    ldr r8, [sp, #120]              // channel

    ldrb r10, [sp, #124]            // in_zp (low byte only)
    vdup.8 d18, r10
    ldr r10, [sp, #128]             // out_zp
    vdup.32 q15, r10
    ldr r10, [sp, #132]             // out_multiplier
    vdup.32 q14, r10
    ldr r10, [sp, #136]             // left_shift
    vdup.32 q13, r10
    ldr r10, [sp, #140]             // right_shift
    vdup.32 q12, r10
    ldr r10, [sp, #144]             // acc_min
    vdup.32 q11, r10
    ldr r10, [sp, #148]             // acc_max
    vdup.32 q10, r10

    add lr, r8, r8                  // lr = channel * sizeof(int16_t)

LoopC:
    mov r9, r1                      // restart the window for this channel block
    mov r10, r2
    ldr r4, [sp, #104]              // r4 = height

    vld1.32 {q3}, [r3]!             // accumulators start from the bias
    vld1.32 {q4}, [r3]!
LoopH:
    mov r11, r9
    mov r12, r10
    ldr r5, [sp, #108]              // r5 = width
LoopW:
    vld1.8 {d0}, [r11], r7          // 8 input channels, then step to the next column
    vld1.16 {d2, d3}, [r12], lr     // 8 weights, then step to the next column
    vsubl.s8 q2, d0, d18            // widen to s16 while subtracting in_zp

    vmlal.s16 q3, d4, d2
    vmlal.s16 q4, d5, d3

    subs r5, r5, #1
    bne LoopW

    add r9, r9, r6                  // next src row
    add r10, r10, lr                // next weight row: r10 += 3 * lr
    add r10, r10, lr, lsl #1
    subs r4, r4, #1
    bne LoopH

    // Requantize channels 0-3.
    vshl.s32 q3, q3, q13            // apply left_shift
    vqrdmulh.s32 q3, q3, q14        // saturating rounding doubling high multiply
    vand q5, q3, q12                // sign bit set only if value and shift count are both negative
    vshr.s32 q5, q5, #31            // -> -1 for those lanes, 0 otherwise
    vqadd.s32 q3, q3, q5            // pre-adjust negative values before the rounding shift
    vrshl.s32 q3, q3, q12           // rounding shift by right_shift
    vadd.i32 q3, q3, q15            // add out_zp
    vmax.s32 q3, q3, q11            // clamp to [acc_min, acc_max]
    vmin.s32 q3, q3, q10
    vqmovn.s32 d14, q3              // saturating narrow to s16

    // Requantize channels 4-7 (same sequence).
    vshl.s32 q4, q4, q13
    vqrdmulh.s32 q4, q4, q14
    vand q6, q4, q12
    vshr.s32 q6, q6, #31
    vqadd.s32 q4, q4, q6
    vrshl.s32 q4, q4, q12
    vadd.i32 q4, q4, q15
    vmax.s32 q4, q4, q11
    vmin.s32 q4, q4, q10
    vqmovn.s32 d15, q4

    vqmovn.s16 d16, q7              // q7 = d14:d15, saturating narrow to s8
    vst1.8 {d16}, [r0]!

    add r1, r1, #8                  // next 8 channels of src
    add r2, r2, #16                 // next 8 channels of weight

    sub r8, r8, #8
    cmp r8, #8
    bge LoopC                       // signed on purpose: exits if the remainder went negative

    vpop {q4-q7}
    pop {r4-r8, r9-r12, pc}
#endif
#endif
|
Loading…
Reference in new issue