parent
6bf9732369
commit
ac1d19ce57
@ -0,0 +1,56 @@
|
||||
#ifdef __aarch64__
|
||||
|
||||
.text
|
||||
.align 5
|
||||
.global ConvDwFp16Border
|
||||
#ifndef __APPLE__
|
||||
.type ConvDwFp16Border, %function
|
||||
#endif
|
||||
|
||||
// void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
|
||||
// size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
|
||||
// size_t relu6)
|
||||
|
||||
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
|
||||
// x8: kernel_w, x9: relu, x10: relu6
|
||||
ConvDwFp16Border:
|
||||
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
||||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should be also preserved
|
||||
// whereas our coding style do not permit such amount of parameters
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
ldr x10, [sp, #16]
|
||||
|
||||
ld1 {v0.8h}, [x3] // bias
|
||||
movi v1.8h, #0x46, lsl #8 // relu 6
|
||||
dup v2.4s, wzr // relu
|
||||
|
||||
mov x13, x1
|
||||
mov x14, x2
|
||||
LoopH:
|
||||
mov x15, x13
|
||||
mov x16, x14
|
||||
mov x17, x5
|
||||
LoopW:
|
||||
ld1 {v3.8h}, [x15], x7
|
||||
ld1 {v4.8h}, [x16], #16
|
||||
fmla v0.8h, v3.8h, v4.8h
|
||||
subs x17, x17, #1
|
||||
bne LoopW
|
||||
subs x4, x4, #1
|
||||
add x13, x13, x6
|
||||
add x14, x14, x8
|
||||
bne LoopH
|
||||
cbnz x10, Relu6
|
||||
cbnz x9, Relu
|
||||
b Write
|
||||
Relu6:
|
||||
fmin v0.8h, v0.8h, v1.8h
|
||||
Relu:
|
||||
fmax v0.8h, v0.8h, v2.8h
|
||||
Write:
|
||||
st1 {v0.8h}, [x0]
|
||||
|
||||
ret
|
||||
#endif
|
Loading…
Reference in new issue