You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mindspore/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S

295 lines
9.7 KiB

#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwFp16Center
#ifndef __APPLE__
    .type ConvDwFp16Center, %function
#endif

//-----------------------------------------------------------------------------
// void ConvDwFp16Center(float16_t *dst, const float16_t *src,
//                       const float16_t *weight, const float16_t *bias,
//                       size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w,
//                       size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step,
//                       size_t relu, size_t relu6);
//
// Depthwise convolution over one block of 8 fp16 channels. For every output
// pixel the accumulator starts from the 8 bias values and receives
// src * weight for each of the kernel_h * kernel_w taps; relu / relu6 are
// applied before the 8 results are stored.
//
// ABI: AAPCS64. The first eight arguments arrive in x0-x7, the rest on the
// stack. All *_step arguments and block_channel are added directly to byte
// addresses, i.e. they are byte strides.
//
// Preconditions: kernel_h >= 1 and kernel_w >= 1 (the tap loops are
// do-while loops). height == 0 or width == 0 returns without touching dst.
//
// Register roles
//   x0  dst base of the current output row    x1  src base of the current row
//   x2  weight base                           x3  bias on entry, then dst cursor
//   x4  rows left                             x5  width
//   x6  kernel_h                              x7  kernel_w
//   x8  out_h_step                            x9  block_channel (dst pixel stride)
//   x10 in_sh_step                            x11 in_sw_step
//   x12 in_kh_step                            x13 in_kw_step
//   x14 relu                                  x15 relu6
//   x16 src cursor for the current kernel row x17 weight cursor
//   x19 src advance for the current tile      x20 kernel rows left
//   x21 src cursor for the current kernel tap x22 per-pixel load cursor
//   x23 src cursor for the current tile       x24 pixels left in the row
//   x25 kernel columns left
//   v0-v15 accumulators, v16-v23 input pixels, v24 bias, v25 weight tap,
//   v26 = 6.0 (fp16 0x4600), v27 = 0.0
//
// x18 is deliberately not used: it is the platform register and is reserved
// on several AArch64 platforms.
//-----------------------------------------------------------------------------
ConvDwFp16Center:
    // Allocate a real frame so the saved registers live at or above sp.
    // 128 bytes keeps sp 16-byte aligned:
    //   [sp, #0  .. #63 ]  d8-d15  (low halves of v8-v15 are callee-saved)
    //   [sp, #64 .. #111]  x19-x24
    //   [sp, #112]         x25
    sub sp, sp, #128
    stp d8, d9, [sp]
    stp d10, d11, [sp, #16]
    stp d12, d13, [sp, #32]
    stp d14, d15, [sp, #48]
    stp x19, x20, [sp, #64]
    stp x21, x22, [sp, #80]
    stp x23, x24, [sp, #96]
    str x25, [sp, #112]

    // Stack-passed arguments sit directly above the frame.
    ldp x8, x9, [sp, #128]      // out_h_step, block_channel
    ldp x10, x11, [sp, #144]    // in_sh_step, in_sw_step
    ldp x12, x13, [sp, #160]    // in_kh_step, in_kw_step
    ldp x14, x15, [sp, #176]    // relu, relu6

    // Nothing to compute for an empty output; the loops below are do-while
    // loops and would wrap their counters.
    cbz x4, End
    cbz x5, End

    ld1 {v24.8h}, [x3]              // bias; x3 is reused as dst cursor below
    movi v26.8h, #0x46, lsl #8      // 0x4600 = 6.0 in fp16
    movi v27.8h, #0                 // 0.0

LoopH:
    mov x23, x1                     // src cursor for this row
    mov x24, x5                     // pixels left in this row
    mov x3, x0                      // dst cursor for this row
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    blt LoopW8

    // ---- 16 output pixels per iteration --------------------------------
LoopW16:
    lsl x19, x11, #4                // src advance = 16 * in_sw_step
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
    mov v8.16b, v24.16b
    mov v9.16b, v24.16b
    mov v10.16b, v24.16b
    mov v11.16b, v24.16b
    mov v12.16b, v24.16b
    mov v13.16b, v24.16b
    mov v14.16b, v24.16b
    mov v15.16b, v24.16b
LoopKh16:
    mov x25, x7
    mov x21, x16
LoopKw16:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16        // next weight tap (8 channels)
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v8.8h, v16.8h, v25.8h
    fmla v9.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v10.8h, v18.8h, v25.8h
    fmla v11.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v12.8h, v20.8h, v25.8h
    fmla v13.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v14.8h, v22.8h, v25.8h
    fmla v15.8h, v23.8h, v25.8h
    subs x25, x25, #1
    add x21, x21, x13               // next kernel column (add keeps flags)
    bne LoopKw16
    add x16, x16, x12               // next kernel row
    subs x20, x20, #1
    bne LoopKh16
    cbnz x15, Relu616
    cbnz x14, Relu16
    b Write16
Relu616:
    // relu6 = min(x, 6) here, then falls through into max(x, 0)
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
    fmin v8.8h, v8.8h, v26.8h
    fmin v9.8h, v9.8h, v26.8h
    fmin v10.8h, v10.8h, v26.8h
    fmin v11.8h, v11.8h, v26.8h
    fmin v12.8h, v12.8h, v26.8h
    fmin v13.8h, v13.8h, v26.8h
    fmin v14.8h, v14.8h, v26.8h
    fmin v15.8h, v15.8h, v26.8h
Relu16:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
    fmax v8.8h, v8.8h, v27.8h
    fmax v9.8h, v9.8h, v27.8h
    fmax v10.8h, v10.8h, v27.8h
    fmax v11.8h, v11.8h, v27.8h
    fmax v12.8h, v12.8h, v27.8h
    fmax v13.8h, v13.8h, v27.8h
    fmax v14.8h, v14.8h, v27.8h
    fmax v15.8h, v15.8h, v27.8h
Write16:
    st1 {v0.8h}, [x3], x9
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    st1 {v8.8h}, [x3], x9
    st1 {v9.8h}, [x3], x9
    st1 {v10.8h}, [x3], x9
    st1 {v11.8h}, [x3], x9
    st1 {v12.8h}, [x3], x9
    st1 {v13.8h}, [x3], x9
    st1 {v14.8h}, [x3], x9
    st1 {v15.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #16
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    bge LoopW16

    // ---- 8 output pixels per iteration ---------------------------------
LoopW8:
    lsl x19, x11, #3                // src advance = 8 * in_sw_step
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
LoopKh8:
    mov x25, x7
    mov x21, x16
LoopKw8:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    subs x25, x25, #1
    add x21, x21, x13
    bne LoopKw8
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh8
    cbnz x15, Relu68
    cbnz x14, Relu8
    b Write8
Relu68:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
Relu8:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
Write8:
    st1 {v0.8h}, [x3], x9
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #8
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    bge LoopW8

    // ---- one output pixel per iteration (1..7 pixels left) -------------
LoopW:
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
LoopKh:
    mov x25, x7
    mov x22, x16
LoopKw:
    ld1 {v16.8h}, [x22], x13
    ld1 {v25.8h}, [x17], #16
    fmla v0.8h, v16.8h, v25.8h
    subs x25, x25, #1
    bne LoopKw
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh
    cbnz x15, Relu6
    cbnz x14, Relu
    b Write
Relu6:
    fmin v0.8h, v0.8h, v26.8h
Relu:
    fmax v0.8h, v0.8h, v27.8h
Write:
    st1 {v0.8h}, [x3], x9
    add x23, x23, x11
    subs x24, x24, #1
    bne LoopW

LoopWEnd:
    add x0, x0, x8                  // next output row
    add x1, x1, x10                 // next input row
    subs x4, x4, #1
    bne LoopH

End:
    ldp d8, d9, [sp]
    ldp d10, d11, [sp, #16]
    ldp d12, d13, [sp, #32]
    ldp d14, d15, [sp, #48]
    ldp x19, x20, [sp, #64]
    ldp x21, x22, [sp, #80]
    ldp x23, x24, [sp, #96]
    ldr x25, [sp, #112]
    add sp, sp, #128
    ret
#endif