parent
2b23de6161
commit
e847569108
@ -0,0 +1,144 @@
|
||||
#ifdef __aarch64__
|
||||
.text
|
||||
.align 5
|
||||
.global MatmulFloatNeon64OptRemain
|
||||
#ifndef __APPLE__
|
||||
.type MatmulFloatNeon64OptRemain, %function
|
||||
#endif
|
||||
|
||||
// void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth,
//                                 int row, int col, size_t stride)
//
// Arguments (AAPCS64):
//   x0: a       lhs; every depth step reads 12 floats (48 bytes) from it
//   x1: b       rhs; every depth step reads 8 floats (32 bytes) from it
//   x2: c       destination
//   w3: depth   number of depth steps accumulated per output tile
//   w4: row     row <= 4 selects the 4-row kernel, otherwise the 8-row kernel
//   w5: col     consumed 8 columns at a time
//   x6: stride  scales both destination steps (see x8 / x11 below)
// only for winograd
//
// Derived values, all in bytes:
//   x9  = depth * 8 * sizeof(float)       rhs advance per 8-column block
//   x8  = col * stride * sizeof(float)    dst advance per stored row
//   x11 = 8 * stride * sizeof(float)      dst advance per 8-column block
//
// Working registers:
//   x10 rows left, x12 lhs cursor, x13 depth left, x14 dst cursor, x16 rhs cursor
//   v0-v2 lhs values, v3-v4 rhs values, v16-v31 accumulators
//
// Clobbers: x1-x5, x8-x14, x16, v0-v4, v16-v31, flags. No stack is used and no
// callee-saved register (x19-x28, v8-v15) is touched. x18 is deliberately not
// used because it is the platform register on Apple and Windows targets.
//
// Every loop tests its counter after the body, so each body runs at least
// once: callers must pass depth >= 1, row >= 1 and col >= 1.
MatmulFloatNeon64OptRemain:
    // depth/row/col are C ints: the upper halves of x3/x4/x5 are unspecified on
    // entry, so widen them before any 64-bit arithmetic or comparison.
    sxtw x3, w3
    sxtw x4, w4
    sxtw x5, w5

    lsl x9, x3, #5          // block stride of rhs: sizeof(float) * 8 * depth
    mul x8, x5, x6
    lsl x8, x8, #2          // dst row step: col * stride * sizeof(float)
    lsl x11, x6, #5         // dst column-block step: 8 * stride * sizeof(float)

    cmp x4, #4
    ble LoopH4              // row <= 4: use the 4-row kernel

// ---- 8-row kernel: one pass of LoopH8 per 8-column block of rhs ----
LoopH8:
    mov x10, x4             // reload lhs row
    mov x12, x0             // reload lhs ptr
    mov x14, x2             // reload dst ptr

LoopW8:
    mov x16, x1             // reload rhs ptr
    mov x13, x3             // reload depth
    // clear the 8x8 accumulator tile (row r lives in v(16+2r), v(17+2r))
    dup v16.4s, wzr
    dup v17.4s, wzr
    dup v18.4s, wzr
    dup v19.4s, wzr
    dup v20.4s, wzr
    dup v21.4s, wzr
    dup v22.4s, wzr
    dup v23.4s, wzr
    dup v24.4s, wzr
    dup v25.4s, wzr
    dup v26.4s, wzr
    dup v27.4s, wzr
    dup v28.4s, wzr
    dup v29.4s, wzr
    dup v30.4s, wzr
    dup v31.4s, wzr

LoopD8:
    // 12 lhs floats are consumed per depth step; only v0/v1 (8 values) feed
    // the fmla below, v2 is loaded but not used.
    ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48
    ld1 {v3.4s, v4.4s}, [x16], #32
    // rows 0-3: lhs lanes of v0 times rhs columns 0-3 (v3) and 4-7 (v4)
    fmla v16.4s, v3.4s, v0.s[0]
    fmla v18.4s, v3.4s, v0.s[1]
    fmla v20.4s, v3.4s, v0.s[2]
    fmla v22.4s, v3.4s, v0.s[3]
    fmla v17.4s, v4.4s, v0.s[0]
    fmla v19.4s, v4.4s, v0.s[1]
    fmla v21.4s, v4.4s, v0.s[2]
    fmla v23.4s, v4.4s, v0.s[3]
    // rows 4-7: lhs lanes of v1
    fmla v24.4s, v3.4s, v1.s[0]
    fmla v26.4s, v3.4s, v1.s[1]
    fmla v28.4s, v3.4s, v1.s[2]
    fmla v30.4s, v3.4s, v1.s[3]
    fmla v25.4s, v4.4s, v1.s[0]
    fmla v27.4s, v4.4s, v1.s[1]
    fmla v29.4s, v4.4s, v1.s[2]
    fmla v31.4s, v4.4s, v1.s[3]

    subs x13, x13, #1
    bgt LoopD8

    // store 8 rows of 8 floats; each store steps dst by one row (x8 bytes)
    st1 {v16.4s, v17.4s}, [x14], x8
    st1 {v18.4s, v19.4s}, [x14], x8
    st1 {v20.4s, v21.4s}, [x14], x8
    st1 {v22.4s, v23.4s}, [x14], x8
    st1 {v24.4s, v25.4s}, [x14], x8
    st1 {v26.4s, v27.4s}, [x14], x8
    st1 {v28.4s, v29.4s}, [x14], x8
    st1 {v30.4s, v31.4s}, [x14], x8

    subs x10, x10, #8       // lhs row - 8 (x12 and x14 keep advancing)
    bgt LoopW8

    subs x5, x5, #8         // rhs col - 8
    add x1, x1, x9          // rhs ptr + stride
    add x2, x2, x11         // dst ptr + one 8-column block
    bgt LoopH8              // flags still come from the subs above (add does not set them)

    ret

// ---- 4-row kernel: same structure, only rows 0-3 are accumulated/stored ----
LoopH4:
    mov x10, x4             // reload lhs row
    mov x12, x0             // reload lhs ptr
    mov x14, x2             // reload dst ptr

LoopW4:
    mov x16, x1             // reload rhs ptr
    mov x13, x3             // reload depth
    // clear the 4x8 accumulator tile
    dup v16.4s, wzr
    dup v17.4s, wzr
    dup v18.4s, wzr
    dup v19.4s, wzr
    dup v20.4s, wzr
    dup v21.4s, wzr
    dup v22.4s, wzr
    dup v23.4s, wzr

LoopD4:
    // same 12-float lhs step as the 8-row kernel; only v0 is used here
    ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48
    ld1 {v3.4s, v4.4s}, [x16], #32
    fmla v16.4s, v3.4s, v0.s[0]
    fmla v18.4s, v3.4s, v0.s[1]
    fmla v20.4s, v3.4s, v0.s[2]
    fmla v22.4s, v3.4s, v0.s[3]
    fmla v17.4s, v4.4s, v0.s[0]
    fmla v19.4s, v4.4s, v0.s[1]
    fmla v21.4s, v4.4s, v0.s[2]
    fmla v23.4s, v4.4s, v0.s[3]

    subs x13, x13, #1
    bgt LoopD4

    // store 4 rows of 8 floats; each store steps dst by one row (x8 bytes)
    st1 {v16.4s, v17.4s}, [x14], x8
    st1 {v18.4s, v19.4s}, [x14], x8
    st1 {v20.4s, v21.4s}, [x14], x8
    st1 {v22.4s, v23.4s}, [x14], x8

    subs x10, x10, #4       // lhs row - 4
    bgt LoopW4

    subs x5, x5, #8         // rhs col - 8
    add x1, x1, x9          // rhs ptr + stride
    add x2, x2, x11         // dst ptr + one 8-column block
    bgt LoopH4              // flags still come from the subs above
    ret
|
||||
#endif
|
Loading…
Reference in new issue