You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mindspore/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S

260 lines
7.7 KiB

// ---------------------------------------------------------------------------
// TiledC4MatmulFp16 -- AArch64 NEON half-precision tiled matrix multiply.
//
// Syntax/target: GNU as, AArch64, AAPCS64. Uses the half-precision vector
// forms of fmul/fmla (.4h), so the target must support FP16 arithmetic.
//
// Register roles on entry (descriptive names only; the C prototype is not
// visible in this file -- TODO confirm against the declaring header):
//   x0  output pointer
//   x1  source pointer; re-read from its start for every output block
//   x2  weight pointer; consumed sequentially
//   x3  distance between consecutive output blocks, in fp16 elements
//       (converted to bytes below)
//   x4  number of inner (reduction) steps; must be >= 1, both inner loops
//       are do-while shaped and do not handle 0
//   x5  number of output blocks; 0 returns without touching src/weight/dst
//
// Per inner step the kernel reads 8 source vectors of 4 halves (64 bytes)
// and, per output block, 4 weight vectors of 4 halves (32 bytes), then
// accumulates for each source vector t:
//   acc[t] += w0 * src[t][0] + w1 * src[t][1] + w2 * src[t][2] + w3 * src[t][3]
// Each output block is stored as 8 vectors of 4 halves (64 bytes).
// Output blocks are processed two at a time (LoopOc); a trailing odd block
// is handled by LoopOcHalf.
//
// Scratch: x6-x10, v0-v7, v16-v31, condition flags.
// Stack:   128 bytes, holding v8-v15 (callee-saved under AAPCS64).
// ---------------------------------------------------------------------------
.text
.align 5
.global TiledC4MatmulFp16
#ifndef __APPLE__
.type TiledC4MatmulFp16, %function
#endif
TiledC4MatmulFp16:
    // Spill v8-v15. sp stays lowered for the whole function so the saved
    // registers always live at or above sp; memory below sp may be
    // overwritten asynchronously (e.g. by signal delivery).
    sub sp, sp, #128
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
    add x9, sp, #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]

    cbz x5, LoopOcEnd               // no output blocks: nothing to compute

    lsl x3, x3, #1                  // output stride: fp16 elements -> bytes (2 bytes each)
    lsl x10, x4, #5                 // weight bytes per output block (32 per inner step)
    cmp x5, #2
    blt LoopOcHalf

// Two output blocks per pass.
//   x8 = source cursor, x2 = weight cursor for the first block,
//   x6 = weight cursor for the second block (x2 + x10),
//   x9 = inner steps remaining after the one currently in flight.
//   v16-v23 accumulate the first block, v24-v31 the second.
LoopOc:
    mov x8, x1
    subs x9, x4, #1                 // flags are consumed by the beq below; nothing in between sets them
    add x6, x2, x10

    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x8], #32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x2], #32
    // The first step uses fmul for lane 0, which initialises the
    // accumulators without a separate zeroing pass.
    fmul v16.4h, v8.4h, v0.h[0]
    fmul v17.4h, v8.4h, v1.h[0]
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x8], #32
    fmul v18.4h, v8.4h, v2.h[0]
    fmul v19.4h, v8.4h, v3.h[0]
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x6], #32
    fmul v20.4h, v8.4h, v4.h[0]
    fmul v21.4h, v8.4h, v5.h[0]
    fmul v22.4h, v8.4h, v6.h[0]
    fmul v23.4h, v8.4h, v7.h[0]
    fmul v24.4h, v12.4h, v0.h[0]
    fmul v25.4h, v12.4h, v1.h[0]
    fmul v26.4h, v12.4h, v2.h[0]
    fmul v27.4h, v12.4h, v3.h[0]
    fmul v28.4h, v12.4h, v4.h[0]
    fmul v29.4h, v12.4h, v5.h[0]
    fmul v30.4h, v12.4h, v6.h[0]
    fmul v31.4h, v12.4h, v7.h[0]

    beq LoopIcEnd                   // only one inner step: go straight to the tail

// Software-pipelined body: finish lanes 1-3 of the step already loaded,
// then load the next step and issue its lane-0 products.
LoopIc:
    // Prefetch hints for upcoming weight and source data.
    add x2, x2, #64
    prfm pldl1keep, [x2]
    prfm pldl1keep, [x2, x10]
    sub x2, x2, #64
    prfm pldl1keep, [x8, #64]
    prfm pldl1keep, [x8, #96]

    // lane 1
    fmla v16.4h, v9.4h, v0.h[1]
    fmla v17.4h, v9.4h, v1.h[1]
    fmla v18.4h, v9.4h, v2.h[1]
    fmla v19.4h, v9.4h, v3.h[1]
    fmla v20.4h, v9.4h, v4.h[1]
    fmla v21.4h, v9.4h, v5.h[1]
    fmla v22.4h, v9.4h, v6.h[1]
    fmla v23.4h, v9.4h, v7.h[1]
    fmla v24.4h, v13.4h, v0.h[1]
    fmla v25.4h, v13.4h, v1.h[1]
    fmla v26.4h, v13.4h, v2.h[1]
    fmla v27.4h, v13.4h, v3.h[1]
    fmla v28.4h, v13.4h, v4.h[1]
    fmla v29.4h, v13.4h, v5.h[1]
    fmla v30.4h, v13.4h, v6.h[1]
    fmla v31.4h, v13.4h, v7.h[1]

    // lane 2
    fmla v16.4h, v10.4h, v0.h[2]
    fmla v17.4h, v10.4h, v1.h[2]
    fmla v18.4h, v10.4h, v2.h[2]
    fmla v19.4h, v10.4h, v3.h[2]
    fmla v20.4h, v10.4h, v4.h[2]
    fmla v21.4h, v10.4h, v5.h[2]
    fmla v22.4h, v10.4h, v6.h[2]
    fmla v23.4h, v10.4h, v7.h[2]
    fmla v24.4h, v14.4h, v0.h[2]
    fmla v25.4h, v14.4h, v1.h[2]
    fmla v26.4h, v14.4h, v2.h[2]
    fmla v27.4h, v14.4h, v3.h[2]
    fmla v28.4h, v14.4h, v4.h[2]
    fmla v29.4h, v14.4h, v5.h[2]
    fmla v30.4h, v14.4h, v6.h[2]
    fmla v31.4h, v14.4h, v7.h[2]

    // lane 3
    fmla v16.4h, v11.4h, v0.h[3]
    fmla v17.4h, v11.4h, v1.h[3]
    fmla v18.4h, v11.4h, v2.h[3]
    fmla v19.4h, v11.4h, v3.h[3]
    fmla v20.4h, v11.4h, v4.h[3]
    fmla v21.4h, v11.4h, v5.h[3]
    fmla v22.4h, v11.4h, v6.h[3]
    fmla v23.4h, v11.4h, v7.h[3]
    fmla v24.4h, v15.4h, v0.h[3]
    fmla v25.4h, v15.4h, v1.h[3]
    fmla v26.4h, v15.4h, v2.h[3]
    fmla v27.4h, v15.4h, v3.h[3]
    fmla v28.4h, v15.4h, v4.h[3]
    fmla v29.4h, v15.4h, v5.h[3]
    fmla v30.4h, v15.4h, v6.h[3]
    fmla v31.4h, v15.4h, v7.h[3]

    // Load the next step and issue its lane-0 products.
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x2], #32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x8], #32
    fmla v16.4h, v8.4h, v0.h[0]
    fmla v17.4h, v8.4h, v1.h[0]
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x8], #32
    fmla v18.4h, v8.4h, v2.h[0]
    fmla v19.4h, v8.4h, v3.h[0]
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x6], #32
    fmla v20.4h, v8.4h, v4.h[0]
    fmla v21.4h, v8.4h, v5.h[0]
    fmla v22.4h, v8.4h, v6.h[0]
    fmla v23.4h, v8.4h, v7.h[0]
    fmla v24.4h, v12.4h, v0.h[0]
    fmla v25.4h, v12.4h, v1.h[0]
    fmla v26.4h, v12.4h, v2.h[0]
    fmla v27.4h, v12.4h, v3.h[0]
    fmla v28.4h, v12.4h, v4.h[0]
    fmla v29.4h, v12.4h, v5.h[0]
    fmla v30.4h, v12.4h, v6.h[0]
    fmla v31.4h, v12.4h, v7.h[0]

    subs x9, x9, #1
    bne LoopIc

// Tail: lanes 1-3 of the last loaded step, interleaved with the stores.
LoopIcEnd:
    // lane 1
    fmla v16.4h, v9.4h, v0.h[1]
    fmla v17.4h, v9.4h, v1.h[1]
    fmla v18.4h, v9.4h, v2.h[1]
    fmla v19.4h, v9.4h, v3.h[1]
    fmla v20.4h, v9.4h, v4.h[1]
    fmla v21.4h, v9.4h, v5.h[1]
    fmla v22.4h, v9.4h, v6.h[1]
    fmla v23.4h, v9.4h, v7.h[1]
    fmla v24.4h, v13.4h, v0.h[1]
    fmla v25.4h, v13.4h, v1.h[1]
    fmla v26.4h, v13.4h, v2.h[1]
    fmla v27.4h, v13.4h, v3.h[1]
    fmla v28.4h, v13.4h, v4.h[1]
    fmla v29.4h, v13.4h, v5.h[1]
    fmla v30.4h, v13.4h, v6.h[1]
    fmla v31.4h, v13.4h, v7.h[1]

    // lane 2
    fmla v16.4h, v10.4h, v0.h[2]
    fmla v17.4h, v10.4h, v1.h[2]
    fmla v18.4h, v10.4h, v2.h[2]
    fmla v19.4h, v10.4h, v3.h[2]
    fmla v20.4h, v10.4h, v4.h[2]
    fmla v21.4h, v10.4h, v5.h[2]
    fmla v22.4h, v10.4h, v6.h[2]
    fmla v23.4h, v10.4h, v7.h[2]
    fmla v24.4h, v14.4h, v0.h[2]
    fmla v25.4h, v14.4h, v1.h[2]
    fmla v26.4h, v14.4h, v2.h[2]
    fmla v27.4h, v14.4h, v3.h[2]
    fmla v28.4h, v14.4h, v4.h[2]
    fmla v29.4h, v14.4h, v5.h[2]
    fmla v30.4h, v14.4h, v6.h[2]
    fmla v31.4h, v14.4h, v7.h[2]

    add x7, x0, #32                 // x7 = second 32-byte half of the current output block

    // lane 3
    fmla v16.4h, v11.4h, v0.h[3]
    fmla v17.4h, v11.4h, v1.h[3]
    fmla v18.4h, v11.4h, v2.h[3]
    fmla v19.4h, v11.4h, v3.h[3]
    fmla v20.4h, v11.4h, v4.h[3]
    fmla v21.4h, v11.4h, v5.h[3]
    fmla v22.4h, v11.4h, v6.h[3]
    fmla v23.4h, v11.4h, v7.h[3]
    fmla v24.4h, v15.4h, v0.h[3]
    fmla v25.4h, v15.4h, v1.h[3]
    fmla v26.4h, v15.4h, v2.h[3]
    fmla v27.4h, v15.4h, v3.h[3]
    fmla v28.4h, v15.4h, v4.h[3]
    st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], x3     // first block, bytes 0-31
    fmla v29.4h, v15.4h, v5.h[3]
    st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x7], x3     // first block, bytes 32-63
    fmla v30.4h, v15.4h, v6.h[3]
    st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], x3     // second block, bytes 0-31
    mov x2, x6                      // skip the second block's weights, already consumed via x6
    fmla v31.4h, v15.4h, v7.h[3]
    st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x7]         // second block, bytes 32-63

    subs x5, x5, #2
    beq LoopOcEnd
    cmp x5, #2
    bge LoopOc

// Single remaining output block: x8 = source cursor, x9 = inner steps left,
// v16-v23 = accumulators (zeroed first, so every step uses fmla).
LoopOcHalf:
    mov x8, x1
    mov x9, x4
    dup v16.4s, wzr
    dup v17.4s, wzr
    dup v18.4s, wzr
    dup v19.4s, wzr
    dup v20.4s, wzr
    dup v21.4s, wzr
    dup v22.4s, wzr
    dup v23.4s, wzr

LoopIcHalf:
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x2], #32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x8], #32
    // lane 0
    fmla v16.4h, v8.4h, v0.h[0]
    fmla v17.4h, v8.4h, v1.h[0]
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x8], #32
    fmla v18.4h, v8.4h, v2.h[0]
    fmla v19.4h, v8.4h, v3.h[0]
    fmla v20.4h, v8.4h, v4.h[0]
    fmla v21.4h, v8.4h, v5.h[0]
    fmla v22.4h, v8.4h, v6.h[0]
    fmla v23.4h, v8.4h, v7.h[0]

    // lane 1
    fmla v16.4h, v9.4h, v0.h[1]
    fmla v17.4h, v9.4h, v1.h[1]
    fmla v18.4h, v9.4h, v2.h[1]
    fmla v19.4h, v9.4h, v3.h[1]
    fmla v20.4h, v9.4h, v4.h[1]
    fmla v21.4h, v9.4h, v5.h[1]
    fmla v22.4h, v9.4h, v6.h[1]
    fmla v23.4h, v9.4h, v7.h[1]

    // lane 2
    fmla v16.4h, v10.4h, v0.h[2]
    fmla v17.4h, v10.4h, v1.h[2]
    fmla v18.4h, v10.4h, v2.h[2]
    fmla v19.4h, v10.4h, v3.h[2]
    fmla v20.4h, v10.4h, v4.h[2]
    fmla v21.4h, v10.4h, v5.h[2]
    fmla v22.4h, v10.4h, v6.h[2]
    fmla v23.4h, v10.4h, v7.h[2]

    // lane 3
    fmla v16.4h, v11.4h, v0.h[3]
    fmla v17.4h, v11.4h, v1.h[3]
    fmla v18.4h, v11.4h, v2.h[3]
    fmla v19.4h, v11.4h, v3.h[3]
    fmla v20.4h, v11.4h, v4.h[3]
    fmla v21.4h, v11.4h, v5.h[3]
    fmla v22.4h, v11.4h, v6.h[3]
    fmla v23.4h, v11.4h, v7.h[3]

    subs x9, x9, #1
    bne LoopIcHalf

    st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32
    st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32

// Restore v8-v15 and release the 128-byte frame (two post-indexed loads of
// 64 bytes each bring sp back to its value on entry).
LoopOcEnd:
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ret