!14676 [MS][LITE][Develop] Remove use of x18 on Apple devices

From: @lx0095
Reviewed-by: @hangangqiang, @zhang_xue_tong
Signed-off-by: @zhang_xue_tong
pull/14676/MERGE
Committed by mindspore-ci-bot via Gitee
commit 2513ed1ba7

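Background for the change: on Apple's AArch64 platform ABI, x18 is the reserved platform register, so hand-written assembly must not use it as a scratch register. The patch therefore rewrites every kernel that touched x18 to use a callee-saved register instead (x19–x29, whichever is free in that routine). Per AAPCS64 those registers must be preserved, so each affected prologue also reserves 16 more bytes of stack and stores one extra register pair, and each epilogue restores it. A minimal sketch of the pattern, following the save/restore style already used in these files (the routine name and the choice of x21/x22 are illustrative, not taken from the diff):

    asm_function ExampleKernel        // hypothetical routine name
        sub sp, sp, #32               // was #16: 16 extra bytes for the new pair
        stp x19, x20, [sp], #16       // pair that was already being preserved
        stp x21, x22, [sp], #16       // new pair; x21 takes over the old x18 role
        mov x21, #48                  // previously: mov x18, #48
        mul x17, x5, x21              // previously: mul x17, x5, x18
        // ... kernel body, with every former x18 use rewritten to x21 ...
        sub sp, sp, #32               // step back over the saved area
        ldp x19, x20, [sp], #16
        ldp x21, x22, [sp], #16
        ret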
@ -28,11 +28,11 @@ asm_function AdderFloatNeon64
ldr x8, [sp]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x20, #48 // sizeof(float) * 12
mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x18, #4
mul x8, x8, x18
mov x20, #4
mul x8, x8, x20
LoopRowStart:
cmp x6, #4
@ -595,9 +595,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x20, #4
mul x20, x20, x7
sub x11, x11, x20
mov x2, x11
subs x6, x6, #12
bgt LoopRowStart

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Neon64
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ld1 {v11.8b}, [x16], x4
ld1 {v13.8b}, [x17], x4
ld1 {v14.8b}, [x17], x4
ld1 {v15.8b}, [x17], x4
ld1 {v17.8b}, [x18], x4
ld1 {v18.8b}, [x18], x4
ld1 {v19.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
ld1 {v18.8b}, [x25], x4
ld1 {v19.8b}, [x25], x4
ld1 {v21.4s}, [x3]
ld1 {v22.4s}, [x19]
@ -123,13 +124,13 @@ HEIGHT1_LOOP:
ld1 {v16.8b}, [x17]
smlal v23.4s, v0.4h, v10.4h
smlal2 v24.4s, v0.8h, v10.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
add x1, x1, x21
ssubl v12.8h, v12.8b, v25.8b
smlal v21.4s, v1.4h, v10.4h
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
smlal2 v22.4s, v1.8h, v10.8h
ld1 {v9.8b}, [x16], x4
ssubl v16.8h, v16.8b, v25.8b
@ -159,17 +160,17 @@ HEIGHT1_LOOP:
smlal2 v24.4s, v5.8h, v16.8h
smlal v21.4s, v6.4h, v17.4h
smlal2 v22.4s, v6.8h, v17.8h
ld1 {v17.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
smlal v23.4s, v6.4h, v18.4h
smlal2 v24.4s, v6.8h, v18.8h
smlal v21.4s, v7.4h, v18.4h
smlal2 v22.4s, v7.8h, v18.8h
ld1 {v18.8b}, [x18], x4
ld1 {v18.8b}, [x25], x4
smlal v23.4s, v7.4h, v19.4h
smlal2 v24.4s, v7.8h, v19.8h
smlal v21.4s, v8.4h, v19.4h
smlal2 v22.4s, v8.8h, v19.8h
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
smlal v23.4s, v8.4h, v20.4h
smlal2 v24.4s, v8.8h, v20.8h
@ -278,7 +279,7 @@ WIDTH2_LEFT:
smlal2 v24.4s, v1.8h, v11.8h
smlal v21.4s, v2.4h, v11.4h
smlal2 v22.4s, v2.8h, v11.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
smlal v23.4s, v2.4h, v12.4h
smlal2 v24.4s, v2.8h, v12.8h
smlal v21.4s, v3.4h, v13.4h
@ -443,12 +444,13 @@ OUTZP3:
st1 {v21.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Stride2
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ssubl v9.8h, v9.8b, v28.8b
@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2
ssubl v14.8h, v14.8b, v28.8b
ld1 {v16.8b}, [x17], x4
ssubl v15.8h, v15.8b, v28.8b
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
ssubl v16.8h, v16.8b, v28.8b
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
ssubl v19.8h, v19.8b, v28.8b
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
ssubl v21.8h, v21.8b, v28.8b
@ -108,7 +109,7 @@ HEIGHT1_LOOP:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -117,7 +118,7 @@ HEIGHT1_LOOP:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
mov v9.16b, v13.16b
@ -157,12 +158,12 @@ HEIGHT1_LOOP:
smlal2 v27.4s, v6.8h, v21.8h
smlal v24.4s, v7.4h, v20.4h
smlal2 v25.4s, v7.8h, v20.8h
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
smlal v26.4s, v7.4h, v22.4h
smlal2 v27.4s, v7.8h, v22.8h
smlal v24.4s, v8.4h, v21.4h
smlal2 v25.4s, v8.8h, v21.8h
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
smlal v26.4s, v8.4h, v23.4h
ssubl v21.8h, v21.8b, v28.8b
@ -260,7 +261,7 @@ WIDTH2_LEFT:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -269,7 +270,7 @@ WIDTH2_LEFT:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
ssubl v23.8h, v23.8b, v28.8b
@ -452,11 +453,12 @@ OUTZP3:
st1 {v24.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -19,12 +19,13 @@ asm_function ConvDwFp32Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -72,7 +73,7 @@ asm_function ConvDwFp32Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -109,7 +110,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v14.4s, v22.4s, v25.4s
fmla v15.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -192,7 +193,7 @@ asm_function ConvDwFp32Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -213,7 +214,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -261,13 +262,13 @@ asm_function ConvDwFp32Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.4s}, [x22], x13
ld1 {v25.4s}, [x17], #16
fmla v0.4s, v16.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -290,11 +291,12 @@ asm_function ConvDwFp32Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -13,8 +13,9 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect3x3
sub sp, sp, #16
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3
ldp x12, x13, [x1]
ldp x14, x15, [x1, #16]
ldp x16, x17, [x1, #32]
ldp x18, x19, [x1, #48]
ldp x21, x19, [x1, #48]
ldr x20, [x1, #64]
mov x9, x2
mov x10, x3
@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3
cmp x5, #0
bgt LoopPixel
End:
sub sp, sp, #16
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

@ -13,17 +13,18 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect5x5
sub sp, sp, #160
sub sp, sp, #176
stp x19, x20, [sp, #64]
stp x21, x22, [sp, #80]
stp x23, x24, [sp, #96]
stp x25, x26, [sp, #112]
stp x27, x28, [sp, #128]
stp x29, x30, [sp, #144]
ldrb w8, [sp, #160]
ldrb w8, [sp, #176]
stp x2, x3, [sp]
stp x4, x6, [sp, #16]
stp x7, x8, [sp, #32]
stp x0, x1, [sp, #160]
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5
ldp x12, x13, [x1, #48]
ldp x14, x15, [x1, #64]
ldp x16, x17, [x1, #80]
ldp x18, x19, [x1, #96]
ldp x0, x19, [x1, #96]
ldp x20, x21, [x1, #112]
ldp x22, x23, [x1, #128]
ldp x24, x25, [x1, #144]
@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5
RELU:
fmax v29.4s, v29.4s, v30.4s
WRITE:
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
ldr x4, [sp, #56]
ld1 {v29.4s}, [x4], #16
@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5
LeftWrite:
cmp x2, #4
bne Write3
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
b NextPixel
Write3:
sxtw x2, w2
tbnz w2, #1, Write2
tbnz w2, #0, Write1
Write2:
st1 {v29.2s}, [x0], #8
ldr x4, [sp, #160]
st1 {v29.2s}, [x4], #8
str x4, [sp, #160]
ext v29.16b, v29.16b, v29.16b, #8
tbz w2, #0, NextPixel
Write1:
str s29, [x0], #4
ldr x4, [sp, #160]
str s29, [x4], #4
str x4, [sp, #160]
NextPixel:
ldr x2, [sp, #24]
@ -279,6 +288,6 @@ End:
ldp x25, x26, [sp, #112]
ldp x27, x28, [sp, #128]
ldp x29, x30, [sp, #144]
add sp, sp, #160
add sp, sp, #176
ret
#endif

@ -22,12 +22,13 @@ asm_function ConvDwInt8Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -51,9 +52,9 @@ asm_function ConvDwInt8Center
ld1 {v24.4s}, [x17], #16
ld1 {v25.4s}, [x17], #16
ldr x18, [sp, #80] // right shift
ld1 {v26.4s}, [x18], #16
ld1 {v27.4s}, [x18], #16
ldr x25, [sp, #80] // right shift
ld1 {v26.4s}, [x25], #16
ld1 {v27.4s}, [x25], #16
ldr x19, [sp, #88] // acc_min
ld1 {v28.4s}, [x19], #16
@ -90,7 +91,7 @@ asm_function ConvDwInt8Center
mov v6.16b, v17.16b
mov v7.16b, v18.16b
LoopKh4:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw4:
mov x22, x21
@ -116,7 +117,7 @@ asm_function ConvDwInt8Center
smlal v6.4s, v8.4h, v16.4h
smlal2 v7.4s, v8.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw4
add x16, x16, x12
@ -194,15 +195,15 @@ asm_function ConvDwInt8Center
mov x16, x3
add x17, x16, x9
add x18, x17, x9
add x21, x18, x9
add x25, x17, x9
add x21, x25, x9
st1 {v0.s}[0], [x16], #4
st1 {v1.s}[0], [x16], #4
st1 {v2.s}[0], [x17], #4
st1 {v3.s}[0], [x17], #4
st1 {v4.s}[0], [x18], #4
st1 {v5.s}[0], [x18], #4
st1 {v4.s}[0], [x25], #4
st1 {v5.s}[0], [x25], #4
st1 {v6.s}[0], [x21], #4
st1 {v7.s}[0], [x21], #4
@ -221,7 +222,7 @@ asm_function ConvDwInt8Center
mov v0.16b, v17.16b
mov v1.16b, v18.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v15.8b}, [x22], x13
@ -229,7 +230,7 @@ asm_function ConvDwInt8Center
ld1 {v16.8h}, [x17], #16
smlal v0.4s, v14.4h, v16.4h
smlal2 v1.4s, v14.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -271,11 +272,12 @@ asm_function ConvDwInt8Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -47,11 +47,11 @@ asm_function ConvSwFp32Center
LoopH:
mov x17, x1
mov x18, x5
mov x28, x5
mov x3, x0
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
blt LoopW8
LoopW16:
@ -244,12 +244,12 @@ asm_function ConvSwFp32Center
st1 {v14.4s}, [x3], x9
st1 {v15.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #16
cmp x18, #0
sub x28, x28, #16
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
bge LoopW16
LoopW8:
mov x19, #8
@ -369,10 +369,10 @@ asm_function ConvSwFp32Center
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #8
cmp x18, #0
sub x28, x28, #8
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
bge LoopW8
LoopW:
mov x20, x17
@ -427,7 +427,7 @@ asm_function ConvSwFp32Center
Write:
st1 {v0.4s}, [x3], x9
add x17, x17, x12
subs x18, x18, #1
subs x28, x28, #1
bne LoopW
LoopWEnd:
add x0, x0, x8

@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.4s}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

File diff suppressed because it is too large.

@ -21,31 +21,32 @@
// x9: writeMode
asm_function MatmulFloatNeon64Opt
sub sp, sp, #144
sub sp, sp, #160
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x21, #48 // sizeof(float) * 12
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #32
mul x16, x6, x18 // row * 8 * sizeof(float)
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #4
mov x21, #4
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float)
mov x18, #32
mul x16, x8, x18 // kernel_size * 8 * sizeof(float)
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x18, #4
mul x8, x8, x18
mov x21, #4
mul x8, x8, x21
LoopRowStart:
cmp x6, #4
@ -1117,9 +1118,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1129,9 +1130,10 @@ LoopColEnd:
subs x6, x6, #12
bgt LoopRowStart
sub sp, sp, #144
sub sp, sp, #160
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

@ -67,7 +67,7 @@ L2:
cmp w16, #0
beq End2
mov x18, x1 // reload b ptr
mov x28, x1 // reload b ptr
mov x19, x7 // reload bias ptr
mov w20, w5 // reload depth
dup v16.4s, wzr
@ -94,10 +94,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x28], #16
ld1 {v5.16b}, [x28], #16
ld1 {v6.16b}, [x28], #16
ld1 {v7.16b}, [x28], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b

@ -30,7 +30,7 @@
// x28: filter_zp
asm_function MatmulInt8Opt
sub sp, sp, #208
sub sp, sp, #224
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
@ -38,6 +38,7 @@ asm_function MatmulInt8Opt
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
stp x27, x28, [sp], #16
stp x29, x30, [sp], #16
ldr w8, [sp]
ldr w9, [sp, #8]
@ -55,7 +56,7 @@ asm_function MatmulInt8Opt
LoopRow:
mov x16, x1 // reload rhs ptr
mov x17, x4 // reload rhs col
mov x18, x7 // reload bias ptr
mov x29, x7 // reload bias ptr
mov x27, x2 // reload dst ptr
ldr x28, [sp, #64] // reload filter_zp
@ -158,7 +159,7 @@ LoopRow:
Bias:
cbz x7, NoBias
ld1 {v15.4s}, [x18], #16
ld1 {v15.4s}, [x29], #16
add v16.4s, v16.4s, v15.4s
add v17.4s, v17.4s, v15.4s
add v18.4s, v18.4s, v15.4s
@ -330,7 +331,7 @@ LoopColEnd:
b LoopRow
LoopRowEnd:
sub sp, sp, #208
sub sp, sp, #224
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
@ -338,5 +339,6 @@ LoopRowEnd:
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ldp x27, x28, [sp], #16
ldp x29, x30, [sp], #16
ret
#endif

@ -20,9 +20,10 @@
// x7: bias
asm_function MatMulR4Int8Neon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
mov w15, #0 // b col index
mov w16, #0 // a row index
@ -40,7 +41,7 @@ L2:
cmp w16, w3
beq End2
mov x18, x1 // reload b ptr
mov x19, x1 // reload b ptr
mov x10, x7 // reload bias ptr
mov w11, w5 // reload depth
dup v16.4s, wzr
@ -67,10 +68,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x19], #16
ld1 {v5.16b}, [x19], #16
ld1 {v6.16b}, [x19], #16
ld1 {v7.16b}, [x19], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b
@ -172,8 +173,9 @@ End2:
b L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd
mov x14, x1 // mat_b
LoopN:
mov x16, x0 // mat_a_m
sub x18, x5, x15 // ni
sub x22, x5, x15 // ni
sub x19, x17, x3 // mi
mul x18, x18, x17 // ni * m
mul x22, x22, x17 // ni * m
mov x11, x6 // in_channel
add x18, x18, x19 // (ni * m) + mi
mul x18, x18, x7 // x18 * c4_channel
add x20, x2, x18 // dst + offset
add x22, x22, x19 // (ni * m) + mi
mul x22, x22, x7 // x22 * c4_channel
add x20, x2, x22 // dst + offset
cmp x11, #16
bge LoopC16
cmp x11, #8

@ -1,6 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15

@ -55,16 +55,16 @@ LoopH:
ld1 {v0.s}[2], [x17], x10
ld1 {v0.s}[3], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
add x19, x16, x7
LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
@ -90,14 +90,14 @@ LoopH:
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]

@ -18,6 +18,9 @@ asm_function WinogradTransRight
//x5: k
//x6: length
sub sp, sp, #16
stp x19, x20, [sp], #16
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8
mul x9, x5, x8 // step for S
@ -43,7 +46,7 @@ LoopH:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x18, x4
mov x19, x4
LoopK4:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
@ -54,7 +57,7 @@ LoopH:
add x14, x17, x8
add x16, x14, x8
add x18, x16, x8
add x19, x16, x8
LoopLength4:
ld1 {v16.4s}, [x2]
@ -64,7 +67,7 @@ LoopH:
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
@ -73,7 +76,7 @@ LoopH:
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x18
mov x17, x19
cmp x12, #4
bge LoopK4
@ -107,7 +110,7 @@ LoopH:
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
mov x17, x18
mov x17, x19
cmp x12, #3
bge LoopK3
@ -141,5 +144,7 @@ LoopH:
subs x4, x4, #1
bne LoopH
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret
#endif

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global ConvDwFp32Avx3x3
@ -31,7 +32,7 @@
// 56: input_stride
// 64: relu
// 72: relu6
ConvDwFp32Avx3x3:
asm_function ConvDwFp32Avx3x3
pushq %r15
pushq %r14
pushq %r13

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global MatmulFloatAvxOpt
@ -34,7 +35,7 @@
// 72: stride
// 80: writeMode
MatmulFloatAvxOpt:
asm_function MatmulFloatAvxOpt
// rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention
pushq %r15
pushq %r14

@ -19,12 +19,13 @@ asm_function ConvDwFp16Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDwFp16Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -108,7 +109,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v14.8h, v22.8h, v25.8h
fmla v15.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -191,7 +192,7 @@ asm_function ConvDwFp16Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -212,7 +213,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -260,13 +261,13 @@ asm_function ConvDwFp16Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.8h}, [x22], x13
ld1 {v25.8h}, [x17], #16
fmla v0.8h, v16.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -289,11 +290,12 @@ asm_function ConvDwFp16Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.8h}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.8h}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center
st1 {v0.8h}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ r29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #128
sub sp, sp, #144
// performance between storing 4 registers at the same time and separately storing them on in-order cores
// is not tested yet
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
ldr x8, [sp, #0]
ldr x9, [sp, #8]
@ -548,87 +549,87 @@ IndirectGemmStart:
b WriteEnd
Write7:
add x17, x15, #8
add x18, x15, #10
add x19, x15, #10
add x16, x15, #12
st1 {v16.4h}, [x15], x7
ins v0.s[0], v16.s[2]
st1 {v0.h}[0], [x17], x7
st1 {v0.h}[1], [x18], x7
st1 {v0.h}[1], [x19], x7
st1 {v16.h}[6], [x16], x7
st1 {v17.4h}, [x15], x7
ins v1.s[0], v17.s[2]
st1 {v1.h}[0], [x17], x7
st1 {v1.h}[1], [x18], x7
st1 {v1.h}[1], [x19], x7
st1 {v17.h}[6], [x16], x7
st1 {v18.4h}, [x15], x7
ins v2.s[0], v18.s[2]
st1 {v2.h}[0], [x17], x7
st1 {v2.h}[1], [x18], x7
st1 {v2.h}[1], [x19], x7
st1 {v18.h}[6], [x16], x7
st1 {v19.4h}, [x15], x7
ins v3.s[0], v19.s[2]
st1 {v3.h}[0], [x17], x7
st1 {v3.h}[1], [x18], x7
st1 {v3.h}[1], [x19], x7
st1 {v19.h}[6], [x16], x7
st1 {v20.4h}, [x15], x7
ins v4.s[0], v20.s[2]
st1 {v4.h}[0], [x17], x7
st1 {v4.h}[1], [x18], x7
st1 {v4.h}[1], [x19], x7
st1 {v20.h}[6], [x16], x7
st1 {v21.4h}, [x15], x7
ins v5.s[0], v21.s[2]
st1 {v5.h}[0], [x17], x7
st1 {v5.h}[1], [x18], x7
st1 {v5.h}[1], [x19], x7
st1 {v21.h}[6], [x16], x7
st1 {v22.4h}, [x15], x7
ins v6.s[0], v22.s[2]
st1 {v6.h}[0], [x17], x7
st1 {v6.h}[1], [x18], x7
st1 {v6.h}[1], [x19], x7
st1 {v22.h}[6], [x16], x7
st1 {v23.4h}, [x15], x7
ins v7.s[0], v23.s[2]
st1 {v7.h}[0], [x17], x7
st1 {v7.h}[1], [x18], x7
st1 {v7.h}[1], [x19], x7
st1 {v23.h}[6], [x16], x7
st1 {v24.4h}, [x15], x7
ins v8.s[0], v24.s[2]
st1 {v8.h}[0], [x17], x7
st1 {v8.h}[1], [x18], x7
st1 {v8.h}[1], [x19], x7
st1 {v24.h}[6], [x16], x7
st1 {v25.4h}, [x15], x7
ins v9.s[0], v25.s[2]
st1 {v9.h}[0], [x17], x7
st1 {v9.h}[1], [x18], x7
st1 {v9.h}[1], [x19], x7
st1 {v25.h}[6], [x16], x7
st1 {v26.4h}, [x15], x7
ins v10.s[0], v26.s[2]
st1 {v10.h}[0], [x17], x7
st1 {v10.h}[1], [x18], x7
st1 {v10.h}[1], [x19], x7
st1 {v26.h}[6], [x16], x7
st1 {v27.4h}, [x15], x7
ins v11.s[0], v27.s[2]
st1 {v11.h}[0], [x17], x7
st1 {v11.h}[1], [x18], x7
st1 {v11.h}[1], [x19], x7
st1 {v27.h}[6], [x16], x7
st1 {v28.4h}, [x15], x7
ins v12.s[0], v28.s[2]
st1 {v12.h}[0], [x17], x7
st1 {v12.h}[1], [x18], x7
st1 {v12.h}[1], [x19], x7
st1 {v28.h}[6], [x16], x7
st1 {v29.4h}, [x15], x7
ins v13.s[0], v29.s[2]
st1 {v13.h}[0], [x17], x7
st1 {v13.h}[1], [x18], x7
st1 {v13.h}[1], [x19], x7
st1 {v29.h}[6], [x16], x7
st1 {v30.4h}, [x15], x7
ins v14.s[0], v30.s[2]
st1 {v14.h}[0], [x17], x7
st1 {v14.h}[1], [x18], x7
st1 {v14.h}[1], [x19], x7
st1 {v30.h}[6], [x16], x7
st1 {v31.4h}, [x15]
ins v15.s[0], v31.s[2]
st1 {v15.h}[0], [x17]
st1 {v15.h}[1], [x18]
st1 {v15.h}[1], [x19]
st1 {v31.h}[6], [x16]
add x0, x0, #14
b WriteEnd
@ -661,9 +662,10 @@ IndirectGemmStart:
NoStepForward:
bgt LoopOc
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

File diff suppressed because it is too large.

@ -21,30 +21,31 @@
// x9: writeMode
asm_function MatmulFp16Neon64Opt
sub sp, sp, #80
sub sp, sp, #96
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #32 // sizeof(float16_t) * 16
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
mov x21, #32 // sizeof(float16_t) * 16
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #16
mul x16, x6, x18 // row * 8 * sizeof(float16_t)
mov x21, #16
mul x16, x6, x21 // row * 8 * sizeof(float16_t)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #2
mov x21, #2
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float16_t)
mov x18, #16
mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t)
mul x15, x15, x21 // kernel_size * col *sizeof(float16_t)
mov x21, #16
mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t)
NoWinoSteps:
mov x18, #2
mul x8, x8, x18
mov x21, #2
mul x8, x8, x21
LoopRowStart:
cmp x6, #1
@ -1221,9 +1222,9 @@ LoopRow:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #2
mul x18, x18, x7
sub x11, x11, x18
mov x21, #2
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1233,8 +1234,9 @@ LoopColEnd:
subs x6, x6, #16
bgt LoopRowStart
sub sp, sp, #80
sub sp, sp, #96
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

Some files were not shown because too many files have changed in this diff.
