diff --git a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S index df9e94fffa..621fd6eeac 100644 --- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S @@ -28,11 +28,11 @@ asm_function AdderFloatNeon64 ldr x8, [sp] - mov x18, #48 // sizeof(float) * 12 - mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth + mov x20, #48 // sizeof(float) * 12 + mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth - mov x18, #4 - mul x8, x8, x18 + mov x20, #4 + mul x8, x8, x20 LoopRowStart: cmp x6, #4 @@ -595,9 +595,9 @@ LoopRow4: LoopColEnd: add x0, x0, x17 - mov x18, #4 - mul x18, x18, x7 - sub x11, x11, x18 + mov x20, #4 + mul x20, x20, x7 + sub x11, x11, x20 mov x2, x11 subs x6, x6, #12 bgt LoopRowStart diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S index 3b46f4d810..391401e88f 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S @@ -33,12 +33,13 @@ // w16: per_channel asm_function ConvDw3x3Int8Neon64 - sub sp, sp, #176 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] @@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64 mov x16, x1 add x17, x16, x5 - add x18, x17, x5 + add x25, x17, x5 ld1 {v9.8b}, [x16], x4 ld1 {v10.8b}, [x16], x4 ld1 {v11.8b}, [x16], x4 ld1 {v13.8b}, [x17], x4 ld1 {v14.8b}, [x17], x4 ld1 {v15.8b}, [x17], x4 - ld1 {v17.8b}, [x18], x4 - ld1 {v18.8b}, [x18], x4 - ld1 {v19.8b}, [x18], x4 + ld1 {v17.8b}, [x25], x4 + ld1 {v18.8b}, [x25], x4 + ld1 {v19.8b}, [x25], x4 ld1 {v21.4s}, [x3] ld1 {v22.4s}, [x19] @@ -123,13 +124,13 @@ HEIGHT1_LOOP: ld1 {v16.8b}, [x17] smlal v23.4s, v0.4h, v10.4h smlal2 v24.4s, 
v0.8h, v10.8h - ld1 {v20.8b}, [x18] + ld1 {v20.8b}, [x25] add x1, x1, x21 ssubl v12.8h, v12.8b, v25.8b smlal v21.4s, v1.4h, v10.4h mov x16, x1 add x17, x16, x5 - add x18, x17, x5 + add x25, x17, x5 smlal2 v22.4s, v1.8h, v10.8h ld1 {v9.8b}, [x16], x4 ssubl v16.8h, v16.8b, v25.8b @@ -159,17 +160,17 @@ HEIGHT1_LOOP: smlal2 v24.4s, v5.8h, v16.8h smlal v21.4s, v6.4h, v17.4h smlal2 v22.4s, v6.8h, v17.8h - ld1 {v17.8b}, [x18], x4 + ld1 {v17.8b}, [x25], x4 smlal v23.4s, v6.4h, v18.4h smlal2 v24.4s, v6.8h, v18.8h smlal v21.4s, v7.4h, v18.4h smlal2 v22.4s, v7.8h, v18.8h - ld1 {v18.8b}, [x18], x4 + ld1 {v18.8b}, [x25], x4 smlal v23.4s, v7.4h, v19.4h smlal2 v24.4s, v7.8h, v19.8h smlal v21.4s, v8.4h, v19.4h smlal2 v22.4s, v8.8h, v19.8h - ld1 {v19.8b}, [x18], x4 + ld1 {v19.8b}, [x25], x4 smlal v23.4s, v8.4h, v20.4h smlal2 v24.4s, v8.8h, v20.8h @@ -278,7 +279,7 @@ WIDTH2_LEFT: smlal2 v24.4s, v1.8h, v11.8h smlal v21.4s, v2.4h, v11.4h smlal2 v22.4s, v2.8h, v11.8h - ld1 {v20.8b}, [x18] + ld1 {v20.8b}, [x25] smlal v23.4s, v2.4h, v12.4h smlal2 v24.4s, v2.8h, v12.8h smlal v21.4s, v3.4h, v13.4h @@ -443,12 +444,13 @@ OUTZP3: st1 {v21.8b}, [x0], x6 End: - sub sp, sp, #176 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S index 8f843192db..2162ade6bb 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S @@ -33,12 +33,13 @@ // w16: per_channel asm_function ConvDw3x3Int8Stride2 - sub sp, sp, #176 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr x8, 
[sp] ldr x9, [sp, #8] @@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2 mov x16, x1 add x17, x16, x5 - add x18, x17, x5 + add x25, x17, x5 ld1 {v9.8b}, [x16], x4 ld1 {v10.8b}, [x16], x4 ssubl v9.8h, v9.8b, v28.8b @@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2 ssubl v14.8h, v14.8b, v28.8b ld1 {v16.8b}, [x17], x4 ssubl v15.8h, v15.8b, v28.8b - ld1 {v19.8b}, [x18], x4 + ld1 {v19.8b}, [x25], x4 ssubl v16.8h, v16.8b, v28.8b - ld1 {v20.8b}, [x18], x4 + ld1 {v20.8b}, [x25], x4 ssubl v19.8h, v19.8b, v28.8b - ld1 {v21.8b}, [x18], x4 + ld1 {v21.8b}, [x25], x4 ssubl v20.8h, v20.8b, v28.8b ssubl v21.8h, v21.8b, v28.8b @@ -108,7 +109,7 @@ HEIGHT1_LOOP: ld1 {v17.8b}, [x17], x4 ssubl v12.8h, v12.8b, v28.8b smlal v26.4s, v0.4h, v11.4h - ld1 {v22.8b}, [x18], x4 + ld1 {v22.8b}, [x25], x4 ssubl v17.8h, v17.8b, v28.8b smlal2 v27.4s, v0.8h, v11.8h ld1 {v13.8b}, [x16], x4 @@ -117,7 +118,7 @@ HEIGHT1_LOOP: ld1 {v18.8b}, [x17], x4 ssubl v13.8h, v13.8b, v28.8b smlal2 v25.4s, v1.8h, v10.8h - ld1 {v23.8b}, [x18], x4 + ld1 {v23.8b}, [x25], x4 ssubl v18.8h, v18.8b, v28.8b smlal v26.4s, v1.4h, v12.4h mov v9.16b, v13.16b @@ -157,12 +158,12 @@ HEIGHT1_LOOP: smlal2 v27.4s, v6.8h, v21.8h smlal v24.4s, v7.4h, v20.4h smlal2 v25.4s, v7.8h, v20.8h - ld1 {v20.8b}, [x18], x4 + ld1 {v20.8b}, [x25], x4 smlal v26.4s, v7.4h, v22.4h smlal2 v27.4s, v7.8h, v22.8h smlal v24.4s, v8.4h, v21.4h smlal2 v25.4s, v8.8h, v21.8h - ld1 {v21.8b}, [x18], x4 + ld1 {v21.8b}, [x25], x4 ssubl v20.8h, v20.8b, v28.8b smlal v26.4s, v8.4h, v23.4h ssubl v21.8h, v21.8b, v28.8b @@ -260,7 +261,7 @@ WIDTH2_LEFT: ld1 {v17.8b}, [x17], x4 ssubl v12.8h, v12.8b, v28.8b smlal v26.4s, v0.4h, v11.4h - ld1 {v22.8b}, [x18], x4 + ld1 {v22.8b}, [x25], x4 ssubl v17.8h, v17.8b, v28.8b smlal2 v27.4s, v0.8h, v11.8h ld1 {v13.8b}, [x16], x4 @@ -269,7 +270,7 @@ WIDTH2_LEFT: ld1 {v18.8b}, [x17], x4 ssubl v13.8h, v13.8b, v28.8b smlal2 v25.4s, v1.8h, v10.8h - ld1 {v23.8b}, [x18], x4 + ld1 {v23.8b}, [x25], x4 ssubl v18.8h, v18.8b, v28.8b smlal 
v26.4s, v1.4h, v12.4h ssubl v23.8h, v23.8b, v28.8b @@ -452,11 +453,12 @@ OUTZP3: st1 {v24.8b}, [x0], x6 End: - sub sp, sp, #176 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S index c43932f5ec..d4e6be641e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S @@ -19,12 +19,13 @@ asm_function ConvDwFp32Center // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved // whereas our coding style do not permit such amount of parameters - sub sp, sp, #176 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] @@ -72,7 +73,7 @@ asm_function ConvDwFp32Center mov v14.16b, v24.16b mov v15.16b, v24.16b LoopKh16: - mov x18, x7 + mov x25, x7 mov x21, x16 LoopKw16: mov x22, x21 @@ -109,7 +110,7 @@ asm_function ConvDwFp32Center ld1 {v23.4s}, [x22], x11 fmla v14.4s, v22.4s, v25.4s fmla v15.4s, v23.4s, v25.4s - subs x18, x18, #1 + subs x25, x25, #1 add x21, x21, x13 bne LoopKw16 add x16, x16, x12 @@ -192,7 +193,7 @@ asm_function ConvDwFp32Center mov v6.16b, v24.16b mov v7.16b, v24.16b LoopKh8: - mov x18, x7 + mov x25, x7 mov x21, x16 LoopKw8: mov x22, x21 @@ -213,7 +214,7 @@ asm_function ConvDwFp32Center ld1 {v23.4s}, [x22], x11 fmla v6.4s, v22.4s, v25.4s fmla v7.4s, v23.4s, v25.4s - subs x18, x18, #1 + subs x25, x25, #1 add x21, x21, x13 bne LoopKw8 add x16, x16, x12 @@ -261,13 +262,13 @@ asm_function ConvDwFp32Center mov x20, x6 mov v0.16b, 
v24.16b LoopKh: - mov x18, x7 + mov x25, x7 mov x22, x16 LoopKw: ld1 {v16.4s}, [x22], x13 ld1 {v25.4s}, [x17], #16 fmla v0.4s, v16.4s, v25.4s - subs x18, x18, #1 + subs x25, x25, #1 bne LoopKw add x16, x16, x12 subs x20, x20, #1 @@ -290,11 +291,12 @@ asm_function ConvDwFp32Center subs x4, x4, #1 bne LoopH - sub sp, sp, #176 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S index a60a27fe05..246d8bfab4 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S @@ -13,8 +13,9 @@ // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 asm_function ConvDwFp32Indirect3x3 - sub sp, sp, #16 + sub sp, sp, #32 stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 movi v31.4s, #6 scvtf v31.4s, v31.4s @@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3 ldp x12, x13, [x1] ldp x14, x15, [x1, #16] ldp x16, x17, [x1, #32] - ldp x18, x19, [x1, #48] + ldp x21, x19, [x1, #48] ldr x20, [x1, #64] mov x9, x2 mov x10, x3 @@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3 ld1 {v5.4s}, [x17], #16 ld1 {v22.4s}, [x9], #16 fmla v29.4s, v3.4s, v20.4s - ld1 {v6.4s}, [x18], #16 + ld1 {v6.4s}, [x21], #16 ld1 {v23.4s}, [x9], #16 fmla v29.4s, v4.4s, v21.4s ld1 {v7.4s}, [x19], #16 @@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3 ld1 {v5.4s}, [x17], #16 ld1 {v22.4s}, [x9], #16 fmla v29.4s, v3.4s, v20.4s - ld1 {v6.4s}, [x18], #16 + ld1 {v6.4s}, [x21], #16 ld1 {v23.4s}, [x9], #16 fmla v29.4s, v4.4s, v21.4s ld1 {v7.4s}, [x19], #16 @@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3 cmp x5, #0 bgt LoopPixel End: - sub sp, sp, #16 + sub sp, sp, #32 ldp 
x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S index 5e1045aa72..6ff7307f78 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S @@ -13,17 +13,18 @@ // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 asm_function ConvDwFp32Indirect5x5 - sub sp, sp, #160 + sub sp, sp, #176 stp x19, x20, [sp, #64] stp x21, x22, [sp, #80] stp x23, x24, [sp, #96] stp x25, x26, [sp, #112] stp x27, x28, [sp, #128] stp x29, x30, [sp, #144] - ldrb w8, [sp, #160] + ldrb w8, [sp, #176] stp x2, x3, [sp] stp x4, x6, [sp, #16] stp x7, x8, [sp, #32] + stp x0, x1, [sp, #160] movi v31.4s, #6 scvtf v31.4s, v31.4s @@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5 ldp x12, x13, [x1, #48] ldp x14, x15, [x1, #64] ldp x16, x17, [x1, #80] - ldp x18, x19, [x1, #96] + ldp x0, x19, [x1, #96] ldp x20, x21, [x1, #112] ldp x22, x23, [x1, #128] ldp x24, x25, [x1, #144] @@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5 ld1 {v1.4s}, [x17], #16 ld1 {v19.4s}, [x5], #16 fmla v29.4s, v7.4s, v25.4s - ld1 {v2.4s}, [x18], #16 + ld1 {v2.4s}, [x0], #16 ld1 {v20.4s}, [x5], #16 fmla v29.4s, v16.4s, v26.4s ld1 {v3.4s}, [x19], #16 @@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5 RELU: fmax v29.4s, v29.4s, v30.4s WRITE: - st1 {v29.4s}, [x0], #16 + ldr x4, [sp, #160] + st1 {v29.4s}, [x4], #16 + str x4, [sp, #160] ldr x4, [sp, #56] ld1 {v29.4s}, [x4], #16 @@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5 ld1 {v1.4s}, [x17], #16 ld1 {v19.4s}, [x5], #16 fmla v29.4s, v7.4s, v25.4s - ld1 {v2.4s}, [x18], #16 + ld1 {v2.4s}, [x0], #16 ld1 {v20.4s}, [x5], #16 fmla v29.4s, v16.4s, v26.4s ld1 {v3.4s}, [x19], #16 @@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5 LeftWrite: cmp x2, #4 bne Write3 - st1 {v29.4s}, [x0], #16 + 
ldr x4, [sp, #160] + st1 {v29.4s}, [x4], #16 + str x4, [sp, #160] b NextPixel Write3: sxtw x2, w2 tbnz w2, #1, Write2 tbnz w2, #0, Write1 Write2: - st1 {v29.2s}, [x0], #8 + ldr x4, [sp, #160] + st1 {v29.2s}, [x4], #8 + str x4, [sp, #160] ext v29.16b, v29.16b, v29.16b, #8 tbz w2, #0, NextPixel Write1: - str s29, [x0], #4 + ldr x4, [sp, #160] + str s29, [x4], #4 + str x4, [sp, #160] NextPixel: ldr x2, [sp, #24] @@ -279,6 +288,6 @@ End: ldp x25, x26, [sp, #112] ldp x27, x28, [sp, #128] ldp x29, x30, [sp, #144] - add sp, sp, #160 + add sp, sp, #176 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S index 03fd8afe0c..017732e7ca 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S @@ -22,12 +22,13 @@ asm_function ConvDwInt8Center // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved // whereas our coding style do not permit such amount of parameters - sub sp, sp, #176 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] @@ -51,9 +52,9 @@ asm_function ConvDwInt8Center ld1 {v24.4s}, [x17], #16 ld1 {v25.4s}, [x17], #16 - ldr x18, [sp, #80] // right shift - ld1 {v26.4s}, [x18], #16 - ld1 {v27.4s}, [x18], #16 + ldr x25, [sp, #80] // right shift + ld1 {v26.4s}, [x25], #16 + ld1 {v27.4s}, [x25], #16 ldr x19, [sp, #88] // acc_min ld1 {v28.4s}, [x19], #16 @@ -90,7 +91,7 @@ asm_function ConvDwInt8Center mov v6.16b, v17.16b mov v7.16b, v18.16b LoopKh4: - mov x18, x7 + mov x25, x7 mov x21, x16 LoopKw4: mov x22, x21 @@ -116,7 +117,7 @@ asm_function ConvDwInt8Center smlal v6.4s, v8.4h, v16.4h smlal2 v7.4s, v8.8h, v16.8h - subs x18, x18, #1 + subs x25, 
x25, #1 add x21, x21, x13 bne LoopKw4 add x16, x16, x12 @@ -194,15 +195,15 @@ asm_function ConvDwInt8Center mov x16, x3 add x17, x16, x9 - add x18, x17, x9 - add x21, x18, x9 + add x25, x17, x9 + add x21, x25, x9 st1 {v0.s}[0], [x16], #4 st1 {v1.s}[0], [x16], #4 st1 {v2.s}[0], [x17], #4 st1 {v3.s}[0], [x17], #4 - st1 {v4.s}[0], [x18], #4 - st1 {v5.s}[0], [x18], #4 + st1 {v4.s}[0], [x25], #4 + st1 {v5.s}[0], [x25], #4 st1 {v6.s}[0], [x21], #4 st1 {v7.s}[0], [x21], #4 @@ -221,7 +222,7 @@ asm_function ConvDwInt8Center mov v0.16b, v17.16b mov v1.16b, v18.16b LoopKh: - mov x18, x7 + mov x25, x7 mov x22, x16 LoopKw: ld1 {v15.8b}, [x22], x13 @@ -229,7 +230,7 @@ asm_function ConvDwInt8Center ld1 {v16.8h}, [x17], #16 smlal v0.4s, v14.4h, v16.4h smlal2 v1.4s, v14.8h, v16.8h - subs x18, x18, #1 + subs x25, x25, #1 bne LoopKw add x16, x16, x12 subs x20, x20, #1 @@ -271,11 +272,12 @@ asm_function ConvDwInt8Center subs x4, x4, #1 bne LoopH - sub sp, sp, #176 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S index ff4ac86616..277f3ebd10 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S @@ -47,11 +47,11 @@ asm_function ConvSwFp32Center LoopH: mov x17, x1 - mov x18, x5 + mov x28, x5 mov x3, x0 - cmp x18, #8 + cmp x28, #8 blt LoopW - cmp x18, #16 + cmp x28, #16 blt LoopW8 LoopW16: @@ -244,12 +244,12 @@ asm_function ConvSwFp32Center st1 {v14.4s}, [x3], x9 st1 {v15.4s}, [x3], x9 add x17, x17, x19 - sub x18, x18, #16 - cmp x18, #0 + sub x28, x28, #16 + cmp x28, #0 ble LoopWEnd - cmp x18, #8 + cmp x28, #8 blt LoopW - cmp x18, #16 + cmp x28, #16 bge LoopW16 LoopW8: mov x19, #8 @@ -369,10 +369,10 @@ asm_function 
ConvSwFp32Center st1 {v6.4s}, [x3], x9 st1 {v7.4s}, [x3], x9 add x17, x17, x19 - sub x18, x18, #8 - cmp x18, #0 + sub x28, x28, #8 + cmp x28, #0 ble LoopWEnd - cmp x18, #8 + cmp x28, #8 bge LoopW8 LoopW: mov x20, x17 @@ -427,7 +427,7 @@ asm_function ConvSwFp32Center Write: st1 {v0.4s}, [x3], x9 add x17, x17, x12 - subs x18, x18, #1 + subs x28, x28, #1 bne LoopW LoopWEnd: add x0, x0, x8 diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S index 19601f5779..d4c49827d2 100644 --- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S @@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center mov x16, x1 mov x17, x4 LoopW: - mov x18, x15 + mov x22, x15 mov x19, x2 mov x20, x5 ld1 {v1.4s}, [x16], x8 LoopKh: - mov x21, x18 + mov x21, x22 mov x13, x6 LoopKw: ld1 {v0.4s}, [x21] @@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center st1 {v0.4s}, [x21], x12 subs x13, x13, #1 bne LoopKw - add x18, x18, x11 + add x22, x22, x11 subs x20, x20, #1 bne LoopKh add x15, x15, x10 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S index 5c7024ea94..47aaeb121e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S @@ -21,30 +21,31 @@ // w13: c8_nhwc_c4 asm_function MatmulFloatNeon64 - sub sp, sp, #128 + sub sp, sp, #144 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + stp x19, x20, [sp], #16 ldr x9, [sp, #8] ldr x14, [sp, #16] - mov w18, #32 // sizeof(float) * 8 - mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth - mov x18, #4 + mov w19, #32 // sizeof(float) * 8 + mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth + mov x19, #4 ldr x17, [sp] cbz x14, NoWinoSteps mul x8, x7, x17 mov x11, #8 mul x11, x11, x17 - mul x8, x8, x18 - mul x11, x11, x18 + 
mul x8, x8, x19 + mul x11, x11, x19 NoWinoSteps: - mul x17, x17, x18 + mul x17, x17, x19 L1: mov w10, w6 // reload lhs row mov x12, x0 // reload lhs ptr - mov x18, x2 // reload dst ptr + mov x19, x2 // reload dst ptr L2: mov x16, x1 // reload rhs ptr @@ -254,435 +255,435 @@ Write: b Write8 Write1: - str s8, [x18] + str s8, [x19] cmp w10, #1 beq WriteEnd - add x18, x18, x17 - str s10, [x18] + add x19, x19, x17 + str s10, [x19] cmp w10, #2 beq WriteEnd - add x18, x18, x17 - str s12, [x18] + add x19, x19, x17 + str s12, [x19] cmp w10, #3 beq WriteEnd - add x18, x18, x17 - str s14, [x18] + add x19, x19, x17 + str s14, [x19] cmp w10, #4 beq WriteEnd - add x18, x18, x17 - str s16, [x18] + add x19, x19, x17 + str s16, [x19] cmp w10, #5 beq WriteEnd - add x18, x18, x17 - str s18, [x18] + add x19, x19, x17 + str s18, [x19] cmp w10, #6 beq WriteEnd - add x18, x18, x17 - str s20, [x18] + add x19, x19, x17 + str s20, [x19] cmp w10, #7 beq WriteEnd - add x18, x18, x17 - str s22, [x18] + add x19, x19, x17 + str s22, [x19] cmp w10, #8 beq WriteEnd - add x18, x18, x17 - str s24, [x18] + add x19, x19, x17 + str s24, [x19] cmp w10, #9 beq WriteEnd - add x18, x18, x17 - str s26, [x18] + add x19, x19, x17 + str s26, [x19] cmp w10, #10 beq WriteEnd - add x18, x18, x17 - str s28, [x18] + add x19, x19, x17 + str s28, [x19] cmp w10, #11 beq WriteEnd - add x18, x18, x17 - str s30, [x18] - add x18, x18, x17 + add x19, x19, x17 + str s30, [x19] + add x19, x19, x17 b WriteEnd Write2: dup s9, v8.s[1] - stp s8, s9, [x18] + stp s8, s9, [x19] cmp w10, #1 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s11, v10.s[1] - stp s10, s11, [x18] + stp s10, s11, [x19] cmp w10, #2 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s13, v12.s[1] - stp s12, s13, [x18] + stp s12, s13, [x19] cmp w10, #3 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s15, v14.s[1] - stp s14, s15, [x18] + stp s14, s15, [x19] cmp w10, #4 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s17, 
v16.s[1] - stp s16, s17, [x18] + stp s16, s17, [x19] cmp w10, #5 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s19, v18.s[1] - stp s18, s19, [x18] + stp s18, s19, [x19] cmp w10, #6 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s21, v20.s[1] - stp s20, s21, [x18] + stp s20, s21, [x19] cmp w10, #7 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s23, v22.s[1] - stp s22, s23, [x18] + stp s22, s23, [x19] cmp w10, #8 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s25, v24.s[1] - stp s24, s25, [x18] + stp s24, s25, [x19] cmp w10, #9 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s27, v26.s[1] - stp s26, s27, [x18] + stp s26, s27, [x19] cmp w10, #10 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s29, v28.s[1] - stp s28, s29, [x18] + stp s28, s29, [x19] cmp w10, #11 beq WriteEnd - add x18, x18, x17 + add x19, x19, x17 dup s31, v30.s[1] - stp s30, s31, [x18] - add x18, x18, x17 + stp s30, s31, [x19] + add x19, x19, x17 b WriteEnd Write3: - add x13, x18, #8 + add x13, x19, #8 dup s9, v8.s[1] - stp s8, s9, [x18] - add x18, x18, x17 + stp s8, s9, [x19] + add x19, x19, x17 st1 {v8.s}[2], [x13], x17 cmp w10, #1 beq WriteEnd dup s11, v10.s[1] - stp s10, s11, [x18] - add x18, x18, x17 + stp s10, s11, [x19] + add x19, x19, x17 st1 {v10.s}[2], [x13], x17 cmp w10, #2 beq WriteEnd dup s13, v12.s[1] - stp s12, s13, [x18] - add x18, x18, x17 + stp s12, s13, [x19] + add x19, x19, x17 st1 {v12.s}[2], [x13], x17 cmp w10, #3 beq WriteEnd dup s15, v14.s[1] - stp s14, s15, [x18] - add x18, x18, x17 + stp s14, s15, [x19] + add x19, x19, x17 st1 {v14.s}[2], [x13], x17 cmp w10, #4 beq WriteEnd dup s17, v16.s[1] - stp s16, s17, [x18] - add x18, x18, x17 + stp s16, s17, [x19] + add x19, x19, x17 st1 {v16.s}[2], [x13], x17 cmp w10, #5 beq WriteEnd dup s19, v18.s[1] - stp s18, s19, [x18] - add x18, x18, x17 + stp s18, s19, [x19] + add x19, x19, x17 st1 {v18.s}[2], [x13], x17 cmp w10, #6 beq WriteEnd dup s21, v20.s[1] - stp s20, s21, [x18] - 
add x18, x18, x17 + stp s20, s21, [x19] + add x19, x19, x17 st1 {v20.s}[2], [x13], x17 cmp w10, #7 beq WriteEnd dup s23, v22.s[1] - stp s22, s23, [x18] - add x18, x18, x17 + stp s22, s23, [x19] + add x19, x19, x17 st1 {v22.s}[2], [x13], x17 cmp w10, #8 beq WriteEnd dup s25, v24.s[1] - stp s24, s25, [x18] - add x18, x18, x17 + stp s24, s25, [x19] + add x19, x19, x17 st1 {v24.s}[2], [x13], x17 cmp w10, #9 beq WriteEnd dup s27, v26.s[1] - stp s26, s27, [x18] - add x18, x18, x17 + stp s26, s27, [x19] + add x19, x19, x17 st1 {v26.s}[2], [x13], x17 cmp w10, #10 beq WriteEnd dup s29, v28.s[1] - stp s28, s29, [x18] - add x18, x18, x17 + stp s28, s29, [x19] + add x19, x19, x17 st1 {v28.s}[2], [x13], x17 cmp w10, #11 beq WriteEnd dup s31, v30.s[1] - stp s30, s31, [x18] - add x18, x18, x17 + stp s30, s31, [x19] + add x19, x19, x17 st1 {v30.s}[2], [x13] b WriteEnd Write4: - st1 {v8.4s}, [x18], x17 + st1 {v8.4s}, [x19], x17 cmp w10, #1 beq WriteEnd - st1 {v10.4s}, [x18], x17 + st1 {v10.4s}, [x19], x17 cmp w10, #2 beq WriteEnd - st1 {v12.4s}, [x18], x17 + st1 {v12.4s}, [x19], x17 cmp w10, #3 beq WriteEnd - st1 {v14.4s}, [x18], x17 + st1 {v14.4s}, [x19], x17 cmp w10, #4 beq WriteEnd - st1 {v16.4s}, [x18], x17 + st1 {v16.4s}, [x19], x17 cmp w10, #5 beq WriteEnd - st1 {v18.4s}, [x18], x17 + st1 {v18.4s}, [x19], x17 cmp w10, #6 beq WriteEnd - st1 {v20.4s}, [x18], x17 + st1 {v20.4s}, [x19], x17 cmp w10, #7 beq WriteEnd - st1 {v22.4s}, [x18], x17 + st1 {v22.4s}, [x19], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4s}, [x18], x17 + st1 {v24.4s}, [x19], x17 cmp w10, #9 beq WriteEnd - st1 {v26.4s}, [x18], x17 + st1 {v26.4s}, [x19], x17 cmp w10, #10 beq WriteEnd - st1 {v28.4s}, [x18], x17 + st1 {v28.4s}, [x19], x17 cmp w10, #11 beq WriteEnd - st1 {v30.4s}, [x18], x17 + st1 {v30.4s}, [x19], x17 b WriteEnd Write5: - add x13, x18, #16 - st1 {v8.4s}, [x18], x17 + add x13, x19, #16 + st1 {v8.4s}, [x19], x17 str s9, [x13] cmp w10, #1 beq WriteEnd add x13, x13, x17 - st1 {v10.4s}, [x18], x17 + st1 
{v10.4s}, [x19], x17 str s11, [x13] cmp w10, #2 beq WriteEnd add x13, x13, x17 - st1 {v12.4s}, [x18], x17 + st1 {v12.4s}, [x19], x17 str s13, [x13] cmp w10, #3 beq WriteEnd add x13, x13, x17 - st1 {v14.4s}, [x18], x17 + st1 {v14.4s}, [x19], x17 str s15, [x13] cmp w10, #4 beq WriteEnd add x13, x13, x17 - st1 {v16.4s}, [x18], x17 + st1 {v16.4s}, [x19], x17 str s17, [x13] cmp w10, #5 beq WriteEnd add x13, x13, x17 - st1 {v18.4s}, [x18], x17 + st1 {v18.4s}, [x19], x17 str s19, [x13] cmp w10, #6 beq WriteEnd add x13, x13, x17 - st1 {v20.4s}, [x18], x17 + st1 {v20.4s}, [x19], x17 str s21, [x13] cmp w10, #7 beq WriteEnd add x13, x13, x17 - st1 {v22.4s}, [x18], x17 + st1 {v22.4s}, [x19], x17 str s23, [x13] cmp w10, #8 beq WriteEnd add x13, x13, x17 - st1 {v24.4s}, [x18], x17 + st1 {v24.4s}, [x19], x17 str s25, [x13] cmp w10, #9 beq WriteEnd add x13, x13, x17 - st1 {v26.4s}, [x18], x17 + st1 {v26.4s}, [x19], x17 str s27, [x13] cmp w10, #10 beq WriteEnd add x13, x13, x17 - st1 {v28.4s}, [x18], x17 + st1 {v28.4s}, [x19], x17 str s29, [x13] cmp w10, #11 beq WriteEnd add x13, x13, x17 - st1 {v30.4s}, [x18], x17 + st1 {v30.4s}, [x19], x17 str s31, [x13] b WriteEnd Write6: - add x13, x18, #16 - st1 {v8.4s}, [x18], x17 + add x13, x19, #16 + st1 {v8.4s}, [x19], x17 dup s8, v9.s[1] stp s9, s8, [x13] cmp w10, #1 beq WriteEnd add x13, x13, x17 - st1 {v10.4s}, [x18], x17 + st1 {v10.4s}, [x19], x17 dup s10, v11.s[1] stp s11, s10, [x13] cmp w10, #2 beq WriteEnd add x13, x13, x17 - st1 {v12.4s}, [x18], x17 + st1 {v12.4s}, [x19], x17 dup s12, v13.s[1] stp s13, s12, [x13] cmp w10, #3 beq WriteEnd add x13, x13, x17 - st1 {v14.4s}, [x18], x17 + st1 {v14.4s}, [x19], x17 dup s14, v15.s[1] stp s15, s14, [x13] cmp w10, #4 beq WriteEnd add x13, x13, x17 - st1 {v16.4s}, [x18], x17 + st1 {v16.4s}, [x19], x17 dup s16, v17.s[1] stp s17, s16, [x13] cmp w10, #5 beq WriteEnd add x13, x13, x17 - st1 {v18.4s}, [x18], x17 + st1 {v18.4s}, [x19], x17 dup s18, v19.s[1] stp s19, s18, [x13] cmp w10, #6 beq 
WriteEnd add x13, x13, x17 - st1 {v20.4s}, [x18], x17 + st1 {v20.4s}, [x19], x17 dup s20, v21.s[1] stp s21, s20, [x13] cmp w10, #7 beq WriteEnd add x13, x13, x17 - st1 {v22.4s}, [x18], x17 + st1 {v22.4s}, [x19], x17 dup s22, v23.s[1] stp s23, s22, [x13] cmp w10, #8 beq WriteEnd add x13, x13, x17 - st1 {v24.4s}, [x18], x17 + st1 {v24.4s}, [x19], x17 dup s24, v25.s[1] stp s25, s24, [x13] cmp w10, #9 beq WriteEnd add x13, x13, x17 - st1 {v26.4s}, [x18], x17 + st1 {v26.4s}, [x19], x17 dup s26, v27.s[1] stp s27, s26, [x13] cmp w10, #10 beq WriteEnd add x13, x13, x17 - st1 {v28.4s}, [x18], x17 + st1 {v28.4s}, [x19], x17 dup s28, v29.s[1] stp s29, s28, [x13] cmp w10, #11 beq WriteEnd add x13, x13, x17 - st1 {v30.4s}, [x18], x17 + st1 {v30.4s}, [x19], x17 dup s30, v31.s[1] stp s31, s30, [x13] b WriteEnd Write7: - add x13, x18, #16 - add x16, x18, #24 - st1 {v8.4s}, [x18], x17 + add x13, x19, #16 + add x16, x19, #24 + st1 {v8.4s}, [x19], x17 dup s8, v9.s[1] stp s9, s8, [x13] add x13, x13, x17 st1 {v9.s}[2], [x16], x17 cmp w10, #1 beq WriteEnd - st1 {v10.4s}, [x18], x17 + st1 {v10.4s}, [x19], x17 dup s10, v11.s[1] stp s11, s10, [x13] add x13, x13, x17 st1 {v11.s}[2], [x16], x17 cmp w10, #2 beq WriteEnd - st1 {v12.4s}, [x18], x17 + st1 {v12.4s}, [x19], x17 dup s12, v13.s[1] stp s13, s12, [x13] add x13, x13, x17 st1 {v13.s}[2], [x16], x17 cmp w10, #3 beq WriteEnd - st1 {v14.4s}, [x18], x17 + st1 {v14.4s}, [x19], x17 dup s14, v15.s[1] stp s15, s14, [x13] add x13, x13, x17 st1 {v15.s}[2], [x16], x17 cmp w10, #4 beq WriteEnd - st1 {v16.4s}, [x18], x17 + st1 {v16.4s}, [x19], x17 dup s16, v17.s[1] stp s17, s16, [x13] add x13, x13, x17 st1 {v17.s}[2], [x16], x17 cmp w10, #5 beq WriteEnd - st1 {v18.4s}, [x18], x17 + st1 {v18.4s}, [x19], x17 dup s18, v19.s[1] stp s19, s18, [x13] add x13, x13, x17 st1 {v19.s}[2], [x16], x17 cmp w10, #6 beq WriteEnd - st1 {v20.4s}, [x18], x17 + st1 {v20.4s}, [x19], x17 dup s20, v21.s[1] stp s21, s20, [x13] add x13, x13, x17 st1 {v21.s}[2], [x16], x17 
cmp w10, #7 beq WriteEnd - st1 {v22.4s}, [x18], x17 + st1 {v22.4s}, [x19], x17 dup s22, v23.s[1] stp s23, s22, [x13] add x13, x13, x17 st1 {v23.s}[2], [x16], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4s}, [x18], x17 + st1 {v24.4s}, [x19], x17 dup s24, v25.s[1] stp s25, s24, [x13] add x13, x13, x17 st1 {v25.s}[2], [x16], x17 cmp w10, #9 beq WriteEnd - st1 {v26.4s}, [x18], x17 + st1 {v26.4s}, [x19], x17 dup s26, v27.s[1] stp s27, s26, [x13] add x13, x13, x17 st1 {v27.s}[2], [x16], x17 cmp w10, #10 beq WriteEnd - st1 {v28.4s}, [x18], x17 + st1 {v28.4s}, [x19], x17 dup s28, v29.s[1] stp s29, s28, [x13] add x13, x13, x17 st1 {v29.s}[2], [x16], x17 cmp w10, #11 beq WriteEnd - st1 {v30.4s}, [x18], x17 + st1 {v30.4s}, [x19], x17 dup s30, v31.s[1] stp s31, s30, [x13] add x13, x13, x17 @@ -697,54 +698,54 @@ WriteC8: st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 b WriteEnd WriteWino: - st1 {v8.4s, v9.4s}, [x18], x8 - st1 {v10.4s, v11.4s}, [x18], x8 - st1 {v12.4s, v13.4s}, [x18], x8 - st1 {v14.4s, v15.4s}, [x18], x8 - st1 {v16.4s, v17.4s}, [x18], x8 - st1 {v18.4s, v19.4s}, [x18], x8 - st1 {v20.4s, v21.4s}, [x18], x8 - st1 {v22.4s, v23.4s}, [x18], x8 - st1 {v24.4s, v25.4s}, [x18], x8 - st1 {v26.4s, v27.4s}, [x18], x8 - st1 {v28.4s, v29.4s}, [x18], x8 - st1 {v30.4s, v31.4s}, [x18], x8 + st1 {v8.4s, v9.4s}, [x19], x8 + st1 {v10.4s, v11.4s}, [x19], x8 + st1 {v12.4s, v13.4s}, [x19], x8 + st1 {v14.4s, v15.4s}, [x19], x8 + st1 {v16.4s, v17.4s}, [x19], x8 + st1 {v18.4s, v19.4s}, [x19], x8 + st1 {v20.4s, v21.4s}, [x19], x8 + st1 {v22.4s, v23.4s}, [x19], x8 + st1 {v24.4s, v25.4s}, [x19], x8 + st1 {v26.4s, v27.4s}, [x19], x8 + st1 {v28.4s, v29.4s}, [x19], x8 + st1 {v30.4s, v31.4s}, [x19], x8 b WriteEnd Write8: - st1 {v8.4s, v9.4s}, [x18], x17 + st1 {v8.4s, v9.4s}, [x19], x17 cmp w10, #1 beq WriteEnd - st1 {v10.4s, v11.4s}, [x18], x17 + st1 {v10.4s, v11.4s}, [x19], x17 cmp w10, #2 beq WriteEnd - st1 {v12.4s, v13.4s}, [x18], x17 + st1 {v12.4s, v13.4s}, [x19], x17 cmp w10, #3 beq WriteEnd - 
st1 {v14.4s, v15.4s}, [x18], x17 + st1 {v14.4s, v15.4s}, [x19], x17 cmp w10, #4 beq WriteEnd - st1 {v16.4s, v17.4s}, [x18], x17 + st1 {v16.4s, v17.4s}, [x19], x17 cmp w10, #5 beq WriteEnd - st1 {v18.4s, v19.4s}, [x18], x17 + st1 {v18.4s, v19.4s}, [x19], x17 cmp w10, #6 beq WriteEnd - st1 {v20.4s, v21.4s}, [x18], x17 + st1 {v20.4s, v21.4s}, [x19], x17 cmp w10, #7 beq WriteEnd - st1 {v22.4s, v23.4s}, [x18], x17 + st1 {v22.4s, v23.4s}, [x19], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4s, v25.4s}, [x18], x17 + st1 {v24.4s, v25.4s}, [x19], x17 cmp w10, #9 beq WriteEnd - st1 {v26.4s, v27.4s}, [x18], x17 + st1 {v26.4s, v27.4s}, [x19], x17 cmp w10, #10 beq WriteEnd - st1 {v28.4s, v29.4s}, [x18], x17 + st1 {v28.4s, v29.4s}, [x19], x17 cmp w10, #11 beq WriteEnd - st1 {v30.4s, v31.4s}, [x18], x17 + st1 {v30.4s, v31.4s}, [x19], x17 WriteEnd: subs w10, w10, #12 // lhs row - 12 @@ -766,8 +767,9 @@ NoDstStep: bgt L1 End1: - sub sp, sp, #128 + sub sp, sp, #144 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ldp x19, x20, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S index e495feec78..07a87a8e81 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -21,31 +21,32 @@ // x9: writeMode asm_function MatmulFloatNeon64Opt - sub sp, sp, #144 + sub sp, sp, #160 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] - mov x18, #48 // sizeof(float) * 12 - mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth + mov x21, #48 // sizeof(float) * 12 + mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth cbnz x9, NoC8Steps mov x11, x2 - mov x18, #32 - mul x16, x6, x18 // row * 8 * sizeof(float) + mov x21, #32 + mul x16, x6, x21 // row * 
8 * sizeof(float) NoC8Steps: cmp x9, #2 bne NoWinoSteps - mov x18, #4 + mov x21, #4 mul x15, x7, x8 - mul x15, x15, x18 // kernel_size * col *sizeof(float) - mov x18, #32 - mul x16, x8, x18 // kernel_size * 8 * sizeof(float) + mul x15, x15, x21 // kernel_size * col *sizeof(float) + mov x21, #32 + mul x16, x8, x21 // kernel_size * 8 * sizeof(float) NoWinoSteps: - mov x18, #4 - mul x8, x8, x18 + mov x21, #4 + mul x8, x8, x21 LoopRowStart: cmp x6, #4 @@ -1117,9 +1118,9 @@ LoopRow4: LoopColEnd: add x0, x0, x17 cbz x9, C8DstStep - mov x18, #4 - mul x18, x18, x7 - sub x11, x11, x18 + mov x21, #4 + mul x21, x21, x7 + sub x11, x11, x21 mov x2, x11 b NoDstStep C8DstStep: @@ -1129,9 +1130,10 @@ LoopColEnd: subs x6, x6, #12 bgt LoopRowStart - sub sp, sp, #144 + sub sp, sp, #160 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S index 883d07fb09..600f122e16 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S @@ -67,7 +67,7 @@ L2: cmp w16, #0 beq End2 - mov x18, x1 // reload b ptr + mov x28, x1 // reload b ptr mov x19, x7 // reload bias ptr mov w20, w5 // reload depth dup v16.4s, wzr @@ -94,10 +94,10 @@ L3: ld1 {v1.16b}, [x17], #16 ld1 {v2.16b}, [x17], #16 ld1 {v3.16b}, [x17], #16 - ld1 {v4.16b}, [x18], #16 - ld1 {v5.16b}, [x18], #16 - ld1 {v6.16b}, [x18], #16 - ld1 {v7.16b}, [x18], #16 + ld1 {v4.16b}, [x28], #16 + ld1 {v5.16b}, [x28], #16 + ld1 {v6.16b}, [x28], #16 + ld1 {v7.16b}, [x28], #16 smull v8.8h, v4.8b, v0.8b smull v9.8h, v5.8b, v0.8b diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S index c08607df9e..fd31cc0f9c 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S +++ 
b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S @@ -30,7 +30,7 @@ // x28: filter_zp asm_function MatmulInt8Opt - sub sp, sp, #208 + sub sp, sp, #224 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 @@ -38,6 +38,7 @@ asm_function MatmulInt8Opt stp x23, x24, [sp], #16 stp x25, x26, [sp], #16 stp x27, x28, [sp], #16 + stp x29, x30, [sp], #16 ldr w8, [sp] ldr w9, [sp, #8] @@ -55,7 +56,7 @@ asm_function MatmulInt8Opt LoopRow: mov x16, x1 // reload rhs ptr mov x17, x4 // reload rhs col - mov x18, x7 // reload bias ptr + mov x29, x7 // reload bias ptr mov x27, x2 // reload dst ptr ldr x28, [sp, #64] // reload filter_zp @@ -158,7 +159,7 @@ LoopRow: Bias: cbz x7, NoBias - ld1 {v15.4s}, [x18], #16 + ld1 {v15.4s}, [x29], #16 add v16.4s, v16.4s, v15.4s add v17.4s, v17.4s, v15.4s add v18.4s, v18.4s, v15.4s @@ -330,7 +331,7 @@ LoopColEnd: b LoopRow LoopRowEnd: - sub sp, sp, #208 + sub sp, sp, #224 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 @@ -338,5 +339,6 @@ LoopRowEnd: ldp x23, x24, [sp], #16 ldp x25, x26, [sp], #16 ldp x27, x28, [sp], #16 + ldp x29, x30, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S index 3f6cf4644b..98426e2120 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S @@ -20,9 +20,10 @@ // x7: bias asm_function MatMulR4Int8Neon64 - sub sp, sp, #128 + sub sp, sp, #144 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + stp x19, x20, [sp], #16 mov w15, #0 // b col index mov w16, #0 // a row index @@ -40,7 +41,7 @@ L2: cmp w16, w3 beq End2 - mov x18, x1 // reload b ptr + mov x19, x1 // reload b ptr mov x10, x7 // reload bias ptr mov w11, w5 // reload depth dup v16.4s, wzr @@ -67,10 +68,10 @@ L3: ld1 {v1.16b}, [x17], #16 
ld1 {v2.16b}, [x17], #16 ld1 {v3.16b}, [x17], #16 - ld1 {v4.16b}, [x18], #16 - ld1 {v5.16b}, [x18], #16 - ld1 {v6.16b}, [x18], #16 - ld1 {v7.16b}, [x18], #16 + ld1 {v4.16b}, [x19], #16 + ld1 {v5.16b}, [x19], #16 + ld1 {v6.16b}, [x19], #16 + ld1 {v7.16b}, [x19], #16 smull v8.8h, v4.8b, v0.8b smull v9.8h, v5.8b, v0.8b @@ -172,8 +173,9 @@ End2: b L1 End1: - sub sp, sp, #128 + sub sp, sp, #144 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ldp x19, x20, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S index a378f1527e..182e7f85ab 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S @@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd mov x14, x1 // mat_b LoopN: mov x16, x0 // mat_a_m - sub x18, x5, x15 // ni + sub x22, x5, x15 // ni sub x19, x17, x3 // mi - mul x18, x18, x17 // ni * m + mul x22, x22, x17 // ni * m mov x11, x6 // in_channel - add x18, x18, x19 // (ni * m) + mi - mul x18, x18, x7 // x18 * c4_channel - add x20, x2, x18 // dst + offset + add x22, x22, x19 // (ni * m) + mi + mul x22, x22, x7 // x22 * c4_channel + add x20, x2, x22 // dst + offset cmp x11, #16 bge LoopC16 cmp x11, #8 diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S index 374c5d60de..eb62903d91 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S +++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S @@ -1,6 +1,5 @@ #ifdef __aarch64__ #include "nnacl/assembly_global.h" - .text .align 5 //.p2align 5,,15 diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S index 84a0ed9ab4..e469642058 100644 --- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S +++ 
b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S @@ -55,16 +55,16 @@ LoopH: ld1 {v0.s}[2], [x17], x10 ld1 {v0.s}[3], [x17], x10 mov x11, x6 - mov x18, x17 - add x18, x14, x7 - add x16, x18, x7 + mov x20, x17 + add x20, x14, x7 + add x16, x20, x7 add x19, x16, x7 LoopLength4: ld1 {v16.4s}, [x2] ld1 {v20.4s}, [x14], #16 fmla v16.4s, v20.4s, v0.s[0] - ld1 {v21.4s}, [x18], #16 + ld1 {v21.4s}, [x20], #16 fmul v17.4s, v21.4s, v0.s[1] ld1 {v20.4s}, [x16], #16 fmla v16.4s, v20.4s, v0.s[2] @@ -90,14 +90,14 @@ LoopH: ld1 {v0.s}[1], [x17], x10 ld1 {v0.s}[2], [x17], x10 mov x11, x6 - mov x18, x17 - add x18, x14, x7 - add x16, x18, x7 + mov x20, x17 + add x20, x14, x7 + add x16, x20, x7 LoopLength3: ld1 {v16.4s}, [x2] ld1 {v20.4s}, [x14], #16 fmla v16.4s, v20.4s, v0.s[0] - ld1 {v21.4s}, [x18], #16 + ld1 {v21.4s}, [x20], #16 fmul v17.4s, v21.4s, v0.s[1] ld1 {v20.4s}, [x16], #16 fmla v16.4s, v20.4s, v0.s[2] diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S index 7b96ed500e..a413cf5c01 100644 --- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S +++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S @@ -18,6 +18,9 @@ asm_function WinogradTransRight //x5: k //x6: length +sub sp, sp, #16 +stp x19, x20, [sp], #16 + mov x8, #16 // 4 * sizeof(float) mul x8, x6, x8 mul x9, x5, x8 // step for S @@ -43,7 +46,7 @@ LoopH: cmp x12, #4 blt LoopKStart3 mov x16, x15 - mov x18, x4 + mov x19, x4 LoopK4: ld1 {v0.s}[0], [x13], x10 ld1 {v0.s}[1], [x13], x10 @@ -54,7 +57,7 @@ LoopH: add x14, x17, x8 add x16, x14, x8 - add x18, x16, x8 + add x19, x16, x8 LoopLength4: ld1 {v16.4s}, [x2] @@ -64,7 +67,7 @@ LoopH: fmul v17.4s, v21.4s, v0.s[1] ld1 {v20.4s}, [x16], #16 fmla v16.4s, v20.4s, v0.s[2] - ld1 {v21.4s}, [x18], #16 + ld1 {v21.4s}, [x19], #16 fmla v17.4s, v21.4s, v0.s[3] fadd v17.4s, v16.4s, v17.4s @@ -73,7 +76,7 @@ LoopH: bne LoopLength4 sub x2, x2, x8 sub x12, x12, #4 - mov x17, x18 + mov x17, 
x19 cmp x12, #4 bge LoopK4 @@ -107,7 +110,7 @@ LoopH: bne LoopLength3 sub x2, x2, x8 sub x12, x12, #3 - mov x17, x18 + mov x17, x19 cmp x12, #3 bge LoopK3 @@ -141,5 +144,7 @@ LoopH: subs x4, x4, #1 bne LoopH + sub sp, sp, #16 + ldp x19, x20, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S b/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S index 8ea32fdf1a..a55642d6c7 100644 --- a/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S +++ b/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S @@ -1,4 +1,5 @@ #ifdef ENABLE_AVX +#include "nnacl/assembly_global.h" .text .align 4 .global ConvDwFp32Avx3x3 @@ -31,7 +32,7 @@ // 56: input_stride // 64: relu // 72: relu6 -ConvDwFp32Avx3x3: +asm_function ConvDwFp32Avx3x3 pushq %r15 pushq %r14 pushq %r13 diff --git a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S index ff762c462b..643c1b3d0e 100644 --- a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S +++ b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S @@ -1,4 +1,5 @@ #ifdef ENABLE_AVX +#include "nnacl/assembly_global.h" .text .align 4 .global MatmulFloatAvxOpt @@ -34,7 +35,7 @@ // 72: stride // 80: writeMode -MatmulFloatAvxOpt: +asm_function MatmulFloatAvxOpt // rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention pushq %r15 pushq %r14 diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S index 74cc4c4bf7..359160786e 100644 --- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S +++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S @@ -19,12 +19,13 @@ asm_function ConvDwFp16Center // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved // whereas our coding style do not permit such amount of parameters - sub sp, sp, #176 + sub sp, sp, #192 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, 
v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 stp x21, x22, [sp], #16 stp x23, x24, [sp], #16 + stp x25, x26, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] @@ -71,7 +72,7 @@ asm_function ConvDwFp16Center mov v14.16b, v24.16b mov v15.16b, v24.16b LoopKh16: - mov x18, x7 + mov x25, x7 mov x21, x16 LoopKw16: mov x22, x21 @@ -108,7 +109,7 @@ asm_function ConvDwFp16Center ld1 {v23.8h}, [x22], x11 fmla v14.8h, v22.8h, v25.8h fmla v15.8h, v23.8h, v25.8h - subs x18, x18, #1 + subs x25, x25, #1 add x21, x21, x13 bne LoopKw16 add x16, x16, x12 @@ -191,7 +192,7 @@ asm_function ConvDwFp16Center mov v6.16b, v24.16b mov v7.16b, v24.16b LoopKh8: - mov x18, x7 + mov x25, x7 mov x21, x16 LoopKw8: mov x22, x21 @@ -212,7 +213,7 @@ asm_function ConvDwFp16Center ld1 {v23.8h}, [x22], x11 fmla v6.8h, v22.8h, v25.8h fmla v7.8h, v23.8h, v25.8h - subs x18, x18, #1 + subs x25, x25, #1 add x21, x21, x13 bne LoopKw8 add x16, x16, x12 @@ -260,13 +261,13 @@ asm_function ConvDwFp16Center mov x20, x6 mov v0.16b, v24.16b LoopKh: - mov x18, x7 + mov x25, x7 mov x22, x16 LoopKw: ld1 {v16.8h}, [x22], x13 ld1 {v25.8h}, [x17], #16 fmla v0.8h, v16.8h, v25.8h - subs x18, x18, #1 + subs x25, x25, #1 bne LoopKw add x16, x16, x12 subs x20, x20, #1 @@ -289,11 +290,12 @@ asm_function ConvDwFp16Center subs x4, x4, #1 bne LoopH - sub sp, sp, #176 + sub sp, sp, #192 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 ldp x21, x22, [sp], #16 ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S index c0ec1a6bbe..d315ac914f 100644 --- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S +++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S @@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center mov x16, x1 mov x17, x4 LoopW: - mov x18, x15 + mov x22, x15 mov x19, x2 mov x20, x5 ld1 {v1.8h}, [x16], x8 LoopKh: - 
mov x21, x18 + mov x21, x22 mov x13, x6 LoopKw: ld1 {v0.8h}, [x21] @@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center st1 {v0.8h}, [x21], x12 subs x13, x13, #1 bne LoopKw - add x18, x18, x11 + add x22, x22, x11 subs x20, x20, #1 bne LoopKh add x15, x15, x10 diff --git a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S index 5f2c7e641e..c4c2e5e311 100644 --- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S +++ b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S @@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8 // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ r29 should be also preserved // whereas our coding style do not permit such amount of parameters - sub sp, sp, #128 + sub sp, sp, #144 // performance between storing 4 registers at the same time and separately storing them on in-order cores // is not tested yet st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + stp x19, x20, [sp], #16 ldr x8, [sp, #0] ldr x9, [sp, #8] @@ -548,87 +549,87 @@ IndirectGemmStart: b WriteEnd Write7: add x17, x15, #8 - add x18, x15, #10 + add x19, x15, #10 add x16, x15, #12 st1 {v16.4h}, [x15], x7 ins v0.s[0], v16.s[2] st1 {v0.h}[0], [x17], x7 - st1 {v0.h}[1], [x18], x7 + st1 {v0.h}[1], [x19], x7 st1 {v16.h}[6], [x16], x7 st1 {v17.4h}, [x15], x7 ins v1.s[0], v17.s[2] st1 {v1.h}[0], [x17], x7 - st1 {v1.h}[1], [x18], x7 + st1 {v1.h}[1], [x19], x7 st1 {v17.h}[6], [x16], x7 st1 {v18.4h}, [x15], x7 ins v2.s[0], v18.s[2] st1 {v2.h}[0], [x17], x7 - st1 {v2.h}[1], [x18], x7 + st1 {v2.h}[1], [x19], x7 st1 {v18.h}[6], [x16], x7 st1 {v19.4h}, [x15], x7 ins v3.s[0], v19.s[2] st1 {v3.h}[0], [x17], x7 - st1 {v3.h}[1], [x18], x7 + st1 {v3.h}[1], [x19], x7 st1 {v19.h}[6], [x16], x7 st1 {v20.4h}, [x15], x7 ins v4.s[0], v20.s[2] st1 {v4.h}[0], [x17], x7 - st1 {v4.h}[1], [x18], x7 + st1 {v4.h}[1], 
[x19], x7 st1 {v20.h}[6], [x16], x7 st1 {v21.4h}, [x15], x7 ins v5.s[0], v21.s[2] st1 {v5.h}[0], [x17], x7 - st1 {v5.h}[1], [x18], x7 + st1 {v5.h}[1], [x19], x7 st1 {v21.h}[6], [x16], x7 st1 {v22.4h}, [x15], x7 ins v6.s[0], v22.s[2] st1 {v6.h}[0], [x17], x7 - st1 {v6.h}[1], [x18], x7 + st1 {v6.h}[1], [x19], x7 st1 {v22.h}[6], [x16], x7 st1 {v23.4h}, [x15], x7 ins v7.s[0], v23.s[2] st1 {v7.h}[0], [x17], x7 - st1 {v7.h}[1], [x18], x7 + st1 {v7.h}[1], [x19], x7 st1 {v23.h}[6], [x16], x7 st1 {v24.4h}, [x15], x7 ins v8.s[0], v24.s[2] st1 {v8.h}[0], [x17], x7 - st1 {v8.h}[1], [x18], x7 + st1 {v8.h}[1], [x19], x7 st1 {v24.h}[6], [x16], x7 st1 {v25.4h}, [x15], x7 ins v9.s[0], v25.s[2] st1 {v9.h}[0], [x17], x7 - st1 {v9.h}[1], [x18], x7 + st1 {v9.h}[1], [x19], x7 st1 {v25.h}[6], [x16], x7 st1 {v26.4h}, [x15], x7 ins v10.s[0], v26.s[2] st1 {v10.h}[0], [x17], x7 - st1 {v10.h}[1], [x18], x7 + st1 {v10.h}[1], [x19], x7 st1 {v26.h}[6], [x16], x7 st1 {v27.4h}, [x15], x7 ins v11.s[0], v27.s[2] st1 {v11.h}[0], [x17], x7 - st1 {v11.h}[1], [x18], x7 + st1 {v11.h}[1], [x19], x7 st1 {v27.h}[6], [x16], x7 st1 {v28.4h}, [x15], x7 ins v12.s[0], v28.s[2] st1 {v12.h}[0], [x17], x7 - st1 {v12.h}[1], [x18], x7 + st1 {v12.h}[1], [x19], x7 st1 {v28.h}[6], [x16], x7 st1 {v29.4h}, [x15], x7 ins v13.s[0], v29.s[2] st1 {v13.h}[0], [x17], x7 - st1 {v13.h}[1], [x18], x7 + st1 {v13.h}[1], [x19], x7 st1 {v29.h}[6], [x16], x7 st1 {v30.4h}, [x15], x7 ins v14.s[0], v30.s[2] st1 {v14.h}[0], [x17], x7 - st1 {v14.h}[1], [x18], x7 + st1 {v14.h}[1], [x19], x7 st1 {v30.h}[6], [x16], x7 st1 {v31.4h}, [x15] ins v15.s[0], v31.s[2] st1 {v15.h}[0], [x17] - st1 {v15.h}[1], [x18] + st1 {v15.h}[1], [x19] st1 {v31.h}[6], [x16] add x0, x0, #14 b WriteEnd @@ -661,9 +662,10 @@ IndirectGemmStart: NoStepForward: bgt LoopOc - sub sp, sp, #128 + sub sp, sp, #144 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ldp x19, x20, [sp], #16 ret #endif diff --git 
a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S index bc3644ad21..dac86acd0e 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S @@ -21,21 +21,22 @@ // w13: writeC8 asm_function MatmulFp16Neon64 - sub sp, sp, #128 + sub sp, sp, #144 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + stp x19, x20, [sp], #16 - mov w18, #16 // sizeof(float16) * 8 - mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth + mov w19, #16 // sizeof(float16) * 8; w19 (saved above, reloaded below) replaces w18, the low half of platform register x18 + mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float16) * 8 * depth mov x11, x3 // bias flag - mov x18, #2 + mov x19, #2 ldr x17, [sp] - mul x17, x17, x18 + mul x17, x17, x19 L1: mov w10, w6 // reload lhs row mov x12, x0 // reload lhs ptr - mov x18, x2 // reload dst ptr + mov x19, x2 // reload dst ptr L2: mov x16, x1 // reload rhs ptr @@ -314,490 +315,490 @@ Write: b Write8 Write1: - st1 {v16.h}[0], [x18], x17 + st1 {v16.h}[0], [x19], x17 cmp w10, #1 beq WriteEnd - st1 {v17.h}[0], [x18], x17 + st1 {v17.h}[0], [x19], x17 cmp w10, #2 beq WriteEnd - st1 {v18.h}[0], [x18], x17 + st1 {v18.h}[0], [x19], x17 cmp w10, #3 beq WriteEnd - st1 {v19.h}[0], [x18], x17 + st1 {v19.h}[0], [x19], x17 cmp w10, #4 beq WriteEnd - st1 {v20.h}[0], [x18], x17 + st1 {v20.h}[0], [x19], x17 cmp w10, #5 beq WriteEnd - st1 {v21.h}[0], [x18], x17 + st1 {v21.h}[0], [x19], x17 cmp w10, #6 beq WriteEnd - st1 {v22.h}[0], [x18], x17 + st1 {v22.h}[0], [x19], x17 cmp w10, #7 beq WriteEnd - st1 {v23.h}[0], [x18], x17 + st1 {v23.h}[0], [x19], x17 cmp w10, #8 beq WriteEnd - st1 {v24.h}[0], [x18], x17 + st1 {v24.h}[0], [x19], x17 cmp w10, #9 beq WriteEnd - st1 {v25.h}[0], [x18], x17 + st1 {v25.h}[0], [x19], x17 cmp w10, #10 beq WriteEnd - st1 {v26.h}[0], [x18], x17 + st1 {v26.h}[0], [x19], x17 cmp w10, #11 beq WriteEnd - st1 {v27.h}[0], [x18], x17 + st1 {v27.h}[0], [x19], x17 cmp w10, #12 beq WriteEnd - st1 {v28.h}[0], [x18], x17 + st1 {v28.h}[0], [x19], x17 cmp w10, #13 beq WriteEnd - st1 {v29.h}[0], 
[x18], x17 + st1 {v29.h}[0], [x19], x17 cmp w10, #14 beq WriteEnd - st1 {v30.h}[0], [x18], x17 + st1 {v30.h}[0], [x19], x17 cmp w10, #15 beq WriteEnd - st1 {v31.h}[0], [x18], x17 + st1 {v31.h}[0], [x19], x17 b WriteEnd Write2: - add x13, x18, #2 - st1 {v16.h}[0], [x18], x17 + add x13, x19, #2 + st1 {v16.h}[0], [x19], x17 st1 {v16.h}[1], [x13], x17 cmp w10, #1 beq WriteEnd - st1 {v17.h}[0], [x18], x17 + st1 {v17.h}[0], [x19], x17 st1 {v17.h}[1], [x13], x17 cmp w10, #2 beq WriteEnd - st1 {v18.h}[0], [x18], x17 + st1 {v18.h}[0], [x19], x17 st1 {v18.h}[1], [x13], x17 cmp w10, #3 beq WriteEnd - st1 {v19.h}[0], [x18], x17 + st1 {v19.h}[0], [x19], x17 st1 {v19.h}[1], [x13], x17 cmp w10, #4 beq WriteEnd - st1 {v20.h}[0], [x18], x17 + st1 {v20.h}[0], [x19], x17 st1 {v20.h}[1], [x13], x17 cmp w10, #5 beq WriteEnd - st1 {v21.h}[0], [x18], x17 + st1 {v21.h}[0], [x19], x17 st1 {v21.h}[1], [x13], x17 cmp w10, #6 beq WriteEnd - st1 {v22.h}[0], [x18], x17 + st1 {v22.h}[0], [x19], x17 st1 {v22.h}[1], [x13], x17 cmp w10, #7 beq WriteEnd - st1 {v23.h}[0], [x18], x17 + st1 {v23.h}[0], [x19], x17 st1 {v23.h}[1], [x13], x17 cmp w10, #8 beq WriteEnd - st1 {v24.h}[0], [x18], x17 + st1 {v24.h}[0], [x19], x17 st1 {v24.h}[1], [x13], x17 cmp w10, #9 beq WriteEnd - st1 {v25.h}[0], [x18], x17 + st1 {v25.h}[0], [x19], x17 st1 {v25.h}[1], [x13], x17 cmp w10, #10 beq WriteEnd - st1 {v26.h}[0], [x18], x17 + st1 {v26.h}[0], [x19], x17 st1 {v26.h}[1], [x13], x17 cmp w10, #11 beq WriteEnd - st1 {v27.h}[0], [x18], x17 + st1 {v27.h}[0], [x19], x17 st1 {v27.h}[1], [x13], x17 cmp w10, #12 beq WriteEnd - st1 {v28.h}[0], [x18], x17 + st1 {v28.h}[0], [x19], x17 st1 {v28.h}[1], [x13], x17 cmp w10, #13 beq WriteEnd - st1 {v29.h}[0], [x18], x17 + st1 {v29.h}[0], [x19], x17 st1 {v29.h}[1], [x13], x17 cmp w10, #14 beq WriteEnd - st1 {v30.h}[0], [x18], x17 + st1 {v30.h}[0], [x19], x17 st1 {v30.h}[1], [x13], x17 cmp w10, #15 beq WriteEnd - st1 {v31.h}[0], [x18], x17 + st1 {v31.h}[0], [x19], x17 st1 {v31.h}[1], 
[x13], x17 b WriteEnd Write3: - add x13, x18, #2 - add x14, x18, #4 - st1 {v16.h}[0], [x18], x17 + add x13, x19, #2 + add x14, x19, #4 + st1 {v16.h}[0], [x19], x17 st1 {v16.h}[1], [x13], x17 st1 {v16.h}[2], [x14], x17 cmp w10, #1 beq WriteEnd - st1 {v17.h}[0], [x18], x17 + st1 {v17.h}[0], [x19], x17 st1 {v17.h}[1], [x13], x17 st1 {v17.h}[2], [x14], x17 cmp w10, #2 beq WriteEnd - st1 {v18.h}[0], [x18], x17 + st1 {v18.h}[0], [x19], x17 st1 {v18.h}[1], [x13], x17 st1 {v18.h}[2], [x14], x17 cmp w10, #3 beq WriteEnd - st1 {v19.h}[0], [x18], x17 + st1 {v19.h}[0], [x19], x17 st1 {v19.h}[1], [x13], x17 st1 {v19.h}[2], [x14], x17 cmp w10, #4 beq WriteEnd - st1 {v20.h}[0], [x18], x17 + st1 {v20.h}[0], [x19], x17 st1 {v20.h}[1], [x13], x17 st1 {v20.h}[2], [x14], x17 cmp w10, #5 beq WriteEnd - st1 {v21.h}[0], [x18], x17 + st1 {v21.h}[0], [x19], x17 st1 {v21.h}[1], [x13], x17 st1 {v21.h}[2], [x14], x17 cmp w10, #6 beq WriteEnd - st1 {v22.h}[0], [x18], x17 + st1 {v22.h}[0], [x19], x17 st1 {v22.h}[1], [x13], x17 st1 {v22.h}[2], [x14], x17 cmp w10, #7 beq WriteEnd - st1 {v23.h}[0], [x18], x17 + st1 {v23.h}[0], [x19], x17 st1 {v23.h}[1], [x13], x17 st1 {v23.h}[2], [x14], x17 cmp w10, #8 beq WriteEnd - st1 {v24.h}[0], [x18], x17 + st1 {v24.h}[0], [x19], x17 st1 {v24.h}[1], [x13], x17 st1 {v24.h}[2], [x14], x17 cmp w10, #9 beq WriteEnd - st1 {v25.h}[0], [x18], x17 + st1 {v25.h}[0], [x19], x17 st1 {v25.h}[1], [x13], x17 st1 {v25.h}[2], [x14], x17 cmp w10, #10 beq WriteEnd - st1 {v26.h}[0], [x18], x17 + st1 {v26.h}[0], [x19], x17 st1 {v26.h}[1], [x13], x17 st1 {v26.h}[2], [x14], x17 cmp w10, #11 beq WriteEnd - st1 {v27.h}[0], [x18], x17 + st1 {v27.h}[0], [x19], x17 st1 {v27.h}[1], [x13], x17 st1 {v27.h}[2], [x14], x17 cmp w10, #12 beq WriteEnd - st1 {v28.h}[0], [x18], x17 + st1 {v28.h}[0], [x19], x17 st1 {v28.h}[1], [x13], x17 st1 {v28.h}[2], [x14], x17 cmp w10, #13 beq WriteEnd - st1 {v29.h}[0], [x18], x17 + st1 {v29.h}[0], [x19], x17 st1 {v29.h}[1], [x13], x17 st1 {v29.h}[2], [x14], 
x17 cmp w10, #14 beq WriteEnd - st1 {v30.h}[0], [x18], x17 + st1 {v30.h}[0], [x19], x17 st1 {v30.h}[1], [x13], x17 st1 {v30.h}[2], [x14], x17 cmp w10, #15 beq WriteEnd - st1 {v31.h}[0], [x18], x17 + st1 {v31.h}[0], [x19], x17 st1 {v31.h}[1], [x13], x17 st1 {v31.h}[2], [x14], x17 b WriteEnd Write4: - st1 {v16.4h}, [x18], x17 + st1 {v16.4h}, [x19], x17 cmp w10, #1 beq WriteEnd - st1 {v17.4h}, [x18], x17 + st1 {v17.4h}, [x19], x17 cmp w10, #2 beq WriteEnd - st1 {v18.4h}, [x18], x17 + st1 {v18.4h}, [x19], x17 cmp w10, #3 beq WriteEnd - st1 {v19.4h}, [x18], x17 + st1 {v19.4h}, [x19], x17 cmp w10, #4 beq WriteEnd - st1 {v20.4h}, [x18], x17 + st1 {v20.4h}, [x19], x17 cmp w10, #5 beq WriteEnd - st1 {v21.4h}, [x18], x17 + st1 {v21.4h}, [x19], x17 cmp w10, #6 beq WriteEnd - st1 {v22.4h}, [x18], x17 + st1 {v22.4h}, [x19], x17 cmp w10, #7 beq WriteEnd - st1 {v23.4h}, [x18], x17 + st1 {v23.4h}, [x19], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4h}, [x18], x17 + st1 {v24.4h}, [x19], x17 cmp w10, #9 beq WriteEnd - st1 {v25.4h}, [x18], x17 + st1 {v25.4h}, [x19], x17 cmp w10, #10 beq WriteEnd - st1 {v26.4h}, [x18], x17 + st1 {v26.4h}, [x19], x17 cmp w10, #11 beq WriteEnd - st1 {v27.4h}, [x18], x17 + st1 {v27.4h}, [x19], x17 cmp w10, #12 beq WriteEnd - st1 {v28.4h}, [x18], x17 + st1 {v28.4h}, [x19], x17 cmp w10, #13 beq WriteEnd - st1 {v29.4h}, [x18], x17 + st1 {v29.4h}, [x19], x17 cmp w10, #14 beq WriteEnd - st1 {v30.4h}, [x18], x17 + st1 {v30.4h}, [x19], x17 cmp w10, #15 beq WriteEnd - st1 {v31.4h}, [x18], x17 + st1 {v31.4h}, [x19], x17 b WriteEnd Write5: - add x13, x18, #8 - st1 {v16.4h}, [x18], x17 + add x13, x19, #8 + st1 {v16.4h}, [x19], x17 st1 {v16.h}[4], [x13], x17 cmp w10, #1 beq WriteEnd - st1 {v17.4h}, [x18], x17 + st1 {v17.4h}, [x19], x17 st1 {v17.h}[4], [x13], x17 cmp w10, #2 beq WriteEnd - st1 {v18.4h}, [x18], x17 + st1 {v18.4h}, [x19], x17 st1 {v18.h}[4], [x13], x17 cmp w10, #3 beq WriteEnd - st1 {v19.4h}, [x18], x17 + st1 {v19.4h}, [x19], x17 st1 {v19.h}[4], [x13], x17 
cmp w10, #4 beq WriteEnd - st1 {v20.4h}, [x18], x17 + st1 {v20.4h}, [x19], x17 st1 {v20.h}[4], [x13], x17 cmp w10, #5 beq WriteEnd - st1 {v21.4h}, [x18], x17 + st1 {v21.4h}, [x19], x17 st1 {v21.h}[4], [x13], x17 cmp w10, #6 beq WriteEnd - st1 {v22.4h}, [x18], x17 + st1 {v22.4h}, [x19], x17 st1 {v22.h}[4], [x13], x17 cmp w10, #7 beq WriteEnd - st1 {v23.4h}, [x18], x17 + st1 {v23.4h}, [x19], x17 st1 {v23.h}[4], [x13], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4h}, [x18], x17 + st1 {v24.4h}, [x19], x17 st1 {v24.h}[4], [x13], x17 cmp w10, #9 beq WriteEnd - st1 {v25.4h}, [x18], x17 + st1 {v25.4h}, [x19], x17 st1 {v25.h}[4], [x13], x17 cmp w10, #10 beq WriteEnd - st1 {v26.4h}, [x18], x17 + st1 {v26.4h}, [x19], x17 st1 {v26.h}[4], [x13], x17 cmp w10, #11 beq WriteEnd - st1 {v27.4h}, [x18], x17 + st1 {v27.4h}, [x19], x17 st1 {v27.h}[4], [x13], x17 cmp w10, #12 beq WriteEnd - st1 {v28.4h}, [x18], x17 + st1 {v28.4h}, [x19], x17 st1 {v28.h}[4], [x13], x17 cmp w10, #13 beq WriteEnd - st1 {v29.4h}, [x18], x17 + st1 {v29.4h}, [x19], x17 st1 {v29.h}[4], [x13], x17 cmp w10, #14 beq WriteEnd - st1 {v30.4h}, [x18], x17 + st1 {v30.4h}, [x19], x17 st1 {v30.h}[4], [x13], x17 cmp w10, #15 beq WriteEnd - st1 {v31.4h}, [x18], x17 + st1 {v31.4h}, [x19], x17 st1 {v31.h}[4], [x13], x17 b WriteEnd Write6: - add x13, x18, #8 - add x14, x18, #10 - st1 {v16.4h}, [x18], x17 + add x13, x19, #8 + add x14, x19, #10 + st1 {v16.4h}, [x19], x17 st1 {v16.h}[4], [x13], x17 st1 {v16.h}[5], [x14], x17 cmp w10, #1 beq WriteEnd - st1 {v17.4h}, [x18], x17 + st1 {v17.4h}, [x19], x17 st1 {v17.h}[4], [x13], x17 st1 {v17.h}[5], [x14], x17 cmp w10, #2 beq WriteEnd - st1 {v18.4h}, [x18], x17 + st1 {v18.4h}, [x19], x17 st1 {v18.h}[4], [x13], x17 st1 {v18.h}[5], [x14], x17 cmp w10, #3 beq WriteEnd - st1 {v19.4h}, [x18], x17 + st1 {v19.4h}, [x19], x17 st1 {v19.h}[4], [x13], x17 st1 {v19.h}[5], [x14], x17 cmp w10, #4 beq WriteEnd - st1 {v20.4h}, [x18], x17 + st1 {v20.4h}, [x19], x17 st1 {v20.h}[4], [x13], x17 st1 
{v20.h}[5], [x14], x17 cmp w10, #5 beq WriteEnd - st1 {v21.4h}, [x18], x17 + st1 {v21.4h}, [x19], x17 st1 {v21.h}[4], [x13], x17 st1 {v21.h}[5], [x14], x17 cmp w10, #6 beq WriteEnd - st1 {v22.4h}, [x18], x17 + st1 {v22.4h}, [x19], x17 st1 {v22.h}[4], [x13], x17 st1 {v22.h}[5], [x14], x17 cmp w10, #7 beq WriteEnd - st1 {v23.4h}, [x18], x17 + st1 {v23.4h}, [x19], x17 st1 {v23.h}[4], [x13], x17 st1 {v23.h}[5], [x14], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4h}, [x18], x17 + st1 {v24.4h}, [x19], x17 st1 {v24.h}[4], [x13], x17 st1 {v24.h}[5], [x14], x17 cmp w10, #9 beq WriteEnd - st1 {v25.4h}, [x18], x17 + st1 {v25.4h}, [x19], x17 st1 {v25.h}[4], [x13], x17 st1 {v25.h}[5], [x14], x17 cmp w10, #10 beq WriteEnd - st1 {v26.4h}, [x18], x17 + st1 {v26.4h}, [x19], x17 st1 {v26.h}[4], [x13], x17 st1 {v26.h}[5], [x14], x17 cmp w10, #11 beq WriteEnd - st1 {v27.4h}, [x18], x17 + st1 {v27.4h}, [x19], x17 st1 {v27.h}[4], [x13], x17 st1 {v27.h}[5], [x14], x17 cmp w10, #12 beq WriteEnd - st1 {v28.4h}, [x18], x17 + st1 {v28.4h}, [x19], x17 st1 {v28.h}[4], [x13], x17 st1 {v28.h}[5], [x14], x17 cmp w10, #13 beq WriteEnd - st1 {v29.4h}, [x18], x17 + st1 {v29.4h}, [x19], x17 st1 {v29.h}[4], [x13], x17 st1 {v29.h}[5], [x14], x17 cmp w10, #14 beq WriteEnd - st1 {v30.4h}, [x18], x17 + st1 {v30.4h}, [x19], x17 st1 {v30.h}[4], [x13], x17 st1 {v30.h}[5], [x14], x17 cmp w10, #15 beq WriteEnd - st1 {v31.4h}, [x18], x17 + st1 {v31.4h}, [x19], x17 st1 {v31.h}[4], [x13], x17 st1 {v31.h}[5], [x14], x17 b WriteEnd Write7: - add x13, x18, #8 - add x14, x18, #10 - add x16, x18, #12 - st1 {v16.4h}, [x18], x17 + add x13, x19, #8 + add x14, x19, #10 + add x16, x19, #12 + st1 {v16.4h}, [x19], x17 st1 {v16.h}[4], [x13], x17 st1 {v16.h}[5], [x14], x17 st1 {v16.h}[6], [x16], x17 cmp w10, #1 beq WriteEnd - st1 {v17.4h}, [x18], x17 + st1 {v17.4h}, [x19], x17 st1 {v17.h}[4], [x13], x17 st1 {v17.h}[5], [x14], x17 st1 {v17.h}[6], [x16], x17 cmp w10, #2 beq WriteEnd - st1 {v18.4h}, [x18], x17 + st1 {v18.4h}, [x19], 
x17 st1 {v18.h}[4], [x13], x17 st1 {v18.h}[5], [x14], x17 st1 {v18.h}[6], [x16], x17 cmp w10, #3 beq WriteEnd - st1 {v19.4h}, [x18], x17 + st1 {v19.4h}, [x19], x17 st1 {v19.h}[4], [x13], x17 st1 {v19.h}[5], [x14], x17 st1 {v19.h}[6], [x16], x17 cmp w10, #4 beq WriteEnd - st1 {v20.4h}, [x18], x17 + st1 {v20.4h}, [x19], x17 st1 {v20.h}[4], [x13], x17 st1 {v20.h}[5], [x14], x17 st1 {v20.h}[6], [x16], x17 cmp w10, #5 beq WriteEnd - st1 {v21.4h}, [x18], x17 + st1 {v21.4h}, [x19], x17 st1 {v21.h}[4], [x13], x17 st1 {v21.h}[5], [x14], x17 st1 {v21.h}[6], [x16], x17 cmp w10, #6 beq WriteEnd - st1 {v22.4h}, [x18], x17 + st1 {v22.4h}, [x19], x17 st1 {v22.h}[4], [x13], x17 st1 {v22.h}[5], [x14], x17 st1 {v22.h}[6], [x16], x17 cmp w10, #7 beq WriteEnd - st1 {v23.4h}, [x18], x17 + st1 {v23.4h}, [x19], x17 st1 {v23.h}[4], [x13], x17 st1 {v23.h}[5], [x14], x17 st1 {v23.h}[6], [x16], x17 cmp w10, #8 beq WriteEnd - st1 {v24.4h}, [x18], x17 + st1 {v24.4h}, [x19], x17 st1 {v24.h}[4], [x13], x17 st1 {v24.h}[5], [x14], x17 st1 {v24.h}[6], [x16], x17 cmp w10, #9 beq WriteEnd - st1 {v25.4h}, [x18], x17 + st1 {v25.4h}, [x19], x17 st1 {v25.h}[4], [x13], x17 st1 {v25.h}[5], [x14], x17 st1 {v25.h}[6], [x16], x17 cmp w10, #10 beq WriteEnd - st1 {v26.4h}, [x18], x17 + st1 {v26.4h}, [x19], x17 st1 {v26.h}[4], [x13], x17 st1 {v26.h}[5], [x14], x17 st1 {v26.h}[6], [x16], x17 cmp w10, #11 beq WriteEnd - st1 {v27.4h}, [x18], x17 + st1 {v27.4h}, [x19], x17 st1 {v27.h}[4], [x13], x17 st1 {v27.h}[5], [x14], x17 st1 {v27.h}[6], [x16], x17 cmp w10, #12 beq WriteEnd - st1 {v28.4h}, [x18], x17 + st1 {v28.4h}, [x19], x17 st1 {v28.h}[4], [x13], x17 st1 {v28.h}[5], [x14], x17 st1 {v28.h}[6], [x16], x17 cmp w10, #13 beq WriteEnd - st1 {v29.4h}, [x18], x17 + st1 {v29.4h}, [x19], x17 st1 {v29.h}[4], [x13], x17 st1 {v29.h}[5], [x14], x17 st1 {v29.h}[6], [x16], x17 cmp w10, #14 beq WriteEnd - st1 {v30.4h}, [x18], x17 + st1 {v30.4h}, [x19], x17 st1 {v30.h}[4], [x13], x17 st1 {v30.h}[5], [x14], x17 st1 {v30.h}[6], 
[x16], x17 cmp w10, #15 beq WriteEnd - st1 {v31.4h}, [x18], x17 + st1 {v31.4h}, [x19], x17 st1 {v31.h}[4], [x13], x17 st1 {v31.h}[5], [x14], x17 st1 {v31.h}[6], [x16], x17 @@ -809,52 +810,52 @@ WriteC8: st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 b WriteEnd Write8: - st1 {v16.8h}, [x18], x17 + st1 {v16.8h}, [x19], x17 cmp w10, #1 beq WriteEnd - st1 {v17.8h}, [x18], x17 + st1 {v17.8h}, [x19], x17 cmp w10, #2 beq WriteEnd - st1 {v18.8h}, [x18], x17 + st1 {v18.8h}, [x19], x17 cmp w10, #3 beq WriteEnd - st1 {v19.8h}, [x18], x17 + st1 {v19.8h}, [x19], x17 cmp w10, #4 beq WriteEnd - st1 {v20.8h}, [x18], x17 + st1 {v20.8h}, [x19], x17 cmp w10, #5 beq WriteEnd - st1 {v21.8h}, [x18], x17 + st1 {v21.8h}, [x19], x17 cmp w10, #6 beq WriteEnd - st1 {v22.8h}, [x18], x17 + st1 {v22.8h}, [x19], x17 cmp w10, #7 beq WriteEnd - st1 {v23.8h}, [x18], x17 + st1 {v23.8h}, [x19], x17 cmp w10, #8 beq WriteEnd - st1 {v24.8h}, [x18], x17 + st1 {v24.8h}, [x19], x17 cmp w10, #9 beq WriteEnd - st1 {v25.8h}, [x18], x17 + st1 {v25.8h}, [x19], x17 cmp w10, #10 beq WriteEnd - st1 {v26.8h}, [x18], x17 + st1 {v26.8h}, [x19], x17 cmp w10, #11 beq WriteEnd - st1 {v27.8h}, [x18], x17 + st1 {v27.8h}, [x19], x17 cmp w10, #12 beq WriteEnd - st1 {v28.8h}, [x18], x17 + st1 {v28.8h}, [x19], x17 cmp w10, #13 beq WriteEnd - st1 {v29.8h}, [x18], x17 + st1 {v29.8h}, [x19], x17 cmp w10, #14 beq WriteEnd - st1 {v30.8h}, [x18], x17 + st1 {v30.8h}, [x19], x17 cmp w10, #15 beq WriteEnd - st1 {v31.8h}, [x18], x17 + st1 {v31.8h}, [x19], x17 WriteEnd: subs w10, w10, #16 // lhs row - 8 @@ -871,8 +872,9 @@ NoDstStep: bgt L1 End1: - sub sp, sp, #128 + sub sp, sp, #144 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + ldp x19, x20, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S index 1d2eb479bc..38699e37b8 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S +++ 
b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S @@ -21,30 +21,31 @@ // x9: writeMode asm_function MatmulFp16Neon64Opt - sub sp, sp, #80 + sub sp, sp, #96 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 ldr x8, [sp] ldr x9, [sp, #8] - mov x18, #32 // sizeof(float16_t) * 16 - mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth + mov x21, #32 // sizeof(float16_t) * 16 + mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth cbnz x9, NoC8Steps mov x11, x2 - mov x18, #16 - mul x16, x6, x18 // row * 8 * sizeof(float16_t) + mov x21, #16 + mul x16, x6, x21 // row * 8 * sizeof(float16_t) NoC8Steps: cmp x9, #2 bne NoWinoSteps - mov x18, #2 + mov x21, #2 mul x15, x7, x8 - mul x15, x15, x18 // kernel_size * col *sizeof(float16_t) - mov x18, #16 - mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t) + mul x15, x15, x21 // kernel_size * col *sizeof(float16_t) + mov x21, #16 + mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t) NoWinoSteps: - mov x18, #2 - mul x8, x8, x18 + mov x21, #2 + mul x8, x8, x21 LoopRowStart: cmp x6, #1 @@ -1221,9 +1222,9 @@ LoopRow: LoopColEnd: add x0, x0, x17 cbz x9, C8DstStep - mov x18, #2 - mul x18, x18, x7 - sub x11, x11, x18 + mov x21, #2 + mul x21, x21, x7 + sub x11, x11, x21 mov x2, x11 b NoDstStep C8DstStep: @@ -1233,8 +1234,9 @@ LoopColEnd: subs x6, x6, #16 bgt LoopRowStart - sub sp, sp, #80 + sub sp, sp, #96 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S index daaed9163a..029365b0a9 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S @@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16 mov x14, x1 // mat_b LoopN: mov x16, x0 // mat_a_m - sub x18, x5, x15 // ni + sub x22, 
x5, x15 // ni sub x19, x17, x3 // mi - mul x18, x18, x17 // ni * m + mul x22, x22, x17 // ni * m mov x11, x6 // in_channel - add x18, x18, x19 // (ni * m) + mi - mul x18, x18, x13 // x18 * channel_in * 2 - add x20, x2, x18 // dst + offset + add x22, x22, x19 // (ni * m) + mi + mul x22, x22, x13 // x22 * channel_in * 2 + add x20, x2, x22 // dst + offset cmp x11, #32 bge LoopC32 cmp x11, #16 diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S index df1d88750e..ccb782881d 100644 --- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S @@ -9,8 +9,8 @@ asm_function WinogradTransLeftFp16 -sub sp, sp, #32 -stp x19, x20, [sp], #32 +sub sp, sp, #16 +stp x19, x20, [sp], #16 mov x8, #8 // 4 * sizeof(float16) mul x8, x6, x8 @@ -46,16 +46,16 @@ LoopH: ld1 {v0.h}[2], [x17], x10 ld1 {v0.h}[3], [x17], x10 mov x11, x6 - mov x18, x17 - add x18, x14, x7 - add x16, x18, x7 + mov x20, x17 + add x20, x14, x7 + add x16, x20, x7 add x19, x16, x7 LoopLength4: ld1 {v16.4h}, [x2] ld1 {v20.4h}, [x14], #8 fmla v16.4h, v20.4h, v0.h[0] - ld1 {v21.4h}, [x18], #8 + ld1 {v21.4h}, [x20], #8 fmul v17.4h, v21.4h, v0.h[1] ld1 {v20.4h}, [x16], #8 fmla v16.4h, v20.4h, v0.h[2] @@ -81,14 +81,14 @@ LoopH: ld1 {v0.h}[1], [x17], x10 ld1 {v0.h}[2], [x17], x10 mov x11, x6 - mov x18, x17 - add x18, x14, x7 - add x16, x18, x7 + mov x20, x17 + add x20, x14, x7 + add x16, x20, x7 LoopLength3: ld1 {v16.4h}, [x2] ld1 {v20.4h}, [x14], #8 fmla v16.4h, v20.4h, v0.h[0] - ld1 {v21.4h}, [x18], #8 + ld1 {v21.4h}, [x20], #8 fmul v17.4h, v21.4h, v0.h[1] ld1 {v20.4h}, [x16], #8 fmla v16.4h, v20.4h, v0.h[2] @@ -132,6 +132,6 @@ LoopH: subs x4, x4, #1 bne LoopH - sub sp, sp, #32 - ldp x19, x20, [sp], #32 + sub sp, sp, #16 + ldp x19, x20, [sp], #16 ret diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S 
b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S index c889803691..73c1e517d7 100644 --- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S @@ -9,6 +9,9 @@ asm_function WinogradTransRightFp16 +sub sp, sp, #16 +stp x19, x20, [sp], #16 + mov x8, #8 // 4 * sizeof(float16) mul x8, x6, x8 mul x9, x5, x8 // step for S @@ -34,7 +37,7 @@ LoopH: cmp x12, #4 blt LoopKStart3 mov x16, x15 - mov x18, x4 + mov x19, x4 LoopK4: ld1 {v0.h}[0], [x13], x10 ld1 {v0.h}[1], [x13], x10 @@ -45,7 +48,7 @@ LoopH: add x14, x17, x8 add x16, x14, x8 - add x18, x16, x8 + add x19, x16, x8 LoopLength4: ld1 {v16.4h}, [x2] @@ -55,7 +58,7 @@ LoopH: fmul v17.4h, v21.4h, v0.h[1] ld1 {v20.4h}, [x16], #8 fmla v16.4h, v20.4h, v0.h[2] - ld1 {v21.4h}, [x18], #8 + ld1 {v21.4h}, [x19], #8 fmla v17.4h, v21.4h, v0.h[3] fadd v17.4h, v16.4h, v17.4h @@ -64,7 +67,7 @@ LoopH: bne LoopLength4 sub x2, x2, x8 sub x12, x12, #4 - mov x17, x18 + mov x17, x19 cmp x12, #4 bge LoopK4 @@ -98,7 +101,7 @@ LoopH: bne LoopLength3 sub x2, x2, x8 sub x12, x12, #3 - mov x17, x18 + mov x17, x19 cmp x12, #3 bge LoopK3 @@ -132,4 +135,7 @@ LoopH: subs x4, x4, #1 bne LoopH + sub sp, sp, #16 + ldp x19, x20, [sp], #16 + ret \ No newline at end of file diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S index 38a38433b1..5bc1e5095c 100644 --- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S +++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S @@ -66,7 +66,7 @@ L2: cmp w16, #0 beq End2 - mov x18, x1 // reload b ptr + mov x28, x1 // reload b ptr mov x19, x7 // reload bias ptr mov w20, w5 // reload depth dup v16.4s, wzr @@ -91,7 +91,7 @@ L3: LoopD16: ld1 {v0.16b, v1.16b}, [x17], #32 - ld1 {v2.16b, v3.16b}, [x18], #32 + ld1 {v2.16b, v3.16b}, [x28], #32 sdot v16.4s, v2.16b, v0.4b[0] sdot v18.4s, v2.16b, v0.4b[1] @@ -104,7 +104,7 @@ LoopD16: sdot v28.4s, v2.16b, v1.4b[2] sdot v30.4s, v2.16b, 
v1.4b[3] - ld1 {v6.16b, v7.16b}, [x18], #32 + ld1 {v6.16b, v7.16b}, [x28], #32 sdot v17.4s, v3.16b, v0.4b[0] sdot v19.4s, v3.16b, v0.4b[1] sdot v21.4s, v3.16b, v0.4b[2] @@ -126,7 +126,7 @@ LoopD16: sdot v28.4s, v6.16b, v5.4b[2] sdot v30.4s, v6.16b, v5.4b[3] - ld1 {v10.16b, v11.16b}, [x18], #32 + ld1 {v10.16b, v11.16b}, [x28], #32 sdot v17.4s, v7.16b, v4.4b[0] sdot v19.4s, v7.16b, v4.4b[1] sdot v21.4s, v7.16b, v4.4b[2] @@ -148,7 +148,7 @@ LoopD16: sdot v28.4s, v10.16b, v9.4b[2] sdot v30.4s, v10.16b, v9.4b[3] - ld1 {v14.16b, v15.16b}, [x18], #32 + ld1 {v14.16b, v15.16b}, [x28], #32 sdot v17.4s, v11.16b, v8.4b[0] sdot v19.4s, v11.16b, v8.4b[1] sdot v21.4s, v11.16b, v8.4b[2] @@ -187,7 +187,7 @@ LoopD4: beq End3 ld1 {v0.16b, v1.16b}, [x17], #32 - ld1 {v2.16b, v3.16b}, [x18], #32 + ld1 {v2.16b, v3.16b}, [x28], #32 sdot v16.4s, v2.16b, v0.4b[0] sdot v18.4s, v2.16b, v0.4b[1] diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S index fc3ef28b86..95f30fe123 100644 --- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S +++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S @@ -30,7 +30,7 @@ // x28: filter_zp asm_function MatmulInt8DpOpt - sub sp, sp, #208 + sub sp, sp, #224 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 stp x19, x20, [sp], #16 @@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt stp x23, x24, [sp], #16 stp x25, x26, [sp], #16 stp x27, x28, [sp], #16 + stp x29, x30, [sp], #16 ldr w8, [sp] ldr w9, [sp, #8] @@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt LoopRow: mov x16, x1 // reload rhs ptr mov x17, x4 // reload rhs col - mov x18, x7 // reload bias ptr + mov x29, x7 // reload bias ptr mov x25, x6 // reload input_sum ptr mov x27, x2 // reload dst ptr ldr x28, [sp, #64] // reload filter_zp @@ -113,7 +114,7 @@ LoopRow: Bias: cbz x7, NoReadBias - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64 add v16.4s, 
v16.4s, v0.4s add v17.4s, v17.4s, v1.4s add v18.4s, v18.4s, v2.4s @@ -423,8 +424,8 @@ LoopRow: BiasHalf: cbz x7, NoReadBiasHalf - ld1 {v0.4s, v1.4s}, [x18] - add x18, x18, #64 + ld1 {v0.4s, v1.4s}, [x29] + add x29, x29, #64 add v16.4s, v16.4s, v0.4s add v17.4s, v17.4s, v1.4s add v20.4s, v20.4s, v0.4s @@ -612,8 +613,8 @@ LoopRow: BiasQuarter: cbz x7, NoReadBiasQuarter - ld1 {v0.4s}, [x18] - add x18, x18, #64 + ld1 {v0.4s}, [x29] + add x29, x29, #64 add v16.4s, v16.4s, v0.4s add v20.4s, v20.4s, v0.4s add v24.4s, v24.4s, v0.4s @@ -1072,7 +1073,7 @@ LoopColEnd: b LoopRow LoopRowEnd: - sub sp, sp, #208 + sub sp, sp, #224 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 ldp x19, x20, [sp], #16 @@ -1080,5 +1081,6 @@ LoopRowEnd: ldp x23, x24, [sp], #16 ldp x25, x26, [sp], #16 ldp x27, x28, [sp], #16 + ldp x29, x30, [sp], #16 ret #endif diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S index 03342a3986..e769ae4185 100644 --- a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S +++ b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S @@ -20,9 +20,10 @@ // x7: bias asm_function MatMulOptR4Int8Neon64 - sub sp, sp, #128 + sub sp, sp, #144 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + stp x19, x20, [sp], #16 mov w15, #0 // b col index mov w16, #0 // a row index @@ -40,7 +41,7 @@ L2: cmp w16, w3 beq End2 - mov x18, x1 // reload b ptr + mov x19, x1 // reload b ptr mov x10, x7 // reload bias ptr mov w11, w5 // reload depth dup v16.4s, wzr @@ -67,10 +68,10 @@ L3: ld1 {v1.16b}, [x17], #16 ld1 {v2.16b}, [x17], #16 ld1 {v3.16b}, [x17], #16 - ld1 {v4.16b}, [x18], #16 - ld1 {v5.16b}, [x18], #16 - ld1 {v6.16b}, [x18], #16 - ld1 {v7.16b}, [x18], #16 + ld1 {v4.16b}, [x19], #16 + ld1 {v5.16b}, [x19], #16 + ld1 {v6.16b}, [x19], #16 + ld1 {v7.16b}, [x19], #16 sdot v16.4s, v4.16b, v0.16b sdot v17.4s, v5.16b, v0.16b @@ -135,8 +136,9 
@@ End2: b L1 End1: - sub sp, sp, #128 + sub sp, sp, #144 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ldp x19, x20, [sp], #16 ret #endif