!14676 [MS][LITE][Develop] Remove use of x18 on Apple devices

From: @lx0095
Reviewed-by: @hangangqiang, @zhang_xue_tong
Signed-off-by: @zhang_xue_tong
pull/14676/MERGE
Committed by mindspore-ci-bot via Gitee
commit 2513ed1ba7

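Background for the change: on Apple's AArch64 platform ABI, x18 is the reserved platform register, so hand-written assembly must not use it as a scratch register. The patch therefore rewrites every kernel that touched x18 to use a callee-saved register instead (x19–x29, whichever is free in that routine). Per AAPCS64 those registers must be preserved, so each affected prologue also reserves 16 more bytes of stack and stores one extra register pair, and each epilogue restores it. A minimal sketch of the pattern, following the save/restore style already used in these files (the routine name and the choice of x21/x22 are illustrative, not taken from the diff):

    asm_function ExampleKernel        // hypothetical routine name
        sub sp, sp, #32               // was #16: 16 extra bytes for the new pair
        stp x19, x20, [sp], #16       // pair that was already being preserved
        stp x21, x22, [sp], #16       // new pair; x21 takes over the old x18 role
        mov x21, #48                  // previously: mov x18, #48
        mul x17, x5, x21              // previously: mul x17, x5, x18
        // ... kernel body, with every former x18 use rewritten to x21 ...
        sub sp, sp, #32               // step back over the saved area
        ldp x19, x20, [sp], #16
        ldp x21, x22, [sp], #16
        ret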
@ -28,11 +28,11 @@ asm_function AdderFloatNeon64
ldr x8, [sp]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x20, #48 // sizeof(float) * 12
mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x18, #4
mul x8, x8, x18
mov x20, #4
mul x8, x8, x20
LoopRowStart:
cmp x6, #4
@ -595,9 +595,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x20, #4
mul x20, x20, x7
sub x11, x11, x20
mov x2, x11
subs x6, x6, #12
bgt LoopRowStart

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Neon64
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ld1 {v11.8b}, [x16], x4
ld1 {v13.8b}, [x17], x4
ld1 {v14.8b}, [x17], x4
ld1 {v15.8b}, [x17], x4
ld1 {v17.8b}, [x18], x4
ld1 {v18.8b}, [x18], x4
ld1 {v19.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
ld1 {v18.8b}, [x25], x4
ld1 {v19.8b}, [x25], x4
ld1 {v21.4s}, [x3]
ld1 {v22.4s}, [x19]
@ -123,13 +124,13 @@ HEIGHT1_LOOP:
ld1 {v16.8b}, [x17]
smlal v23.4s, v0.4h, v10.4h
smlal2 v24.4s, v0.8h, v10.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
add x1, x1, x21
ssubl v12.8h, v12.8b, v25.8b
smlal v21.4s, v1.4h, v10.4h
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
smlal2 v22.4s, v1.8h, v10.8h
ld1 {v9.8b}, [x16], x4
ssubl v16.8h, v16.8b, v25.8b
@ -159,17 +160,17 @@ HEIGHT1_LOOP:
smlal2 v24.4s, v5.8h, v16.8h
smlal v21.4s, v6.4h, v17.4h
smlal2 v22.4s, v6.8h, v17.8h
ld1 {v17.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
smlal v23.4s, v6.4h, v18.4h
smlal2 v24.4s, v6.8h, v18.8h
smlal v21.4s, v7.4h, v18.4h
smlal2 v22.4s, v7.8h, v18.8h
ld1 {v18.8b}, [x18], x4
ld1 {v18.8b}, [x25], x4
smlal v23.4s, v7.4h, v19.4h
smlal2 v24.4s, v7.8h, v19.8h
smlal v21.4s, v8.4h, v19.4h
smlal2 v22.4s, v8.8h, v19.8h
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
smlal v23.4s, v8.4h, v20.4h
smlal2 v24.4s, v8.8h, v20.8h
@ -278,7 +279,7 @@ WIDTH2_LEFT:
smlal2 v24.4s, v1.8h, v11.8h
smlal v21.4s, v2.4h, v11.4h
smlal2 v22.4s, v2.8h, v11.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
smlal v23.4s, v2.4h, v12.4h
smlal2 v24.4s, v2.8h, v12.8h
smlal v21.4s, v3.4h, v13.4h
@ -443,12 +444,13 @@ OUTZP3:
st1 {v21.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Stride2
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ssubl v9.8h, v9.8b, v28.8b
@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2
ssubl v14.8h, v14.8b, v28.8b
ld1 {v16.8b}, [x17], x4
ssubl v15.8h, v15.8b, v28.8b
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
ssubl v16.8h, v16.8b, v28.8b
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
ssubl v19.8h, v19.8b, v28.8b
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
ssubl v21.8h, v21.8b, v28.8b
@ -108,7 +109,7 @@ HEIGHT1_LOOP:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -117,7 +118,7 @@ HEIGHT1_LOOP:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
mov v9.16b, v13.16b
@ -157,12 +158,12 @@ HEIGHT1_LOOP:
smlal2 v27.4s, v6.8h, v21.8h
smlal v24.4s, v7.4h, v20.4h
smlal2 v25.4s, v7.8h, v20.8h
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
smlal v26.4s, v7.4h, v22.4h
smlal2 v27.4s, v7.8h, v22.8h
smlal v24.4s, v8.4h, v21.4h
smlal2 v25.4s, v8.8h, v21.8h
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
smlal v26.4s, v8.4h, v23.4h
ssubl v21.8h, v21.8b, v28.8b
@ -260,7 +261,7 @@ WIDTH2_LEFT:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -269,7 +270,7 @@ WIDTH2_LEFT:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
ssubl v23.8h, v23.8b, v28.8b
@ -452,11 +453,12 @@ OUTZP3:
st1 {v24.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -19,12 +19,13 @@ asm_function ConvDwFp32Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -72,7 +73,7 @@ asm_function ConvDwFp32Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -109,7 +110,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v14.4s, v22.4s, v25.4s
fmla v15.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -192,7 +193,7 @@ asm_function ConvDwFp32Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -213,7 +214,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -261,13 +262,13 @@ asm_function ConvDwFp32Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.4s}, [x22], x13
ld1 {v25.4s}, [x17], #16
fmla v0.4s, v16.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -290,11 +291,12 @@ asm_function ConvDwFp32Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -13,8 +13,9 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect3x3
sub sp, sp, #16
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3
ldp x12, x13, [x1]
ldp x14, x15, [x1, #16]
ldp x16, x17, [x1, #32]
ldp x18, x19, [x1, #48]
ldp x21, x19, [x1, #48]
ldr x20, [x1, #64]
mov x9, x2
mov x10, x3
@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3
cmp x5, #0
bgt LoopPixel
End:
sub sp, sp, #16
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

@ -13,17 +13,18 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect5x5
sub sp, sp, #160
sub sp, sp, #176
stp x19, x20, [sp, #64]
stp x21, x22, [sp, #80]
stp x23, x24, [sp, #96]
stp x25, x26, [sp, #112]
stp x27, x28, [sp, #128]
stp x29, x30, [sp, #144]
ldrb w8, [sp, #160]
ldrb w8, [sp, #176]
stp x2, x3, [sp]
stp x4, x6, [sp, #16]
stp x7, x8, [sp, #32]
stp x0, x1, [sp, #160]
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5
ldp x12, x13, [x1, #48]
ldp x14, x15, [x1, #64]
ldp x16, x17, [x1, #80]
ldp x18, x19, [x1, #96]
ldp x0, x19, [x1, #96]
ldp x20, x21, [x1, #112]
ldp x22, x23, [x1, #128]
ldp x24, x25, [x1, #144]
@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5
RELU:
fmax v29.4s, v29.4s, v30.4s
WRITE:
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
ldr x4, [sp, #56]
ld1 {v29.4s}, [x4], #16
@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5
LeftWrite:
cmp x2, #4
bne Write3
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
b NextPixel
Write3:
sxtw x2, w2
tbnz w2, #1, Write2
tbnz w2, #0, Write1
Write2:
st1 {v29.2s}, [x0], #8
ldr x4, [sp, #160]
st1 {v29.2s}, [x4], #8
str x4, [sp, #160]
ext v29.16b, v29.16b, v29.16b, #8
tbz w2, #0, NextPixel
Write1:
str s29, [x0], #4
ldr x4, [sp, #160]
str s29, [x4], #4
str x4, [sp, #160]
NextPixel:
ldr x2, [sp, #24]
@ -279,6 +288,6 @@ End:
ldp x25, x26, [sp, #112]
ldp x27, x28, [sp, #128]
ldp x29, x30, [sp, #144]
add sp, sp, #160
add sp, sp, #176
ret
#endif

@ -22,12 +22,13 @@ asm_function ConvDwInt8Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -51,9 +52,9 @@ asm_function ConvDwInt8Center
ld1 {v24.4s}, [x17], #16
ld1 {v25.4s}, [x17], #16
ldr x18, [sp, #80] // right shift
ld1 {v26.4s}, [x18], #16
ld1 {v27.4s}, [x18], #16
ldr x25, [sp, #80] // right shift
ld1 {v26.4s}, [x25], #16
ld1 {v27.4s}, [x25], #16
ldr x19, [sp, #88] // acc_min
ld1 {v28.4s}, [x19], #16
@ -90,7 +91,7 @@ asm_function ConvDwInt8Center
mov v6.16b, v17.16b
mov v7.16b, v18.16b
LoopKh4:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw4:
mov x22, x21
@ -116,7 +117,7 @@ asm_function ConvDwInt8Center
smlal v6.4s, v8.4h, v16.4h
smlal2 v7.4s, v8.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw4
add x16, x16, x12
@ -194,15 +195,15 @@ asm_function ConvDwInt8Center
mov x16, x3
add x17, x16, x9
add x18, x17, x9
add x21, x18, x9
add x25, x17, x9
add x21, x25, x9
st1 {v0.s}[0], [x16], #4
st1 {v1.s}[0], [x16], #4
st1 {v2.s}[0], [x17], #4
st1 {v3.s}[0], [x17], #4
st1 {v4.s}[0], [x18], #4
st1 {v5.s}[0], [x18], #4
st1 {v4.s}[0], [x25], #4
st1 {v5.s}[0], [x25], #4
st1 {v6.s}[0], [x21], #4
st1 {v7.s}[0], [x21], #4
@ -221,7 +222,7 @@ asm_function ConvDwInt8Center
mov v0.16b, v17.16b
mov v1.16b, v18.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v15.8b}, [x22], x13
@ -229,7 +230,7 @@ asm_function ConvDwInt8Center
ld1 {v16.8h}, [x17], #16
smlal v0.4s, v14.4h, v16.4h
smlal2 v1.4s, v14.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -271,11 +272,12 @@ asm_function ConvDwInt8Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -47,11 +47,11 @@ asm_function ConvSwFp32Center
LoopH:
mov x17, x1
mov x18, x5
mov x28, x5
mov x3, x0
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
blt LoopW8
LoopW16:
@ -244,12 +244,12 @@ asm_function ConvSwFp32Center
st1 {v14.4s}, [x3], x9
st1 {v15.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #16
cmp x18, #0
sub x28, x28, #16
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
bge LoopW16
LoopW8:
mov x19, #8
@ -369,10 +369,10 @@ asm_function ConvSwFp32Center
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #8
cmp x18, #0
sub x28, x28, #8
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
bge LoopW8
LoopW:
mov x20, x17
@ -427,7 +427,7 @@ asm_function ConvSwFp32Center
Write:
st1 {v0.4s}, [x3], x9
add x17, x17, x12
subs x18, x18, #1
subs x28, x28, #1
bne LoopW
LoopWEnd:
add x0, x0, x8

@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.4s}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

File diff suppressed because it is too large.

@ -21,31 +21,32 @@
// x9: writeMode
asm_function MatmulFloatNeon64Opt
sub sp, sp, #144
sub sp, sp, #160
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x21, #48 // sizeof(float) * 12
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #32
mul x16, x6, x18 // row * 8 * sizeof(float)
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #4
mov x21, #4
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float)
mov x18, #32
mul x16, x8, x18 // kernel_size * 8 * sizeof(float)
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x18, #4
mul x8, x8, x18
mov x21, #4
mul x8, x8, x21
LoopRowStart:
cmp x6, #4
@ -1117,9 +1118,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1129,9 +1130,10 @@ LoopColEnd:
subs x6, x6, #12
bgt LoopRowStart
sub sp, sp, #144
sub sp, sp, #160
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

@ -67,7 +67,7 @@ L2:
cmp w16, #0
beq End2
mov x18, x1 // reload b ptr
mov x28, x1 // reload b ptr
mov x19, x7 // reload bias ptr
mov w20, w5 // reload depth
dup v16.4s, wzr
@ -94,10 +94,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x28], #16
ld1 {v5.16b}, [x28], #16
ld1 {v6.16b}, [x28], #16
ld1 {v7.16b}, [x28], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b

@ -30,7 +30,7 @@
// x28: filter_zp
asm_function MatmulInt8Opt
sub sp, sp, #208
sub sp, sp, #224
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
@ -38,6 +38,7 @@ asm_function MatmulInt8Opt
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
stp x27, x28, [sp], #16
stp x29, x30, [sp], #16
ldr w8, [sp]
ldr w9, [sp, #8]
@ -55,7 +56,7 @@ asm_function MatmulInt8Opt
LoopRow:
mov x16, x1 // reload rhs ptr
mov x17, x4 // reload rhs col
mov x18, x7 // reload bias ptr
mov x29, x7 // reload bias ptr
mov x27, x2 // reload dst ptr
ldr x28, [sp, #64] // reload filter_zp
@ -158,7 +159,7 @@ LoopRow:
Bias:
cbz x7, NoBias
ld1 {v15.4s}, [x18], #16
ld1 {v15.4s}, [x29], #16
add v16.4s, v16.4s, v15.4s
add v17.4s, v17.4s, v15.4s
add v18.4s, v18.4s, v15.4s
@ -330,7 +331,7 @@ LoopColEnd:
b LoopRow
LoopRowEnd:
sub sp, sp, #208
sub sp, sp, #224
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
@ -338,5 +339,6 @@ LoopRowEnd:
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ldp x27, x28, [sp], #16
ldp x29, x30, [sp], #16
ret
#endif

@ -20,9 +20,10 @@
// x7: bias
asm_function MatMulR4Int8Neon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
mov w15, #0 // b col index
mov w16, #0 // a row index
@ -40,7 +41,7 @@ L2:
cmp w16, w3
beq End2
mov x18, x1 // reload b ptr
mov x19, x1 // reload b ptr
mov x10, x7 // reload bias ptr
mov w11, w5 // reload depth
dup v16.4s, wzr
@ -67,10 +68,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x19], #16
ld1 {v5.16b}, [x19], #16
ld1 {v6.16b}, [x19], #16
ld1 {v7.16b}, [x19], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b
@ -172,8 +173,9 @@ End2:
b L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd
mov x14, x1 // mat_b
LoopN:
mov x16, x0 // mat_a_m
sub x18, x5, x15 // ni
sub x22, x5, x15 // ni
sub x19, x17, x3 // mi
mul x18, x18, x17 // ni * m
mul x22, x22, x17 // ni * m
mov x11, x6 // in_channel
add x18, x18, x19 // (ni * m) + mi
mul x18, x18, x7 // x18 * c4_channel
add x20, x2, x18 // dst + offset
add x22, x22, x19 // (ni * m) + mi
mul x22, x22, x7 // x22 * c4_channel
add x20, x2, x22 // dst + offset
cmp x11, #16
bge LoopC16
cmp x11, #8

@ -1,6 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15

@ -55,16 +55,16 @@ LoopH:
ld1 {v0.s}[2], [x17], x10
ld1 {v0.s}[3], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
add x19, x16, x7
LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
@ -90,14 +90,14 @@ LoopH:
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]

@ -18,6 +18,9 @@ asm_function WinogradTransRight
//x5: k
//x6: length
sub sp, sp, #16
stp x19, x20, [sp], #16
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8
mul x9, x5, x8 // step for S
@ -43,7 +46,7 @@ LoopH:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x18, x4
mov x19, x4
LoopK4:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
@ -54,7 +57,7 @@ LoopH:
add x14, x17, x8
add x16, x14, x8
add x18, x16, x8
add x19, x16, x8
LoopLength4:
ld1 {v16.4s}, [x2]
@ -64,7 +67,7 @@ LoopH:
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
@ -73,7 +76,7 @@ LoopH:
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x18
mov x17, x19
cmp x12, #4
bge LoopK4
@ -107,7 +110,7 @@ LoopH:
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
mov x17, x18
mov x17, x19
cmp x12, #3
bge LoopK3
@ -141,5 +144,7 @@ LoopH:
subs x4, x4, #1
bne LoopH
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret
#endif

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global ConvDwFp32Avx3x3
@ -31,7 +32,7 @@
// 56: input_stride
// 64: relu
// 72: relu6
ConvDwFp32Avx3x3:
asm_function ConvDwFp32Avx3x3
pushq %r15
pushq %r14
pushq %r13

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global MatmulFloatAvxOpt
@ -34,7 +35,7 @@
// 72: stride
// 80: writeMode
MatmulFloatAvxOpt:
asm_function MatmulFloatAvxOpt
// rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention
pushq %r15
pushq %r14

@ -19,12 +19,13 @@ asm_function ConvDwFp16Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDwFp16Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -108,7 +109,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v14.8h, v22.8h, v25.8h
fmla v15.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -191,7 +192,7 @@ asm_function ConvDwFp16Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -212,7 +213,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -260,13 +261,13 @@ asm_function ConvDwFp16Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.8h}, [x22], x13
ld1 {v25.8h}, [x17], #16
fmla v0.8h, v16.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -289,11 +290,12 @@ asm_function ConvDwFp16Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.8h}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.8h}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center
st1 {v0.8h}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ r29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #128
sub sp, sp, #144
// performance between storing 4 registers at the same time and separately storing them on in-order cores
// is not tested yet
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
ldr x8, [sp, #0]
ldr x9, [sp, #8]
@ -548,87 +549,87 @@ IndirectGemmStart:
b WriteEnd
Write7:
add x17, x15, #8
add x18, x15, #10
add x19, x15, #10
add x16, x15, #12
st1 {v16.4h}, [x15], x7
ins v0.s[0], v16.s[2]
st1 {v0.h}[0], [x17], x7
st1 {v0.h}[1], [x18], x7
st1 {v0.h}[1], [x19], x7
st1 {v16.h}[6], [x16], x7
st1 {v17.4h}, [x15], x7
ins v1.s[0], v17.s[2]
st1 {v1.h}[0], [x17], x7
st1 {v1.h}[1], [x18], x7
st1 {v1.h}[1], [x19], x7
st1 {v17.h}[6], [x16], x7
st1 {v18.4h}, [x15], x7
ins v2.s[0], v18.s[2]
st1 {v2.h}[0], [x17], x7
st1 {v2.h}[1], [x18], x7
st1 {v2.h}[1], [x19], x7
st1 {v18.h}[6], [x16], x7
st1 {v19.4h}, [x15], x7
ins v3.s[0], v19.s[2]
st1 {v3.h}[0], [x17], x7
st1 {v3.h}[1], [x18], x7
st1 {v3.h}[1], [x19], x7
st1 {v19.h}[6], [x16], x7
st1 {v20.4h}, [x15], x7
ins v4.s[0], v20.s[2]
st1 {v4.h}[0], [x17], x7
st1 {v4.h}[1], [x18], x7
st1 {v4.h}[1], [x19], x7
st1 {v20.h}[6], [x16], x7
st1 {v21.4h}, [x15], x7
ins v5.s[0], v21.s[2]
st1 {v5.h}[0], [x17], x7
st1 {v5.h}[1], [x18], x7
st1 {v5.h}[1], [x19], x7
st1 {v21.h}[6], [x16], x7
st1 {v22.4h}, [x15], x7
ins v6.s[0], v22.s[2]
st1 {v6.h}[0], [x17], x7
st1 {v6.h}[1], [x18], x7
st1 {v6.h}[1], [x19], x7
st1 {v22.h}[6], [x16], x7
st1 {v23.4h}, [x15], x7
ins v7.s[0], v23.s[2]
st1 {v7.h}[0], [x17], x7
st1 {v7.h}[1], [x18], x7
st1 {v7.h}[1], [x19], x7
st1 {v23.h}[6], [x16], x7
st1 {v24.4h}, [x15], x7
ins v8.s[0], v24.s[2]
st1 {v8.h}[0], [x17], x7
st1 {v8.h}[1], [x18], x7
st1 {v8.h}[1], [x19], x7
st1 {v24.h}[6], [x16], x7
st1 {v25.4h}, [x15], x7
ins v9.s[0], v25.s[2]
st1 {v9.h}[0], [x17], x7
st1 {v9.h}[1], [x18], x7
st1 {v9.h}[1], [x19], x7
st1 {v25.h}[6], [x16], x7
st1 {v26.4h}, [x15], x7
ins v10.s[0], v26.s[2]
st1 {v10.h}[0], [x17], x7
st1 {v10.h}[1], [x18], x7
st1 {v10.h}[1], [x19], x7
st1 {v26.h}[6], [x16], x7
st1 {v27.4h}, [x15], x7
ins v11.s[0], v27.s[2]
st1 {v11.h}[0], [x17], x7
st1 {v11.h}[1], [x18], x7
st1 {v11.h}[1], [x19], x7
st1 {v27.h}[6], [x16], x7
st1 {v28.4h}, [x15], x7
ins v12.s[0], v28.s[2]
st1 {v12.h}[0], [x17], x7
st1 {v12.h}[1], [x18], x7
st1 {v12.h}[1], [x19], x7
st1 {v28.h}[6], [x16], x7
st1 {v29.4h}, [x15], x7
ins v13.s[0], v29.s[2]
st1 {v13.h}[0], [x17], x7
st1 {v13.h}[1], [x18], x7
st1 {v13.h}[1], [x19], x7
st1 {v29.h}[6], [x16], x7
st1 {v30.4h}, [x15], x7
ins v14.s[0], v30.s[2]
st1 {v14.h}[0], [x17], x7
st1 {v14.h}[1], [x18], x7
st1 {v14.h}[1], [x19], x7
st1 {v30.h}[6], [x16], x7
st1 {v31.4h}, [x15]
ins v15.s[0], v31.s[2]
st1 {v15.h}[0], [x17]
st1 {v15.h}[1], [x18]
st1 {v15.h}[1], [x19]
st1 {v31.h}[6], [x16]
add x0, x0, #14
b WriteEnd
@ -661,9 +662,10 @@ IndirectGemmStart:
NoStepForward:
bgt LoopOc
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

File diff suppressed because it is too large.

@ -21,30 +21,31 @@
// x9: writeMode
asm_function MatmulFp16Neon64Opt
sub sp, sp, #80
sub sp, sp, #96
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #32 // sizeof(float16_t) * 16
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
mov x21, #32 // sizeof(float16_t) * 16
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #16
mul x16, x6, x18 // row * 8 * sizeof(float16_t)
mov x21, #16
mul x16, x6, x21 // row * 8 * sizeof(float16_t)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #2
mov x21, #2
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float16_t)
mov x18, #16
mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t)
mul x15, x15, x21 // kernel_size * col *sizeof(float16_t)
mov x21, #16
mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t)
NoWinoSteps:
mov x18, #2
mul x8, x8, x18
mov x21, #2
mul x8, x8, x21
LoopRowStart:
cmp x6, #1
@ -1221,9 +1222,9 @@ LoopRow:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #2
mul x18, x18, x7
sub x11, x11, x18
mov x21, #2
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1233,8 +1234,9 @@ LoopColEnd:
subs x6, x6, #16
bgt LoopRowStart
sub sp, sp, #80
sub sp, sp, #96
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

Some files were not shown because too many files have changed in this diff.
