diff --git a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
index df9e94fffa..621fd6eeac 100644
--- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
@@ -28,11 +28,11 @@ asm_function AdderFloatNeon64
 
     ldr x8, [sp]
 
-    mov x18, #48 // sizeof(float) * 12
-    mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    mov x20, #48 // sizeof(float) * 12; x20 takes over from x18 as the scratch multiplier. NOTE(review): x20 is callee-saved under AAPCS64 - confirm the prologue (outside this hunk) saves it
+    mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
 
-    mov x18, #4
-    mul x8, x8, x18
+    mov x20, #4 // reuse x20 as the constant 4
+    mul x8, x8, x20 // x8 (loaded from [sp] above) *= 4
 
 LoopRowStart:
     cmp x6, #4
@@ -595,9 +595,9 @@ LoopRow4:
 
 LoopColEnd:
         add x0, x0, x17
-        mov x18, #4
-        mul x18, x18, x7
-        sub x11, x11, x18
+        mov x20, #4 // x20 is the scratch register here (was x18)
+        mul x20, x20, x7 // x20 = 4 * x7
+        sub x11, x11, x20 // x11 -= 4 * x7
         mov x2, x11
         subs x6, x6, #12
         bgt LoopRowStart
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
index 3b46f4d810..391401e88f 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
@@ -33,12 +33,13 @@
 // w16: per_channel
 
 asm_function ConvDw3x3Int8Neon64
-  sub sp, sp, #176
+  sub sp, sp, #192 // save area: 2 x 64 bytes (v8-v15) + 4 x 16 bytes (x19-x26); grew by 16 for x25/x26
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   stp x19, x20, [sp], #16
   stp x21, x22, [sp], #16
   stp x23, x24, [sp], #16
+  stp x25, x26, [sp], #16 // save x25/x26 (x25 replaces x18 in this function); the post-increments total 192, so sp is back at its entry value
 
   ldr x8, [sp]
   ldr x9, [sp, #8]
@@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64
 
   mov x16, x1
   add x17, x16, x5
-  add x18, x17, x5
+  add x25, x17, x5 // x25 = x1 + 2 * x5: third load pointer (was x18)
   ld1 {v9.8b}, [x16], x4
   ld1 {v10.8b}, [x16], x4
   ld1 {v11.8b}, [x16], x4
   ld1 {v13.8b}, [x17], x4
   ld1 {v14.8b}, [x17], x4
   ld1 {v15.8b}, [x17], x4
-  ld1 {v17.8b}, [x18], x4
-  ld1 {v18.8b}, [x18], x4
-  ld1 {v19.8b}, [x18], x4
+  ld1 {v17.8b}, [x25], x4 // three 8-byte loads through x25, each followed by x25 += x4
+  ld1 {v18.8b}, [x25], x4
+  ld1 {v19.8b}, [x25], x4
 
   ld1 {v21.4s}, [x3]
   ld1 {v22.4s}, [x19]
@@ -123,13 +124,13 @@ HEIGHT1_LOOP:
   ld1 {v16.8b}, [x17]
   smlal v23.4s, v0.4h, v10.4h
   smlal2 v24.4s, v0.8h, v10.8h
-  ld1 {v20.8b}, [x18]
+  ld1 {v20.8b}, [x25] // no post-increment: x25 is rebuilt from x1 a few lines down
   add x1, x1, x21  
   ssubl v12.8h, v12.8b, v25.8b
   smlal v21.4s, v1.4h, v10.4h
   mov x16, x1
   add x17, x16, x5
-  add x18, x17, x5
+  add x25, x17, x5 // x25 = advanced x1 + 2 * x5 (was x18)
   smlal2 v22.4s, v1.8h, v10.8h
   ld1 {v9.8b}, [x16], x4
   ssubl v16.8h, v16.8b, v25.8b
@@ -159,17 +160,17 @@ HEIGHT1_LOOP:
   smlal2 v24.4s, v5.8h, v16.8h
   smlal v21.4s, v6.4h, v17.4h
   smlal2 v22.4s, v6.8h, v17.8h
-  ld1 {v17.8b}, [x18], x4
+  ld1 {v17.8b}, [x25], x4 // v17 is reloaded only after the smlal/smlal2 above have consumed it
   smlal v23.4s, v6.4h, v18.4h
   smlal2 v24.4s, v6.8h, v18.8h
   smlal v21.4s, v7.4h, v18.4h
   smlal2 v22.4s, v7.8h, v18.8h
-  ld1 {v18.8b}, [x18], x4
+  ld1 {v18.8b}, [x25], x4 // same pattern for v18: reload through x25 (was x18), then x25 += x4
   smlal v23.4s, v7.4h, v19.4h
   smlal2 v24.4s, v7.8h, v19.8h
   smlal v21.4s, v8.4h, v19.4h
   smlal2 v22.4s, v8.8h, v19.8h
-  ld1 {v19.8b}, [x18], x4
+  ld1 {v19.8b}, [x25], x4 // and for v19
   smlal v23.4s, v8.4h, v20.4h
   smlal2 v24.4s, v8.8h, v20.8h
 
@@ -278,7 +279,7 @@ WIDTH2_LEFT:
   smlal2 v24.4s, v1.8h, v11.8h
   smlal v21.4s, v2.4h, v11.4h
   smlal2 v22.4s, v2.8h, v11.8h
-  ld1 {v20.8b}, [x18]
+  ld1 {v20.8b}, [x25] // load 8 bytes through x25 (was x18); the pointer is left unchanged
   smlal v23.4s, v2.4h, v12.4h
   smlal2 v24.4s, v2.8h, v12.8h
   smlal v21.4s, v3.4h, v13.4h
@@ -443,12 +444,13 @@ OUTZP3:
   st1 {v21.8b}, [x0], x6
 
 End:
-  sub sp, sp, #176
+  sub sp, sp, #192 // step back to the base of the 192-byte save area written by the prologue
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   ldp x19, x20, [sp], #16
   ldp x21, x22, [sp], #16
   ldp x23, x24, [sp], #16
+  ldp x25, x26, [sp], #16 // restore x25/x26; the post-increments total 192, so sp ends at its entry value
   ret
 
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
index 8f843192db..2162ade6bb 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
@@ -33,12 +33,13 @@
 // w16: per_channel
 
 asm_function ConvDw3x3Int8Stride2
-    sub sp, sp, #176
+    sub sp, sp, #192 // save area: 2 x 64 bytes (v8-v15) + 4 x 16 bytes (x19-x26); grew by 16 for x25/x26
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
     stp x21, x22, [sp], #16
     stp x23, x24, [sp], #16
+    stp x25, x26, [sp], #16 // save x25/x26 (x25 replaces x18 in this function); sp is back at its entry value afterwards
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
@@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2
 
     mov x16, x1
     add x17, x16, x5
-    add x18, x17, x5
+    add x25, x17, x5 // x25 = x1 + 2 * x5: third load pointer (was x18)
     ld1 {v9.8b}, [x16], x4
     ld1 {v10.8b}, [x16], x4
     ssubl v9.8h, v9.8b, v28.8b
@@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2
     ssubl v14.8h, v14.8b, v28.8b
     ld1 {v16.8b}, [x17], x4
     ssubl v15.8h, v15.8b, v28.8b
-    ld1 {v19.8b}, [x18], x4
+    ld1 {v19.8b}, [x25], x4 // 8-byte load through x25 (was x18), then x25 += x4
     ssubl v16.8h, v16.8b, v28.8b
-    ld1 {v20.8b}, [x18], x4
+    ld1 {v20.8b}, [x25], x4 // loads are interleaved with the widening subtracts of v28
     ssubl v19.8h, v19.8b, v28.8b
-    ld1 {v21.8b}, [x18], x4
+    ld1 {v21.8b}, [x25], x4 // third initial load through x25
     ssubl v20.8h, v20.8b, v28.8b
     ssubl v21.8h, v21.8b, v28.8b
 
@@ -108,7 +109,7 @@ HEIGHT1_LOOP:
     ld1 {v17.8b}, [x17], x4
     ssubl v12.8h, v12.8b, v28.8b
     smlal v26.4s, v0.4h, v11.4h
-    ld1 {v22.8b}, [x18], x4
+    ld1 {v22.8b}, [x25], x4 // next 8 bytes through x25 (was x18), then x25 += x4
     ssubl v17.8h, v17.8b, v28.8b
     smlal2 v27.4s, v0.8h, v11.8h
     ld1 {v13.8b}, [x16], x4
@@ -117,7 +118,7 @@ HEIGHT1_LOOP:
     ld1 {v18.8b}, [x17], x4
     ssubl v13.8h, v13.8b, v28.8b
     smlal2 v25.4s, v1.8h, v10.8h
-    ld1 {v23.8b}, [x18], x4
+    ld1 {v23.8b}, [x25], x4 // next 8 bytes through x25 (was x18), then x25 += x4
     ssubl v18.8h, v18.8b, v28.8b
     smlal v26.4s, v1.4h, v12.4h
     mov v9.16b, v13.16b
@@ -157,12 +158,12 @@ HEIGHT1_LOOP:
     smlal2 v27.4s, v6.8h, v21.8h
     smlal v24.4s, v7.4h, v20.4h
     smlal2 v25.4s, v7.8h, v20.8h
-    ld1 {v20.8b}, [x18], x4
+    ld1 {v20.8b}, [x25], x4 // v20 is reloaded only after the smlal/smlal2 above have consumed it
     smlal v26.4s, v7.4h, v22.4h
     smlal2 v27.4s, v7.8h, v22.8h
     smlal v24.4s, v8.4h, v21.4h
     smlal2 v25.4s, v8.8h, v21.8h
-    ld1 {v21.8b}, [x18], x4
+    ld1 {v21.8b}, [x25], x4 // same for v21; both reloads go through x25 (was x18)
     ssubl v20.8h, v20.8b, v28.8b
     smlal v26.4s, v8.4h, v23.4h
     ssubl v21.8h, v21.8b, v28.8b
@@ -260,7 +261,7 @@ WIDTH2_LEFT:
     ld1 {v17.8b}, [x17], x4
     ssubl v12.8h, v12.8b, v28.8b
     smlal v26.4s, v0.4h, v11.4h
-    ld1 {v22.8b}, [x18], x4
+    ld1 {v22.8b}, [x25], x4 // next 8 bytes through x25 (was x18), then x25 += x4
     ssubl v17.8h, v17.8b, v28.8b
     smlal2 v27.4s, v0.8h, v11.8h
     ld1 {v13.8b}, [x16], x4
@@ -269,7 +270,7 @@ WIDTH2_LEFT:
     ld1 {v18.8b}, [x17], x4
     ssubl v13.8h, v13.8b, v28.8b
     smlal2 v25.4s, v1.8h, v10.8h
-    ld1 {v23.8b}, [x18], x4
+    ld1 {v23.8b}, [x25], x4 // next 8 bytes through x25 (was x18), then x25 += x4
     ssubl v18.8h, v18.8b, v28.8b
     smlal v26.4s, v1.4h, v12.4h
     ssubl v23.8h, v23.8b, v28.8b
@@ -452,11 +453,12 @@ OUTZP3:
     st1 {v24.8b}, [x0], x6
 
 End:
-    sub sp, sp, #176
+    sub sp, sp, #192 // step back to the base of the 192-byte save area written by the prologue
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
     ldp x21, x22, [sp], #16
     ldp x23, x24, [sp], #16
+    ldp x25, x26, [sp], #16 // restore x25/x26; the post-increments total 192, so sp ends at its entry value
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
index c43932f5ec..d4e6be641e 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
@@ -19,12 +19,13 @@ asm_function ConvDwFp32Center
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #176
+    sub sp, sp, #192 // save area: 2 x 64 bytes (v8-v15) + 4 x 16 bytes (x19-x26); grew by 16 for x25/x26
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
     stp x21, x22, [sp], #16
     stp x23, x24, [sp], #16
+    stp x25, x26, [sp], #16 // save x25/x26 (x25 replaces x18 as a loop counter); sp is back at its entry value afterwards
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
@@ -72,7 +73,7 @@ asm_function ConvDwFp32Center
             mov v14.16b, v24.16b
             mov v15.16b, v24.16b
             LoopKh16:
-                mov x18, x7
+                mov x25, x7 // x25 = LoopKw16 iteration counter, reloaded from x7 on each LoopKh16 pass (was x18)
                 mov x21, x16
                 LoopKw16:
                     mov x22, x21
@@ -109,7 +110,7 @@ asm_function ConvDwFp32Center
                     ld1 {v23.4s}, [x22], x11
                     fmla v14.4s, v22.4s, v25.4s
                     fmla v15.4s, v23.4s, v25.4s
-                    subs x25, x25, #1
+                    subs x25, x25, #1 // count down; the add below leaves the flags intact for bne
                     add x21, x21, x13
                     bne LoopKw16
                 add x16, x16, x12
@@ -192,7 +193,7 @@ asm_function ConvDwFp32Center
             mov v6.16b, v24.16b
             mov v7.16b, v24.16b
             LoopKh8:
-                mov x18, x7
+                mov x25, x7 // x25 = LoopKw8 iteration counter, reloaded from x7 on each LoopKh8 pass (was x18)
                 mov x21, x16
                 LoopKw8:
                     mov x22, x21
@@ -213,7 +214,7 @@ asm_function ConvDwFp32Center
                     ld1 {v23.4s}, [x22], x11
                     fmla v6.4s, v22.4s, v25.4s
                     fmla v7.4s, v23.4s, v25.4s
-                    subs x18, x18, #1
+                    subs x25, x25, #1 // count down; the add below leaves the flags intact for bne
                     add x21, x21, x13
                     bne LoopKw8
                 add x16, x16, x12
@@ -261,13 +262,13 @@ asm_function ConvDwFp32Center
             mov x20, x6
             mov v0.16b, v24.16b
             LoopKh:
-                mov x18, x7
+                mov x25, x7 // x25 = LoopKw iteration counter, reloaded from x7 on each LoopKh pass (was x18)
                 mov x22, x16
                 LoopKw:
                     ld1 {v16.4s}, [x22], x13
                     ld1 {v25.4s}, [x17], #16
                     fmla v0.4s, v16.4s, v25.4s
-                    subs x18, x18, #1
+                    subs x25, x25, #1 // count down and loop while non-zero
                     bne LoopKw
                 add x16, x16, x12
                 subs x20, x20, #1
@@ -290,11 +291,12 @@ asm_function ConvDwFp32Center
         subs x4, x4, #1
         bne LoopH
 
-    sub sp, sp, #176
+    sub sp, sp, #192 // step back to the base of the 192-byte save area written by the prologue
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
     ldp x21, x22, [sp], #16
     ldp x23, x24, [sp], #16
+    ldp x25, x26, [sp], #16 // restore x25/x26; the post-increments total 192, so sp ends at its entry value
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
index a60a27fe05..246d8bfab4 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
@@ -13,8 +13,9 @@
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
 
 asm_function ConvDwFp32Indirect3x3
-    sub sp, sp, #16
+    sub sp, sp, #32 // two 16-byte slots: x19/x20 and x21/x22
     stp x19, x20, [sp], #16
+    stp x21, x22, [sp], #16 // save x21/x22 (x21 replaces x18 in this function); sp is back at its entry value afterwards
 
     movi v31.4s, #6
     scvtf v31.4s, v31.4s
@@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3
         ldp x12, x13, [x1]
         ldp x14, x15, [x1, #16]
         ldp x16, x17, [x1, #32]
-        ldp x18, x19, [x1, #48]
+        ldp x21, x19, [x1, #48] // entries at x1+48 and x1+56; x21 takes the slot x18 used to hold
         ldr x20, [x1, #64]
         mov x9, x2
         mov x10, x3
@@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3
             ld1 {v5.4s}, [x17], #16
             ld1 {v22.4s}, [x9], #16
             fmla v29.4s, v3.4s, v20.4s
-            ld1 {v6.4s}, [x18], #16
+            ld1 {v6.4s}, [x21], #16 // 16-byte load through x21 (was x18), then x21 += 16
             ld1 {v23.4s}, [x9], #16
             fmla v29.4s, v4.4s, v21.4s
             ld1 {v7.4s}, [x19], #16
@@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3
             ld1 {v5.4s}, [x17], #16
             ld1 {v22.4s}, [x9], #16
             fmla v29.4s, v3.4s, v20.4s
-            ld1 {v6.4s}, [x18], #16
+            ld1 {v6.4s}, [x21], #16 // 16-byte load through x21 (was x18), then x21 += 16
             ld1 {v23.4s}, [x9], #16
             fmla v29.4s, v4.4s, v21.4s
             ld1 {v7.4s}, [x19], #16
@@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3
         cmp x5, #0
         bgt LoopPixel
 End:
-    sub sp, sp, #16
+    sub sp, sp, #32 // step back to the base of the 32-byte save area written by the prologue
     ldp x19, x20, [sp], #16
+    ldp x21, x22, [sp], #16 // restore x21/x22; sp ends at its entry value
 ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
index 5e1045aa72..6ff7307f78 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
@@ -13,17 +13,18 @@
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
 
 asm_function ConvDwFp32Indirect5x5
-    sub sp, sp, #160
+    sub sp, sp, #176 // frame grew by 16 bytes: [sp, #160] now holds the spilled x0/x1
     stp x19, x20, [sp, #64]
     stp x21, x22, [sp, #80]
     stp x23, x24, [sp, #96]
     stp x25, x26, [sp, #112]
     stp x27, x28, [sp, #128]
     stp x29, x30, [sp, #144]
-    ldrb w8, [sp, #160]
+    ldrb w8, [sp, #176] // byte just above the 176-byte frame (x8 is relu6 per the header comment); offset follows the new frame size
     stp x2, x3, [sp]
     stp x4, x6, [sp, #16]
     stp x7, x8, [sp, #32]
+    stp x0, x1, [sp, #160] // spill x0 (output per the header comment) and x1; x0 is reused as a load pointer in place of x18
 
     movi v31.4s, #6
     scvtf v31.4s, v31.4s
@@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5
         ldp x12, x13, [x1, #48]
         ldp x14, x15, [x1, #64]
         ldp x16, x17, [x1, #80]
-        ldp x18, x19, [x1, #96]
+        ldp x0, x19, [x1, #96] // x0 takes the slot x18 used to hold; its entry value was spilled to [sp, #160]
         ldp x20, x21, [x1, #112]
         ldp x22, x23, [x1, #128]
         ldp x24, x25, [x1, #144]
@@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5
             ld1 {v1.4s}, [x17], #16
             ld1 {v19.4s}, [x5], #16
             fmla v29.4s, v7.4s, v25.4s
-            ld1 {v2.4s}, [x18], #16
+            ld1 {v2.4s}, [x0], #16 // 16-byte load through x0 (was x18), then x0 += 16
             ld1 {v20.4s}, [x5], #16
             fmla v29.4s, v16.4s, v26.4s
             ld1 {v3.4s}, [x19], #16
@@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5
             RELU:
                 fmax v29.4s, v29.4s, v30.4s
             WRITE:
-                st1 {v29.4s}, [x0], #16
+                ldr x4, [sp, #160] // fetch the spilled output pointer (x0 now serves as a load pointer)
+                st1 {v29.4s}, [x4], #16 // store 4 lanes and advance the pointer by 16
+                str x4, [sp, #160] // write the advanced pointer back; x4 is reloaded from [sp, #56] right after
 
             ldr x4, [sp, #56]
             ld1 {v29.4s}, [x4], #16
@@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5
             ld1 {v1.4s}, [x17], #16
             ld1 {v19.4s}, [x5], #16
             fmla v29.4s, v7.4s, v25.4s
-            ld1 {v2.4s}, [x18], #16
+            ld1 {v2.4s}, [x0], #16 // 16-byte load through x0 (was x18), then x0 += 16
             ld1 {v20.4s}, [x5], #16
             fmla v29.4s, v16.4s, v26.4s
             ld1 {v3.4s}, [x19], #16
@@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5
             LeftWrite:
                 cmp x2, #4
                 bne Write3
-                st1 {v29.4s}, [x0], #16
+                ldr x4, [sp, #160] // fetch the spilled output pointer
+                st1 {v29.4s}, [x4], #16 // store all 4 lanes, pointer += 16
+                str x4, [sp, #160] // keep the advanced pointer in its stack slot
                 b NextPixel
             Write3:
                 sxtw x2, w2
                 tbnz w2, #1, Write2
                 tbnz w2, #0, Write1
             Write2:
-                st1 {v29.2s}, [x0], #8
+                ldr x4, [sp, #160] // fetch the spilled output pointer
+                st1 {v29.2s}, [x4], #8 // store the low 2 lanes, pointer += 8
+                str x4, [sp, #160] // save it so Write1 (if taken) continues from here
                 ext v29.16b, v29.16b, v29.16b, #8
                 tbz w2, #0, NextPixel
             Write1:
-                str s29, [x0], #4
+                ldr x4, [sp, #160] // fetch the spilled output pointer
+                str s29, [x4], #4 // store one lane, pointer += 4
+                str x4, [sp, #160] // keep the advanced pointer in its stack slot
 
     NextPixel:
         ldr x2, [sp, #24]
@@ -279,6 +288,6 @@ End:
     ldp x25, x26, [sp, #112]
     ldp x27, x28, [sp, #128]
     ldp x29, x30, [sp, #144]
-    add sp, sp, #160
+    add sp, sp, #176 // release the 176-byte frame allocated in the prologue
 ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
index 03fd8afe0c..017732e7ca 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -22,12 +22,13 @@ asm_function ConvDwInt8Center
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #176
+    sub sp, sp, #192 // save area: 2 x 64 bytes (v8-v15) + 4 x 16 bytes (x19-x26); grew by 16 for x25/x26
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
     stp x21, x22, [sp], #16
     stp x23, x24, [sp], #16
+    stp x25, x26, [sp], #16 // save x25/x26 (x25 replaces x18 in this function); sp is back at its entry value afterwards
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
@@ -51,9 +52,9 @@ asm_function ConvDwInt8Center
     ld1 {v24.4s}, [x17], #16
     ld1 {v25.4s}, [x17], #16
 
-    ldr x18, [sp, #80] // right shift
-    ld1 {v26.4s}, [x18], #16
-    ld1 {v27.4s}, [x18], #16
+    ldr x25, [sp, #80] // right shift pointer, now held in x25 (was x18)
+    ld1 {v26.4s}, [x25], #16 // first 16 bytes of the right-shift data
+    ld1 {v27.4s}, [x25], #16 // next 16 bytes
 
     ldr x19, [sp, #88] // acc_min
     ld1 {v28.4s}, [x19], #16
@@ -90,7 +91,7 @@ asm_function ConvDwInt8Center
             mov v6.16b, v17.16b
             mov v7.16b, v18.16b
             LoopKh4:
-                mov x18, x7
+                mov x25, x7 // x25 = LoopKw4 iteration counter, reloaded from x7 on each LoopKh4 pass (was x18)
                 mov x21, x16
                 LoopKw4:
                     mov x22, x21
@@ -116,7 +117,7 @@ asm_function ConvDwInt8Center
                     smlal v6.4s, v8.4h, v16.4h
                     smlal2 v7.4s, v8.8h, v16.8h
 
-                    subs x18, x18, #1
+                    subs x25, x25, #1 // count down; the add below leaves the flags intact for bne
                     add x21, x21, x13
                     bne LoopKw4
                 add x16, x16, x12
@@ -194,15 +195,15 @@ asm_function ConvDwInt8Center
 
             mov x16, x3
             add x17, x16, x9
-            add x18, x17, x9
-            add x21, x18, x9
+            add x25, x17, x9 // x25 = x3 + 2 * x9: third store pointer (was x18)
+            add x21, x25, x9 // x21 = x3 + 3 * x9: fourth store pointer
 
             st1 {v0.s}[0], [x16], #4
             st1 {v1.s}[0], [x16], #4
             st1 {v2.s}[0], [x17], #4
             st1 {v3.s}[0], [x17], #4
-            st1 {v4.s}[0], [x18], #4
-            st1 {v5.s}[0], [x18], #4
+            st1 {v4.s}[0], [x25], #4 // lane 0 of v4 through x25, then x25 += 4
+            st1 {v5.s}[0], [x25], #4 // lane 0 of v5 right behind it
             st1 {v6.s}[0], [x21], #4
             st1 {v7.s}[0], [x21], #4
 
@@ -221,7 +222,7 @@ asm_function ConvDwInt8Center
             mov v0.16b, v17.16b
             mov v1.16b, v18.16b
             LoopKh:
-                mov x18, x7
+                mov x25, x7 // x25 = LoopKw iteration counter, reloaded from x7 on each LoopKh pass (was x18)
                 mov x22, x16
                 LoopKw:
                     ld1 {v15.8b}, [x22], x13
@@ -229,7 +230,7 @@ asm_function ConvDwInt8Center
                     ld1 {v16.8h}, [x17], #16
                     smlal v0.4s, v14.4h, v16.4h
                     smlal2 v1.4s, v14.8h, v16.8h
-                    subs x18, x18, #1
+                    subs x25, x25, #1 // count down and loop while non-zero
                     bne LoopKw
                 add x16, x16, x12
                 subs x20, x20, #1
@@ -271,11 +272,12 @@ asm_function ConvDwInt8Center
         subs x4, x4, #1
         bne LoopH
 
-    sub sp, sp, #176
+    sub sp, sp, #192 // step back to the base of the 192-byte save area written by the prologue
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
     ldp x21, x22, [sp], #16
     ldp x23, x24, [sp], #16
+    ldp x25, x26, [sp], #16 // restore x25/x26; the post-increments total 192, so sp ends at its entry value
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
index ff4ac86616..277f3ebd10 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
@@ -47,11 +47,11 @@ asm_function ConvSwFp32Center
 
     LoopH:
         mov x17, x1
-        mov x18, x5
+        mov x28, x5 // x28 = remaining count for this LoopH pass, seeded from x5 (was x18). NOTE(review): x28 is callee-saved under AAPCS64 - confirm the prologue (outside this hunk) saves it
         mov x3, x0
-        cmp x18, #8
+        cmp x28, #8 // fewer than 8 left: take the one-at-a-time LoopW path
         blt LoopW
-        cmp x18, #16
+        cmp x28, #16 // 8..15 left: LoopW8; otherwise fall through to LoopW16
         blt LoopW8
 
         LoopW16:
@@ -244,12 +244,12 @@ asm_function ConvSwFp32Center
             st1 {v14.4s}, [x3], x9
             st1 {v15.4s}, [x3], x9
             add x17, x17, x19
-            sub x18, x18, #16
-            cmp x18, #0
+            sub x28, x28, #16 // 16 handled in this pass (x28 replaces x18)
+            cmp x28, #0 // nothing left: leave the loops
             ble LoopWEnd
-            cmp x18, #8
+            cmp x28, #8 // fewer than 8 left: LoopW
             blt LoopW
-            cmp x18, #16
+            cmp x28, #16 // at least 16 left: another LoopW16 pass; else fall into LoopW8
             bge LoopW16
         LoopW8:
             mov x19, #8
@@ -369,10 +369,10 @@ asm_function ConvSwFp32Center
             st1 {v6.4s}, [x3], x9
             st1 {v7.4s}, [x3], x9
             add x17, x17, x19
-            sub x18, x18, #8
-            cmp x18, #0
+            sub x28, x28, #8 // 8 handled in this pass (x28 replaces x18)
+            cmp x28, #0 // nothing left: leave the loops
             ble LoopWEnd
-            cmp x18, #8
+            cmp x28, #8 // at least 8 left: another LoopW8 pass; else fall into LoopW
             bge LoopW8
         LoopW:
             mov x20, x17
@@ -427,7 +427,7 @@ asm_function ConvSwFp32Center
         Write:
             st1 {v0.4s}, [x3], x9
             add x17, x17, x12
-            subs x18, x18, #1
+            subs x28, x28, #1 // one handled; loop while x28 (was x18) is non-zero
             bne LoopW
     LoopWEnd:
         add x0, x0, x8
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
index 19601f5779..d4c49827d2 100644
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
@@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center
         mov x16, x1
         mov x17, x4
         LoopW:
-            mov x18, x15
+            mov x22, x15 // x22 = working copy of x15 for this LoopW pass (was x18). NOTE(review): x22 is callee-saved under AAPCS64 - confirm the prologue (outside this hunk) saves it
             mov x19, x2
             mov x20, x5
             ld1 {v1.4s}, [x16], x8
             LoopKh:
-                mov x21, x18
+                mov x21, x22 // x21 starts each LoopKh pass at the current x22
                 mov x13, x6
                 LoopKw:
                     ld1 {v0.4s}, [x21]
@@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center
                     st1 {v0.4s}, [x21], x12
                     subs x13, x13, #1
                     bne LoopKw
-                add x18, x18, x11
+                add x22, x22, x11 // step x22 (was x18) by x11 once per LoopKh iteration
                 subs x20, x20, #1
                 bne LoopKh
             add x15, x15, x10
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
index 5c7024ea94..47aaeb121e 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
@@ -21,30 +21,31 @@
 // w13: c8_nhwc_c4
 
 asm_function MatmulFloatNeon64
-  sub sp, sp, #128
+  sub sp, sp, #144 // 128 bytes for v8-v15 plus 16 for x19/x20. NOTE(review): the epilogue (outside this view) must use the same 144 and restore x19/x20 - confirm
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  stp x19, x20, [sp], #16 // x19 is callee-saved and now used in place of x18; sp is back at its entry value after this store
 
   ldr x9, [sp, #8]
   ldr x14, [sp, #16]
 
-  mov w18, #32 // sizeof(float) * 8
-  mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
-  mov x18, #4
+  mov w19, #32 // sizeof(float) * 8
+  mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
+  mov x19, #4 // x19 = 4, the multiplier for the scaling below
   ldr x17, [sp]
   cbz x14, NoWinoSteps
   mul x8, x7, x17
   mov x11, #8
   mul x11, x11, x17
-  mul x8, x8, x18
-  mul x11, x11, x18
+  mul x8, x8, x19 // x8 = x7 * x17 * 4
+  mul x11, x11, x19 // x11 = 8 * x17 * 4
 NoWinoSteps:
-  mul x17, x17, x18
+  mul x17, x17, x19 // x17 *= 4
 
 L1:
   mov w10, w6 // reload lhs row
   mov x12, x0 // reload lhs ptr
-  mov x18, x2 // reload dst ptr
+  mov x19, x2 // reload dst ptr (held in x19 instead of x18)
 
 L2:
   mov x16, x1 // reload rhs ptr
@@ -254,435 +255,435 @@ Write:
   b Write8
 
 Write1:
-  str s8, [x18]
+  str s8, [x19] // x19 = dst pointer (replaces x18); one 32-bit lane stored per step
   cmp w10, #1
   beq WriteEnd
-  add x18, x18, x17
-  str s10, [x18]
+  add x19, x19, x17 // advance dst by x17 before the next store
+  str s10, [x19]
   cmp w10, #2
   beq WriteEnd
-  add x18, x18, x17
-  str s12, [x18]
+  add x19, x19, x17
+  str s12, [x19]
   cmp w10, #3
   beq WriteEnd
-  add x18, x18, x17
-  str s14, [x18]
+  add x19, x19, x17
+  str s14, [x19]
   cmp w10, #4
   beq WriteEnd
-  add x18, x18, x17
-  str s16, [x18]
+  add x19, x19, x17
+  str s16, [x19]
   cmp w10, #5
   beq WriteEnd
-  add x18, x18, x17
-  str s18, [x18]
+  add x19, x19, x17
+  str s18, [x19]
   cmp w10, #6
   beq WriteEnd
-  add x18, x18, x17
-  str s20, [x18]
+  add x19, x19, x17
+  str s20, [x19]
   cmp w10, #7
   beq WriteEnd
-  add x18, x18, x17
-  str s22, [x18]
+  add x19, x19, x17
+  str s22, [x19]
   cmp w10, #8
   beq WriteEnd
-  add x18, x18, x17
-  str s24, [x18]
+  add x19, x19, x17
+  str s24, [x19]
   cmp w10, #9
   beq WriteEnd
-  add x18, x18, x17
-  str s26, [x18]
+  add x19, x19, x17
+  str s26, [x19]
   cmp w10, #10
   beq WriteEnd
-  add x18, x18, x17
-  str s28, [x18]
+  add x19, x19, x17
+  str s28, [x19]
   cmp w10, #11
   beq WriteEnd
-  add x18, x18, x17
-  str s30, [x18]
-  add x18, x18, x17
+  add x19, x19, x17
+  str s30, [x19]
+  add x19, x19, x17 // leave x19 one step past the last store
   b WriteEnd
 Write2:
   dup s9, v8.s[1]
-  stp s8, s9, [x18]
+  stp s8, s9, [x19] // x19 = dst pointer (replaces x18); two 32-bit lanes stored per step
   cmp w10, #1
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17 // advance dst by x17 before the next pair
   dup s11, v10.s[1]
-  stp s10, s11, [x18]
+  stp s10, s11, [x19]
   cmp w10, #2
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s13, v12.s[1]
-  stp s12, s13, [x18]
+  stp s12, s13, [x19]
   cmp w10, #3
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s15, v14.s[1]
-  stp s14, s15, [x18]
+  stp s14, s15, [x19]
   cmp w10, #4
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s17, v16.s[1]
-  stp s16, s17, [x18]
+  stp s16, s17, [x19]
   cmp w10, #5
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s19, v18.s[1]
-  stp s18, s19, [x18]
+  stp s18, s19, [x19]
   cmp w10, #6
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s21, v20.s[1]
-  stp s20, s21, [x18]
+  stp s20, s21, [x19]
   cmp w10, #7
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s23, v22.s[1]
-  stp s22, s23, [x18]
+  stp s22, s23, [x19]
   cmp w10, #8
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s25, v24.s[1]
-  stp s24, s25, [x18]
+  stp s24, s25, [x19]
   cmp w10, #9
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s27, v26.s[1]
-  stp s26, s27, [x18]
+  stp s26, s27, [x19]
   cmp w10, #10
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s29, v28.s[1]
-  stp s28, s29, [x18]
+  stp s28, s29, [x19]
   cmp w10, #11
   beq WriteEnd
-  add x18, x18, x17
+  add x19, x19, x17
   dup s31, v30.s[1]
-  stp s30, s31, [x18]
-  add x18, x18, x17
+  stp s30, s31, [x19]
+  add x19, x19, x17 // leave x19 one step past the last store
   b WriteEnd
 Write3:
-  add x13, x18, #8
+  add x13, x19, #8 // x13 = dst + 8: address for the third lane (x19 replaces x18 as dst)
   dup s9, v8.s[1]
-  stp s8, s9, [x18]
-  add x18, x18, x17
+  stp s8, s9, [x19] // lanes 0 and 1 go through x19
+  add x19, x19, x17 // advance dst by x17; x13 advances via the post-indexed st1 below
   st1 {v8.s}[2], [x13], x17
   cmp w10, #1
   beq WriteEnd
   dup s11, v10.s[1]
-  stp s10, s11, [x18]
-  add x18, x18, x17
+  stp s10, s11, [x19]
+  add x19, x19, x17
   st1 {v10.s}[2], [x13], x17
   cmp w10, #2
   beq WriteEnd
   dup s13, v12.s[1]
-  stp s12, s13, [x18]
-  add x18, x18, x17
+  stp s12, s13, [x19]
+  add x19, x19, x17
   st1 {v12.s}[2], [x13], x17
   cmp w10, #3
   beq WriteEnd
   dup s15, v14.s[1]
-  stp s14, s15, [x18]
-  add x18, x18, x17
+  stp s14, s15, [x19]
+  add x19, x19, x17
   st1 {v14.s}[2], [x13], x17
   cmp w10, #4
   beq WriteEnd
   dup s17, v16.s[1]
-  stp s16, s17, [x18]
-  add x18, x18, x17
+  stp s16, s17, [x19]
+  add x19, x19, x17
   st1 {v16.s}[2], [x13], x17
   cmp w10, #5
   beq WriteEnd
   dup s19, v18.s[1]
-  stp s18, s19, [x18]
-  add x18, x18, x17
+  stp s18, s19, [x19]
+  add x19, x19, x17
   st1 {v18.s}[2], [x13], x17
   cmp w10, #6
   beq WriteEnd
   dup s21, v20.s[1]
-  stp s20, s21, [x18]
-  add x18, x18, x17
+  stp s20, s21, [x19]
+  add x19, x19, x17
   st1 {v20.s}[2], [x13], x17
   cmp w10, #7
   beq WriteEnd
   dup s23, v22.s[1]
-  stp s22, s23, [x18]
-  add x18, x18, x17
+  stp s22, s23, [x19]
+  add x19, x19, x17
   st1 {v22.s}[2], [x13], x17
   cmp w10, #8
   beq WriteEnd
   dup s25, v24.s[1]
-  stp s24, s25, [x18]
-  add x18, x18, x17
+  stp s24, s25, [x19]
+  add x19, x19, x17
   st1 {v24.s}[2], [x13], x17
   cmp w10, #9
   beq WriteEnd
   dup s27, v26.s[1]
-  stp s26, s27, [x18]
-  add x18, x18, x17
+  stp s26, s27, [x19]
+  add x19, x19, x17
   st1 {v26.s}[2], [x13], x17
   cmp w10, #10
   beq WriteEnd
   dup s29, v28.s[1]
-  stp s28, s29, [x18]
-  add x18, x18, x17
+  stp s28, s29, [x19]
+  add x19, x19, x17
   st1 {v28.s}[2], [x13], x17
   cmp w10, #11
   beq WriteEnd
   dup s31, v30.s[1]
-  stp s30, s31, [x18]
-  add x18, x18, x17
+  stp s30, s31, [x19]
+  add x19, x19, x17 // x19 still advances on the last step even though x13 does not
   st1 {v30.s}[2], [x13]
   b WriteEnd
 Write4:
-  st1 {v8.4s}, [x18], x17
+  st1 {v8.4s}, [x19], x17 // x19 = dst pointer (replaces x18); store 4 lanes, then x19 += x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v10.4s}, [x18], x17
+  st1 {v10.4s}, [x19], x17 // same store-and-advance for each following even-numbered register
   cmp w10, #2
   beq WriteEnd
-  st1 {v12.4s}, [x18], x17
+  st1 {v12.4s}, [x19], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v14.4s}, [x18], x17
+  st1 {v14.4s}, [x19], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v16.4s}, [x18], x17
+  st1 {v16.4s}, [x19], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v18.4s}, [x18], x17
+  st1 {v18.4s}, [x19], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v20.4s}, [x18], x17
+  st1 {v20.4s}, [x19], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v22.4s}, [x18], x17
+  st1 {v22.4s}, [x19], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4s}, [x18], x17
+  st1 {v24.4s}, [x19], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v26.4s}, [x18], x17
+  st1 {v26.4s}, [x19], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v28.4s}, [x18], x17
+  st1 {v28.4s}, [x19], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v30.4s}, [x18], x17
+  st1 {v30.4s}, [x19], x17
   b WriteEnd
 Write5:
-  add x13, x18, #16
-  st1 {v8.4s}, [x18], x17
+  add x13, x19, #16 // x13 = dst + 16: address for the fifth lane (x19 replaces x18 as dst)
+  st1 {v8.4s}, [x19], x17 // first 4 lanes through x19, then x19 += x17
   str s9, [x13]
   cmp w10, #1
   beq WriteEnd
   add x13, x13, x17
-  st1 {v10.4s}, [x18], x17
+  st1 {v10.4s}, [x19], x17
   str s11, [x13]
   cmp w10, #2
   beq WriteEnd
   add x13, x13, x17
-  st1 {v12.4s}, [x18], x17
+  st1 {v12.4s}, [x19], x17
   str s13, [x13]
   cmp w10, #3
   beq WriteEnd
   add x13, x13, x17
-  st1 {v14.4s}, [x18], x17
+  st1 {v14.4s}, [x19], x17
   str s15, [x13]
   cmp w10, #4
   beq WriteEnd
   add x13, x13, x17
-  st1 {v16.4s}, [x18], x17
+  st1 {v16.4s}, [x19], x17
   str s17, [x13]
   cmp w10, #5
   beq WriteEnd
   add x13, x13, x17
-  st1 {v18.4s}, [x18], x17
+  st1 {v18.4s}, [x19], x17
   str s19, [x13]
   cmp w10, #6
   beq WriteEnd
   add x13, x13, x17
-  st1 {v20.4s}, [x18], x17
+  st1 {v20.4s}, [x19], x17
   str s21, [x13]
   cmp w10, #7
   beq WriteEnd
   add x13, x13, x17
-  st1 {v22.4s}, [x18], x17
+  st1 {v22.4s}, [x19], x17
   str s23, [x13]
   cmp w10, #8
   beq WriteEnd
   add x13, x13, x17
-  st1 {v24.4s}, [x18], x17
+  st1 {v24.4s}, [x19], x17
   str s25, [x13]
   cmp w10, #9
   beq WriteEnd
   add x13, x13, x17
-  st1 {v26.4s}, [x18], x17
+  st1 {v26.4s}, [x19], x17
   str s27, [x13]
   cmp w10, #10
   beq WriteEnd
   add x13, x13, x17
-  st1 {v28.4s}, [x18], x17
+  st1 {v28.4s}, [x19], x17
   str s29, [x13]
   cmp w10, #11
   beq WriteEnd
   add x13, x13, x17
-  st1 {v30.4s}, [x18], x17
+  st1 {v30.4s}, [x19], x17
   str s31, [x13]
   b WriteEnd
 Write6:
-  add x13, x18, #16
-  st1 {v8.4s}, [x18], x17
+  add x13, x19, #16 // x13 = dst + 16: address for lanes 5 and 6 (x19 replaces x18 as dst)
+  st1 {v8.4s}, [x19], x17 // first 4 lanes through x19, then x19 += x17; v8 is free to be overwritten by the dup below
   dup s8, v9.s[1]
   stp s9, s8, [x13]
   cmp w10, #1
   beq WriteEnd
   add x13, x13, x17
-  st1 {v10.4s}, [x18], x17
+  st1 {v10.4s}, [x19], x17
   dup s10, v11.s[1]
   stp s11, s10, [x13]
   cmp w10, #2
   beq WriteEnd
   add x13, x13, x17
-  st1 {v12.4s}, [x18], x17
+  st1 {v12.4s}, [x19], x17
   dup s12, v13.s[1]
   stp s13, s12, [x13]
   cmp w10, #3
   beq WriteEnd
   add x13, x13, x17
-  st1 {v14.4s}, [x18], x17
+  st1 {v14.4s}, [x19], x17
   dup s14, v15.s[1]
   stp s15, s14, [x13]
   cmp w10, #4
   beq WriteEnd
   add x13, x13, x17
-  st1 {v16.4s}, [x18], x17
+  st1 {v16.4s}, [x19], x17
   dup s16, v17.s[1]
   stp s17, s16, [x13]
   cmp w10, #5
   beq WriteEnd
   add x13, x13, x17
-  st1 {v18.4s}, [x18], x17
+  st1 {v18.4s}, [x19], x17
   dup s18, v19.s[1]
   stp s19, s18, [x13]
   cmp w10, #6
   beq WriteEnd
   add x13, x13, x17
-  st1 {v20.4s}, [x18], x17
+  st1 {v20.4s}, [x19], x17
   dup s20, v21.s[1]
   stp s21, s20, [x13]
   cmp w10, #7
   beq WriteEnd
   add x13, x13, x17
-  st1 {v22.4s}, [x18], x17
+  st1 {v22.4s}, [x19], x17
   dup s22, v23.s[1]
   stp s23, s22, [x13]
   cmp w10, #8
   beq WriteEnd
   add x13, x13, x17
-  st1 {v24.4s}, [x18], x17
+  st1 {v24.4s}, [x19], x17
   dup s24, v25.s[1]
   stp s25, s24, [x13]
   cmp w10, #9
   beq WriteEnd
   add x13, x13, x17
-  st1 {v26.4s}, [x18], x17
+  st1 {v26.4s}, [x19], x17
   dup s26, v27.s[1]
   stp s27, s26, [x13]
   cmp w10, #10
   beq WriteEnd
   add x13, x13, x17
-  st1 {v28.4s}, [x18], x17
+  st1 {v28.4s}, [x19], x17
   dup s28, v29.s[1]
   stp s29, s28, [x13]
   cmp w10, #11
   beq WriteEnd
   add x13, x13, x17
-  st1 {v30.4s}, [x18], x17
+  st1 {v30.4s}, [x19], x17
   dup s30, v31.s[1]
   stp s31, s30, [x13]
   b WriteEnd
 Write7:
-  add x13, x18, #16
-  add x16, x18, #24
-  st1 {v8.4s}, [x18], x17
+  add x13, x19, #16 // x13 = dst + 16: address for lanes 5 and 6 (x19 replaces x18 as dst)
+  add x16, x19, #24 // x16 = dst + 24: address for lane 7
+  st1 {v8.4s}, [x19], x17 // first 4 lanes through x19, then x19 += x17
   dup s8, v9.s[1]
   stp s9, s8, [x13]
   add x13, x13, x17
   st1 {v9.s}[2], [x16], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v10.4s}, [x18], x17
+  st1 {v10.4s}, [x19], x17
   dup s10, v11.s[1]
   stp s11, s10, [x13]
   add x13, x13, x17
   st1 {v11.s}[2], [x16], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v12.4s}, [x18], x17
+  st1 {v12.4s}, [x19], x17
   dup s12, v13.s[1]
   stp s13, s12, [x13]
   add x13, x13, x17
   st1 {v13.s}[2], [x16], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v14.4s}, [x18], x17
+  st1 {v14.4s}, [x19], x17
   dup s14, v15.s[1]
   stp s15, s14, [x13]
   add x13, x13, x17
   st1 {v15.s}[2], [x16], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v16.4s}, [x18], x17
+  st1 {v16.4s}, [x19], x17
   dup s16, v17.s[1]
   stp s17, s16, [x13]
   add x13, x13, x17
   st1 {v17.s}[2], [x16], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v18.4s}, [x18], x17
+  st1 {v18.4s}, [x19], x17
   dup s18, v19.s[1]
   stp s19, s18, [x13]
   add x13, x13, x17
   st1 {v19.s}[2], [x16], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v20.4s}, [x18], x17
+  st1 {v20.4s}, [x19], x17
   dup s20, v21.s[1]
   stp s21, s20, [x13]
   add x13, x13, x17
   st1 {v21.s}[2], [x16], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v22.4s}, [x18], x17
+  st1 {v22.4s}, [x19], x17
   dup s22, v23.s[1]
   stp s23, s22, [x13]
   add x13, x13, x17
   st1 {v23.s}[2], [x16], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4s}, [x18], x17
+  st1 {v24.4s}, [x19], x17
   dup s24, v25.s[1]
   stp s25, s24, [x13]
   add x13, x13, x17
   st1 {v25.s}[2], [x16], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v26.4s}, [x18], x17
+  st1 {v26.4s}, [x19], x17
   dup s26, v27.s[1]
   stp s27, s26, [x13]
   add x13, x13, x17
   st1 {v27.s}[2], [x16], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v28.4s}, [x18], x17
+  st1 {v28.4s}, [x19], x17
   dup s28, v29.s[1]
   stp s29, s28, [x13]
   add x13, x13, x17
   st1 {v29.s}[2], [x16], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v30.4s}, [x18], x17
+  st1 {v30.4s}, [x19], x17
   dup s30, v31.s[1]
   stp s31, s30, [x13]
   add x13, x13, x17
@@ -697,54 +698,54 @@ WriteC8:
   st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
   b WriteEnd
 WriteWino:
-  st1 {v8.4s, v9.4s}, [x18], x8
-  st1 {v10.4s, v11.4s}, [x18], x8
-  st1 {v12.4s, v13.4s}, [x18], x8
-  st1 {v14.4s, v15.4s}, [x18], x8
-  st1 {v16.4s, v17.4s}, [x18], x8
-  st1 {v18.4s, v19.4s}, [x18], x8
-  st1 {v20.4s, v21.4s}, [x18], x8
-  st1 {v22.4s, v23.4s}, [x18], x8
-  st1 {v24.4s, v25.4s}, [x18], x8
-  st1 {v26.4s, v27.4s}, [x18], x8
-  st1 {v28.4s, v29.4s}, [x18], x8
-  st1 {v30.4s, v31.4s}, [x18], x8
+  st1 {v8.4s, v9.4s}, [x19], x8
+  st1 {v10.4s, v11.4s}, [x19], x8
+  st1 {v12.4s, v13.4s}, [x19], x8
+  st1 {v14.4s, v15.4s}, [x19], x8
+  st1 {v16.4s, v17.4s}, [x19], x8
+  st1 {v18.4s, v19.4s}, [x19], x8
+  st1 {v20.4s, v21.4s}, [x19], x8
+  st1 {v22.4s, v23.4s}, [x19], x8
+  st1 {v24.4s, v25.4s}, [x19], x8
+  st1 {v26.4s, v27.4s}, [x19], x8
+  st1 {v28.4s, v29.4s}, [x19], x8
+  st1 {v30.4s, v31.4s}, [x19], x8
   b WriteEnd
 Write8:
-  st1 {v8.4s, v9.4s}, [x18], x17
+  st1 {v8.4s, v9.4s}, [x19], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v10.4s, v11.4s}, [x18], x17
+  st1 {v10.4s, v11.4s}, [x19], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v12.4s, v13.4s}, [x18], x17
+  st1 {v12.4s, v13.4s}, [x19], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v14.4s, v15.4s}, [x18], x17
+  st1 {v14.4s, v15.4s}, [x19], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v16.4s, v17.4s}, [x18], x17
+  st1 {v16.4s, v17.4s}, [x19], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v18.4s, v19.4s}, [x18], x17
+  st1 {v18.4s, v19.4s}, [x19], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v20.4s, v21.4s}, [x18], x17
+  st1 {v20.4s, v21.4s}, [x19], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v22.4s, v23.4s}, [x18], x17
+  st1 {v22.4s, v23.4s}, [x19], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4s, v25.4s}, [x18], x17
+  st1 {v24.4s, v25.4s}, [x19], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v26.4s, v27.4s}, [x18], x17
+  st1 {v26.4s, v27.4s}, [x19], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v28.4s, v29.4s}, [x18], x17
+  st1 {v28.4s, v29.4s}, [x19], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v30.4s, v31.4s}, [x18], x17
+  st1 {v30.4s, v31.4s}, [x19], x17
 
 WriteEnd:
   subs w10, w10, #12 // lhs row - 12
@@ -766,8 +767,9 @@ NoDstStep:
   bgt L1
 
 End1:
-  sub sp, sp, #128
+  sub sp, sp, #144  // rewind to the start of the save area: 64 + 64 bytes of v8-v15 plus 16 bytes for x19/x20
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  ldp x19, x20, [sp], #16  // restore x19/x20 (x19 is the store pointer used by the Write* paths); the three post-increments total 144, undoing the sub above
   ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
index e495feec78..07a87a8e81 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -21,31 +21,32 @@
 // x9: writeMode
 
 asm_function MatmulFloatNeon64Opt
-    sub sp, sp, #144
+    sub sp, sp, #160  // save area: 64 + 64 bytes for v8-v15, 16 for x19/x20, 16 for x21/x22
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
+    stp x21, x22, [sp], #16  // x21 takes over x18's role as the scratch multiplier below, so the pair is saved; sp is back at its entry value after this post-increment
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
 
-    mov x18, #48 // sizeof(float) * 12
-    mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    mov x21, #48 // sizeof(float) * 12
+    mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
     cbnz x9, NoC8Steps
     mov x11, x2
-    mov x18, #32
-    mul x16, x6, x18 // row * 8 * sizeof(float)
+    mov x21, #32
+    mul x16, x6, x21 // row * 8 * sizeof(float)
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
-    mov x18, #4
+    mov x21, #4  // sizeof(float), used to scale x15 below
     mul x15, x7, x8
-    mul x15, x15, x18 // kernel_size * col *sizeof(float)
-    mov x18, #32
-    mul x16, x8, x18 // kernel_size * 8 * sizeof(float)
+    mul x15, x15, x21 // kernel_size * col *sizeof(float)
+    mov x21, #32
+    mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
 NoWinoSteps:
-    mov x18, #4
-    mul x8, x8, x18
+    mov x21, #4  // sizeof(float)
+    mul x8, x8, x21  // scale x8 (loaded from [sp] above) by 4 to get a byte count
 
 LoopRowStart:
     cmp x6, #4
@@ -1117,9 +1118,9 @@ LoopRow4:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
-        mov x18, #4
-        mul x18, x18, x7
-        sub x11, x11, x18
+        mov x21, #4
+        mul x21, x21, x7
+        sub x11, x11, x21
         mov x2, x11
         b NoDstStep
     C8DstStep:
@@ -1129,9 +1130,10 @@ LoopColEnd:
         subs x6, x6, #12
         bgt LoopRowStart
 
-  sub sp, sp, #144
+  sub sp, sp, #160  // rewind to the start of the save area: 64 + 64 (v8-v15) + 16 (x19/x20) + 16 (x21/x22)
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   ldp x19, x20, [sp], #16
+  ldp x21, x22, [sp], #16  // restore the pair added for the x18 -> x21 rename; sp returns to its entry value
   ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
index 883d07fb09..600f122e16 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
@@ -67,7 +67,7 @@ L2:
   cmp w16, #0
   beq End2
 
-  mov x18, x1     // reload b ptr
+  mov x28, x1     // reload b ptr (x28 replaces x18). NOTE(review): x28 is callee-saved and no hunk of this diff touches this file's prologue - confirm x28 is saved and restored
   mov x19, x7     // reload bias ptr
   mov w20, w5     // reload depth
   dup v16.4s, wzr
@@ -94,10 +94,10 @@ L3:
   ld1 {v1.16b}, [x17], #16
   ld1 {v2.16b}, [x17], #16
   ld1 {v3.16b}, [x17], #16
-  ld1 {v4.16b}, [x18], #16
-  ld1 {v5.16b}, [x18], #16
-  ld1 {v6.16b}, [x18], #16
-  ld1 {v7.16b}, [x18], #16
+  ld1 {v4.16b}, [x28], #16
+  ld1 {v5.16b}, [x28], #16
+  ld1 {v6.16b}, [x28], #16
+  ld1 {v7.16b}, [x28], #16
 
   smull v8.8h, v4.8b, v0.8b
   smull v9.8h, v5.8b, v0.8b
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
index c08607df9e..fd31cc0f9c 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
@@ -30,7 +30,7 @@
 // x28: filter_zp
 
 asm_function MatmulInt8Opt
-    sub sp, sp, #208
+    sub sp, sp, #224
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
@@ -38,6 +38,7 @@ asm_function MatmulInt8Opt
     stp x23, x24, [sp], #16
     stp x25, x26, [sp], #16
     stp x27, x28, [sp], #16
+    stp x29, x30, [sp], #16
 
     ldr w8, [sp]
     ldr w9, [sp, #8]
@@ -55,7 +56,7 @@ asm_function MatmulInt8Opt
 LoopRow:
     mov x16, x1 // reload rhs ptr
     mov x17, x4 // reload rhs col
-    mov x18, x7 // reload bias ptr
+    mov x29, x7 // reload bias ptr. NOTE(review): x29 is the AAPCS64 frame pointer; it is saved/restored by this patch, but confirm that using it as a scratch pointer is acceptable on every target platform (unwinders may expect a valid frame record)
     mov x27, x2 // reload dst ptr
     ldr x28, [sp, #64] // reload filter_zp
 
@@ -158,7 +159,7 @@ LoopRow:
 
         Bias:
             cbz x7, NoBias
-            ld1 {v15.4s}, [x18], #16
+            ld1 {v15.4s}, [x29], #16
             add v16.4s, v16.4s, v15.4s
             add v17.4s, v17.4s, v15.4s
             add v18.4s, v18.4s, v15.4s
@@ -330,7 +331,7 @@ LoopColEnd:
     b LoopRow
 
 LoopRowEnd:
-    sub sp, sp, #208
+    sub sp, sp, #224
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
@@ -338,5 +339,6 @@ LoopRowEnd:
     ldp x23, x24, [sp], #16
     ldp x25, x26, [sp], #16
     ldp x27, x28, [sp], #16
+    ldp x29, x30, [sp], #16
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
index 3f6cf4644b..98426e2120 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
@@ -20,9 +20,10 @@
 // x7: bias
 
 asm_function MatMulR4Int8Neon64
-  sub sp, sp, #128
+  sub sp, sp, #144
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  stp x19, x20, [sp], #16
 
   mov w15, #0       // b col index
   mov w16, #0       // a row index
@@ -40,7 +41,7 @@ L2:
   cmp w16, w3
   beq End2
 
-  mov x18, x1     // reload b ptr
+  mov x19, x1     // reload b ptr
   mov x10, x7    // reload bias ptr
   mov w11, w5     // reload depth
   dup v16.4s, wzr
@@ -67,10 +68,10 @@ L3:
   ld1 {v1.16b}, [x17], #16
   ld1 {v2.16b}, [x17], #16
   ld1 {v3.16b}, [x17], #16
-  ld1 {v4.16b}, [x18], #16
-  ld1 {v5.16b}, [x18], #16
-  ld1 {v6.16b}, [x18], #16
-  ld1 {v7.16b}, [x18], #16
+  ld1 {v4.16b}, [x19], #16
+  ld1 {v5.16b}, [x19], #16
+  ld1 {v6.16b}, [x19], #16
+  ld1 {v7.16b}, [x19], #16
 
   smull v8.8h, v4.8b, v0.8b
   smull v9.8h, v5.8b, v0.8b
@@ -172,8 +173,9 @@ End2:
   b L1
 
 End1:
-  sub sp, sp, #128
+  sub sp, sp, #144
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  ldp x19, x20, [sp], #16
   ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
index a378f1527e..182e7f85ab 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
@@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd
         mov x14, x1  // mat_b
         LoopN:
             mov x16, x0  // mat_a_m
-            sub x18, x5, x15   // ni
+            sub x22, x5, x15   // ni (x22 replaces x18 as the offset scratch). NOTE(review): x22 is callee-saved and this diff does not touch the prologue - confirm it is saved and restored
             sub x19, x17, x3   // mi
-            mul x18, x18, x17  // ni * m
+            mul x22, x22, x17  // ni * m
             mov x11, x6 // in_channel
-            add x18, x18, x19  // (ni * m) + mi
-            mul x18, x18, x7   // x18 * c4_channel
-            add x20, x2, x18   // dst + offset
+            add x22, x22, x19  // (ni * m) + mi
+            mul x22, x22, x7   // x22 * c4_channel
+            add x20, x2, x22   // dst + offset
             cmp x11, #16
             bge LoopC16
             cmp x11, #8
diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
index 374c5d60de..eb62903d91 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
@@ -1,6 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
-
     .text
     .align 5
     //.p2align 5,,15
diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
index 84a0ed9ab4..e469642058 100644
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
@@ -55,16 +55,16 @@ LoopH:
             ld1 {v0.s}[2], [x17], x10
             ld1 {v0.s}[3], [x17], x10
             mov x11, x6
-            mov x18, x17
-            add x18, x14, x7
-            add x16, x18, x7
+            mov x20, x17  // NOTE(review): dead store - x20 is overwritten by the next instruction before it is read. Also confirm x20 (callee-saved) is saved by the prologue, which this diff does not show
+            add x20, x14, x7  // x20 = x14 + x7; used as the load address for v21 in the loop below
+            add x16, x20, x7  // x16 = x20 + x7
             add x19, x16, x7
 
             LoopLength4:
                 ld1 {v16.4s}, [x2]
                 ld1 {v20.4s}, [x14], #16
                 fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x18], #16
+                ld1 {v21.4s}, [x20], #16  // load four floats via x20 (was x18), post-increment by 16 bytes
                 fmul v17.4s, v21.4s, v0.s[1]
                 ld1 {v20.4s}, [x16], #16
                 fmla v16.4s, v20.4s, v0.s[2]
@@ -90,14 +90,14 @@ LoopH:
             ld1 {v0.s}[1], [x17], x10
             ld1 {v0.s}[2], [x17], x10
             mov x11, x6
-            mov x18, x17
-            add x18, x14, x7
-            add x16, x18, x7
+            mov x20, x17  // NOTE(review): dead store - x20 is overwritten by the next instruction before it is read
+            add x20, x14, x7  // x20 = x14 + x7; used as the load address for v21 in the loop below
+            add x16, x20, x7  // x16 = x20 + x7
             LoopLength3:
                 ld1 {v16.4s}, [x2]
                 ld1 {v20.4s}, [x14], #16
                 fmla v16.4s, v20.4s, v0.s[0]
-                ld1 {v21.4s}, [x18], #16
+                ld1 {v21.4s}, [x20], #16  // load four floats via x20 (was x18), post-increment by 16 bytes
                 fmul v17.4s, v21.4s, v0.s[1]
                 ld1 {v20.4s}, [x16], #16
                 fmla v16.4s, v20.4s, v0.s[2]
diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
index 7b96ed500e..a413cf5c01 100644
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
@@ -18,6 +18,9 @@ asm_function WinogradTransRight
 //x5: k
 //x6: length
 
+sub sp, sp, #16  // step down 16 bytes to make room for one register pair
+stp x19, x20, [sp], #16  // save x19/x20 (x19 replaces x18 below); the post-increment returns sp to its entry value, so the pair sits just below sp. NOTE(review): confirm data below sp is safe on targets without a red zone
+
 mov x8, #16 // 4 * sizeof(float)
 mul x8, x6, x8
 mul x9, x5, x8 // step for S
@@ -43,7 +46,7 @@ LoopH:
             cmp x12, #4
             blt LoopKStart3
             mov x16, x15
-            mov x18, x4
+            mov x19, x4
             LoopK4:
                 ld1 {v0.s}[0], [x13], x10
                 ld1 {v0.s}[1], [x13], x10
@@ -54,7 +57,7 @@ LoopH:
 
                 add x14, x17, x8
                 add x16, x14, x8
-                add x18, x16, x8
+                add x19, x16, x8
 
                 LoopLength4:
                     ld1 {v16.4s}, [x2]
@@ -64,7 +67,7 @@ LoopH:
                     fmul v17.4s, v21.4s, v0.s[1]
                     ld1 {v20.4s}, [x16], #16
                     fmla v16.4s, v20.4s, v0.s[2]
-                    ld1 {v21.4s}, [x18], #16
+                    ld1 {v21.4s}, [x19], #16
                     fmla v17.4s, v21.4s, v0.s[3]
 
                     fadd v17.4s, v16.4s, v17.4s
@@ -73,7 +76,7 @@ LoopH:
                     bne LoopLength4
                 sub x2, x2, x8
                 sub x12, x12, #4
-                mov x17, x18
+                mov x17, x19
 
                 cmp x12, #4
                 bge LoopK4
@@ -107,7 +110,7 @@ LoopH:
                     bne LoopLength3
                 sub x2, x2, x8
                 sub x12, x12, #3
-                mov x17, x18
+                mov x17, x19
                 cmp x12, #3
                 bge LoopK3
 
@@ -141,5 +144,7 @@ LoopH:
     subs x4, x4, #1
     bne LoopH
 
+    sub sp, sp, #16  // step back down to the slot where the entry code stored x19/x20
+    ldp x19, x20, [sp], #16  // restore x19/x20; the post-increment returns sp to its entry value
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S b/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S
index 8ea32fdf1a..a55642d6c7 100644
--- a/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S
+++ b/mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_AVX
+#include "nnacl/assembly_global.h"
 .text
 .align 4
 .global ConvDwFp32Avx3x3
@@ -31,7 +32,7 @@
 // 56: input_stride
 // 64: relu
 // 72: relu6
-ConvDwFp32Avx3x3:
+asm_function ConvDwFp32Avx3x3
     pushq %r15
     pushq %r14
     pushq %r13
diff --git a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
index ff762c462b..643c1b3d0e 100644
--- a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
+++ b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_AVX
+#include "nnacl/assembly_global.h"
     .text
     .align 4
     .global MatmulFloatAvxOpt
@@ -34,7 +35,7 @@
 // 72: stride
 // 80: writeMode
 
-MatmulFloatAvxOpt:
+asm_function MatmulFloatAvxOpt
     // rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention
     pushq %r15
     pushq %r14
diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
index 74cc4c4bf7..359160786e 100644
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
@@ -19,12 +19,13 @@ asm_function ConvDwFp16Center
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #176
+    sub sp, sp, #192
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     stp x19, x20, [sp], #16
     stp x21, x22, [sp], #16
     stp x23, x24, [sp], #16
+    stp x25, x26, [sp], #16
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
@@ -71,7 +72,7 @@ asm_function ConvDwFp16Center
             mov v14.16b, v24.16b
             mov v15.16b, v24.16b
             LoopKh16:
-                mov x18, x7
+                mov x25, x7
                 mov x21, x16
                 LoopKw16:
                     mov x22, x21
@@ -108,7 +109,7 @@ asm_function ConvDwFp16Center
                     ld1 {v23.8h}, [x22], x11
                     fmla v14.8h, v22.8h, v25.8h
                     fmla v15.8h, v23.8h, v25.8h
-                    subs x18, x18, #1
+                    subs x25, x25, #1
                     add x21, x21, x13
                     bne LoopKw16
                 add x16, x16, x12
@@ -191,7 +192,7 @@ asm_function ConvDwFp16Center
             mov v6.16b, v24.16b
             mov v7.16b, v24.16b
             LoopKh8:
-                mov x18, x7
+                mov x25, x7
                 mov x21, x16
                 LoopKw8:
                     mov x22, x21
@@ -212,7 +213,7 @@ asm_function ConvDwFp16Center
                     ld1 {v23.8h}, [x22], x11
                     fmla v6.8h, v22.8h, v25.8h
                     fmla v7.8h, v23.8h, v25.8h
-                    subs x18, x18, #1
+                    subs x25, x25, #1
                     add x21, x21, x13
                     bne LoopKw8
                 add x16, x16, x12
@@ -260,13 +261,13 @@ asm_function ConvDwFp16Center
             mov x20, x6
             mov v0.16b, v24.16b
             LoopKh:
-                mov x18, x7
+                mov x25, x7
                 mov x22, x16
                 LoopKw:
                     ld1 {v16.8h}, [x22], x13
                     ld1 {v25.8h}, [x17], #16
                     fmla v0.8h, v16.8h, v25.8h
-                    subs x18, x18, #1
+                    subs x25, x25, #1
                     bne LoopKw
                 add x16, x16, x12
                 subs x20, x20, #1
@@ -289,11 +290,12 @@ asm_function ConvDwFp16Center
         subs x4, x4, #1
         bne LoopH
 
-    sub sp, sp, #176
+    sub sp, sp, #192
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
     ldp x21, x22, [sp], #16
     ldp x23, x24, [sp], #16
+    ldp x25, x26, [sp], #16
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
index c0ec1a6bbe..d315ac914f 100644
--- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
@@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center
         mov x16, x1
         mov x17, x4
         LoopW:
-            mov x18, x15
+            mov x22, x15  // x22 replaces x18 as the per-LoopW copy of x15. NOTE(review): x22 is callee-saved and this diff does not touch the prologue - confirm it is saved and restored
             mov x19, x2
             mov x20, x5
             ld1 {v1.8h}, [x16], x8
             LoopKh:
-                mov x21, x18
+                mov x21, x22  // x21 starts each LoopKh iteration from x22
                 mov x13, x6
                 LoopKw:
                     ld1 {v0.8h}, [x21]
@@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center
                     st1 {v0.8h}, [x21], x12
                     subs x13, x13, #1
                     bne LoopKw
-                add x18, x18, x11
+                add x22, x22, x11
                 subs x20, x20, #1
                 bne LoopKh
             add x15, x15, x10
diff --git a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
index 5f2c7e641e..c4c2e5e311 100644
--- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
+++ b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
@@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ r29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #128
+    sub sp, sp, #144
     // performance between storing 4 registers at the same time and separately storing them on in-order cores
     // is not tested yet
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+    stp x19, x20, [sp], #16
 
     ldr x8, [sp, #0]
     ldr x9, [sp, #8]
@@ -548,87 +549,87 @@ IndirectGemmStart:
                 b WriteEnd
             Write7:
                 add x17, x15, #8
-                add x18, x15, #10
+                add x19, x15, #10
                 add x16, x15, #12
                 st1 {v16.4h}, [x15], x7
                 ins v0.s[0], v16.s[2]
                 st1 {v0.h}[0], [x17], x7
-                st1 {v0.h}[1], [x18], x7
+                st1 {v0.h}[1], [x19], x7
                 st1 {v16.h}[6], [x16], x7
                 st1 {v17.4h}, [x15], x7
                 ins v1.s[0], v17.s[2]
                 st1 {v1.h}[0], [x17], x7
-                st1 {v1.h}[1], [x18], x7
+                st1 {v1.h}[1], [x19], x7
                 st1 {v17.h}[6], [x16], x7
                 st1 {v18.4h}, [x15], x7
                 ins v2.s[0], v18.s[2]
                 st1 {v2.h}[0], [x17], x7
-                st1 {v2.h}[1], [x18], x7
+                st1 {v2.h}[1], [x19], x7
                 st1 {v18.h}[6], [x16], x7
                 st1 {v19.4h}, [x15], x7
                 ins v3.s[0], v19.s[2]
                 st1 {v3.h}[0], [x17], x7
-                st1 {v3.h}[1], [x18], x7
+                st1 {v3.h}[1], [x19], x7
                 st1 {v19.h}[6], [x16], x7
                 st1 {v20.4h}, [x15], x7
                 ins v4.s[0], v20.s[2]
                 st1 {v4.h}[0], [x17], x7
-                st1 {v4.h}[1], [x18], x7
+                st1 {v4.h}[1], [x19], x7
                 st1 {v20.h}[6], [x16], x7
                 st1 {v21.4h}, [x15], x7
                 ins v5.s[0], v21.s[2]
                 st1 {v5.h}[0], [x17], x7
-                st1 {v5.h}[1], [x18], x7
+                st1 {v5.h}[1], [x19], x7
                 st1 {v21.h}[6], [x16], x7
                 st1 {v22.4h}, [x15], x7
                 ins v6.s[0], v22.s[2]
                 st1 {v6.h}[0], [x17], x7
-                st1 {v6.h}[1], [x18], x7
+                st1 {v6.h}[1], [x19], x7
                 st1 {v22.h}[6], [x16], x7
                 st1 {v23.4h}, [x15], x7
                 ins v7.s[0], v23.s[2]
                 st1 {v7.h}[0], [x17], x7
-                st1 {v7.h}[1], [x18], x7
+                st1 {v7.h}[1], [x19], x7
                 st1 {v23.h}[6], [x16], x7
                 st1 {v24.4h}, [x15], x7
                 ins v8.s[0], v24.s[2]
                 st1 {v8.h}[0], [x17], x7
-                st1 {v8.h}[1], [x18], x7
+                st1 {v8.h}[1], [x19], x7
                 st1 {v24.h}[6], [x16], x7
                 st1 {v25.4h}, [x15], x7
                 ins v9.s[0], v25.s[2]
                 st1 {v9.h}[0], [x17], x7
-                st1 {v9.h}[1], [x18], x7
+                st1 {v9.h}[1], [x19], x7
                 st1 {v25.h}[6], [x16], x7
                 st1 {v26.4h}, [x15], x7
                 ins v10.s[0], v26.s[2]
                 st1 {v10.h}[0], [x17], x7
-                st1 {v10.h}[1], [x18], x7
+                st1 {v10.h}[1], [x19], x7
                 st1 {v26.h}[6], [x16], x7
                 st1 {v27.4h}, [x15], x7
                 ins v11.s[0], v27.s[2]
                 st1 {v11.h}[0], [x17], x7
-                st1 {v11.h}[1], [x18], x7
+                st1 {v11.h}[1], [x19], x7
                 st1 {v27.h}[6], [x16], x7
                 st1 {v28.4h}, [x15], x7
                 ins v12.s[0], v28.s[2]
                 st1 {v12.h}[0], [x17], x7
-                st1 {v12.h}[1], [x18], x7
+                st1 {v12.h}[1], [x19], x7
                 st1 {v28.h}[6], [x16], x7
                 st1 {v29.4h}, [x15], x7
                 ins v13.s[0], v29.s[2]
                 st1 {v13.h}[0], [x17], x7
-                st1 {v13.h}[1], [x18], x7
+                st1 {v13.h}[1], [x19], x7
                 st1 {v29.h}[6], [x16], x7
                 st1 {v30.4h}, [x15], x7
                 ins v14.s[0], v30.s[2]
                 st1 {v14.h}[0], [x17], x7
-                st1 {v14.h}[1], [x18], x7
+                st1 {v14.h}[1], [x19], x7
                 st1 {v30.h}[6], [x16], x7
                 st1 {v31.4h}, [x15]
                 ins v15.s[0], v31.s[2]
                 st1 {v15.h}[0], [x17]
-                st1 {v15.h}[1], [x18]
+                st1 {v15.h}[1], [x19]
                 st1 {v31.h}[6], [x16]
                 add x0, x0, #14
                 b WriteEnd
@@ -661,9 +662,10 @@ IndirectGemmStart:
     NoStepForward:
         bgt LoopOc
 
-    sub sp, sp, #128
+    sub sp, sp, #144
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+    ldp x19, x20, [sp], #16
     ret
 #endif
 
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
index bc3644ad21..dac86acd0e 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
@@ -21,21 +21,22 @@
 // w13: writeC8
 
 asm_function MatmulFp16Neon64
-  sub sp, sp, #128
+  sub sp, sp, #144  // save area: 64 + 64 bytes for v8-v15 plus 16 bytes for x19/x20
   st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
   st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
+  stp x19, x20, [sp], #16  // x19 replaces x18 below, so save the pair; sp is back at its entry value afterwards
 
-  mov w18, #16 // sizeof(float16) * 8
-  mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
+  mov w19, #16 // sizeof(float16) * 8; w18 is the low half of the platform register x18, so use w19 (saved above) as scratch
+  mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
   mov x11, x3 // bias flag
-  mov x18, #2
+  mov x19, #2  // sizeof(float16)
   ldr x17, [sp]
-  mul x17, x17, x18
+  mul x17, x17, x19  // scale the value loaded from [sp] by 2 to get a byte stride
 
 L1:
   mov w10, w6 // reload lhs row
   mov x12, x0 // reload lhs ptr
-  mov x18, x2 // reload dst ptr
+  mov x19, x2 // reload dst ptr
 
 L2:
   mov x16, x1 // reload rhs ptr
@@ -314,490 +315,490 @@ Write:
   b Write8
 
 Write1:
-  st1 {v16.h}[0], [x18], x17
+  st1 {v16.h}[0], [x19], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.h}[0], [x18], x17
+  st1 {v17.h}[0], [x19], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.h}[0], [x18], x17
+  st1 {v18.h}[0], [x19], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.h}[0], [x18], x17
+  st1 {v19.h}[0], [x19], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.h}[0], [x18], x17
+  st1 {v20.h}[0], [x19], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.h}[0], [x18], x17
+  st1 {v21.h}[0], [x19], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.h}[0], [x18], x17
+  st1 {v22.h}[0], [x19], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.h}[0], [x18], x17
+  st1 {v23.h}[0], [x19], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.h}[0], [x18], x17
+  st1 {v24.h}[0], [x19], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.h}[0], [x18], x17
+  st1 {v25.h}[0], [x19], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.h}[0], [x18], x17
+  st1 {v26.h}[0], [x19], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.h}[0], [x18], x17
+  st1 {v27.h}[0], [x19], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.h}[0], [x18], x17
+  st1 {v28.h}[0], [x19], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.h}[0], [x18], x17
+  st1 {v29.h}[0], [x19], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.h}[0], [x18], x17
+  st1 {v30.h}[0], [x19], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.h}[0], [x18], x17
+  st1 {v31.h}[0], [x19], x17
   b WriteEnd
 Write2:
-  add x13, x18, #2
-  st1 {v16.h}[0], [x18], x17
+  add x13, x19, #2
+  st1 {v16.h}[0], [x19], x17
   st1 {v16.h}[1], [x13], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.h}[0], [x18], x17
+  st1 {v17.h}[0], [x19], x17
   st1 {v17.h}[1], [x13], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.h}[0], [x18], x17
+  st1 {v18.h}[0], [x19], x17
   st1 {v18.h}[1], [x13], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.h}[0], [x18], x17
+  st1 {v19.h}[0], [x19], x17
   st1 {v19.h}[1], [x13], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.h}[0], [x18], x17
+  st1 {v20.h}[0], [x19], x17
   st1 {v20.h}[1], [x13], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.h}[0], [x18], x17
+  st1 {v21.h}[0], [x19], x17
   st1 {v21.h}[1], [x13], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.h}[0], [x18], x17
+  st1 {v22.h}[0], [x19], x17
   st1 {v22.h}[1], [x13], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.h}[0], [x18], x17
+  st1 {v23.h}[0], [x19], x17
   st1 {v23.h}[1], [x13], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.h}[0], [x18], x17
+  st1 {v24.h}[0], [x19], x17
   st1 {v24.h}[1], [x13], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.h}[0], [x18], x17
+  st1 {v25.h}[0], [x19], x17
   st1 {v25.h}[1], [x13], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.h}[0], [x18], x17
+  st1 {v26.h}[0], [x19], x17
   st1 {v26.h}[1], [x13], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.h}[0], [x18], x17
+  st1 {v27.h}[0], [x19], x17
   st1 {v27.h}[1], [x13], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.h}[0], [x18], x17
+  st1 {v28.h}[0], [x19], x17
   st1 {v28.h}[1], [x13], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.h}[0], [x18], x17
+  st1 {v29.h}[0], [x19], x17
   st1 {v29.h}[1], [x13], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.h}[0], [x18], x17
+  st1 {v30.h}[0], [x19], x17
   st1 {v30.h}[1], [x13], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.h}[0], [x18], x17
+  st1 {v31.h}[0], [x19], x17
   st1 {v31.h}[1], [x13], x17
   b WriteEnd
 Write3:
-  add x13, x18, #2
-  add x14, x18, #4
-  st1 {v16.h}[0], [x18], x17
+  add x13, x19, #2
+  add x14, x19, #4
+  st1 {v16.h}[0], [x19], x17
   st1 {v16.h}[1], [x13], x17
   st1 {v16.h}[2], [x14], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.h}[0], [x18], x17
+  st1 {v17.h}[0], [x19], x17
   st1 {v17.h}[1], [x13], x17
   st1 {v17.h}[2], [x14], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.h}[0], [x18], x17
+  st1 {v18.h}[0], [x19], x17
   st1 {v18.h}[1], [x13], x17
   st1 {v18.h}[2], [x14], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.h}[0], [x18], x17
+  st1 {v19.h}[0], [x19], x17
   st1 {v19.h}[1], [x13], x17
   st1 {v19.h}[2], [x14], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.h}[0], [x18], x17
+  st1 {v20.h}[0], [x19], x17
   st1 {v20.h}[1], [x13], x17
   st1 {v20.h}[2], [x14], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.h}[0], [x18], x17
+  st1 {v21.h}[0], [x19], x17
   st1 {v21.h}[1], [x13], x17
   st1 {v21.h}[2], [x14], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.h}[0], [x18], x17
+  st1 {v22.h}[0], [x19], x17
   st1 {v22.h}[1], [x13], x17
   st1 {v22.h}[2], [x14], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.h}[0], [x18], x17
+  st1 {v23.h}[0], [x19], x17
   st1 {v23.h}[1], [x13], x17
   st1 {v23.h}[2], [x14], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.h}[0], [x18], x17
+  st1 {v24.h}[0], [x19], x17
   st1 {v24.h}[1], [x13], x17
   st1 {v24.h}[2], [x14], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.h}[0], [x18], x17
+  st1 {v25.h}[0], [x19], x17
   st1 {v25.h}[1], [x13], x17
   st1 {v25.h}[2], [x14], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.h}[0], [x18], x17
+  st1 {v26.h}[0], [x19], x17
   st1 {v26.h}[1], [x13], x17
   st1 {v26.h}[2], [x14], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.h}[0], [x18], x17
+  st1 {v27.h}[0], [x19], x17
   st1 {v27.h}[1], [x13], x17
   st1 {v27.h}[2], [x14], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.h}[0], [x18], x17
+  st1 {v28.h}[0], [x19], x17
   st1 {v28.h}[1], [x13], x17
   st1 {v28.h}[2], [x14], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.h}[0], [x18], x17
+  st1 {v29.h}[0], [x19], x17
   st1 {v29.h}[1], [x13], x17
   st1 {v29.h}[2], [x14], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.h}[0], [x18], x17
+  st1 {v30.h}[0], [x19], x17
   st1 {v30.h}[1], [x13], x17
   st1 {v30.h}[2], [x14], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.h}[0], [x18], x17
+  st1 {v31.h}[0], [x19], x17
   st1 {v31.h}[1], [x13], x17
   st1 {v31.h}[2], [x14], x17
   b WriteEnd
 Write4:
-  st1 {v16.4h}, [x18], x17
+  st1 {v16.4h}, [x19], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.4h}, [x18], x17
+  st1 {v17.4h}, [x19], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.4h}, [x18], x17
+  st1 {v18.4h}, [x19], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.4h}, [x18], x17
+  st1 {v19.4h}, [x19], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.4h}, [x18], x17
+  st1 {v20.4h}, [x19], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.4h}, [x18], x17
+  st1 {v21.4h}, [x19], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.4h}, [x18], x17
+  st1 {v22.4h}, [x19], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.4h}, [x18], x17
+  st1 {v23.4h}, [x19], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4h}, [x18], x17
+  st1 {v24.4h}, [x19], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.4h}, [x18], x17
+  st1 {v25.4h}, [x19], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.4h}, [x18], x17
+  st1 {v26.4h}, [x19], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.4h}, [x18], x17
+  st1 {v27.4h}, [x19], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.4h}, [x18], x17
+  st1 {v28.4h}, [x19], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.4h}, [x18], x17
+  st1 {v29.4h}, [x19], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.4h}, [x18], x17
+  st1 {v30.4h}, [x19], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.4h}, [x18], x17
+  st1 {v31.4h}, [x19], x17
   b WriteEnd
 Write5:
-  add x13, x18, #8
-  st1 {v16.4h}, [x18], x17
+  add x13, x19, #8
+  st1 {v16.4h}, [x19], x17
   st1 {v16.h}[4], [x13], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.4h}, [x18], x17
+  st1 {v17.4h}, [x19], x17
   st1 {v17.h}[4], [x13], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.4h}, [x18], x17
+  st1 {v18.4h}, [x19], x17
   st1 {v18.h}[4], [x13], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.4h}, [x18], x17
+  st1 {v19.4h}, [x19], x17
   st1 {v19.h}[4], [x13], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.4h}, [x18], x17
+  st1 {v20.4h}, [x19], x17
   st1 {v20.h}[4], [x13], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.4h}, [x18], x17
+  st1 {v21.4h}, [x19], x17
   st1 {v21.h}[4], [x13], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.4h}, [x18], x17
+  st1 {v22.4h}, [x19], x17
   st1 {v22.h}[4], [x13], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.4h}, [x18], x17
+  st1 {v23.4h}, [x19], x17
   st1 {v23.h}[4], [x13], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4h}, [x18], x17
+  st1 {v24.4h}, [x19], x17
   st1 {v24.h}[4], [x13], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.4h}, [x18], x17
+  st1 {v25.4h}, [x19], x17
   st1 {v25.h}[4], [x13], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.4h}, [x18], x17
+  st1 {v26.4h}, [x19], x17
   st1 {v26.h}[4], [x13], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.4h}, [x18], x17
+  st1 {v27.4h}, [x19], x17
   st1 {v27.h}[4], [x13], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.4h}, [x18], x17
+  st1 {v28.4h}, [x19], x17
   st1 {v28.h}[4], [x13], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.4h}, [x18], x17
+  st1 {v29.4h}, [x19], x17
   st1 {v29.h}[4], [x13], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.4h}, [x18], x17
+  st1 {v30.4h}, [x19], x17
   st1 {v30.h}[4], [x13], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.4h}, [x18], x17
+  st1 {v31.4h}, [x19], x17
   st1 {v31.h}[4], [x13], x17
   b WriteEnd
 Write6:
-  add x13, x18, #8
-  add x14, x18, #10
-  st1 {v16.4h}, [x18], x17
+  add x13, x19, #8
+  add x14, x19, #10
+  st1 {v16.4h}, [x19], x17
   st1 {v16.h}[4], [x13], x17
   st1 {v16.h}[5], [x14], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.4h}, [x18], x17
+  st1 {v17.4h}, [x19], x17
   st1 {v17.h}[4], [x13], x17
   st1 {v17.h}[5], [x14], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.4h}, [x18], x17
+  st1 {v18.4h}, [x19], x17
   st1 {v18.h}[4], [x13], x17
   st1 {v18.h}[5], [x14], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.4h}, [x18], x17
+  st1 {v19.4h}, [x19], x17
   st1 {v19.h}[4], [x13], x17
   st1 {v19.h}[5], [x14], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.4h}, [x18], x17
+  st1 {v20.4h}, [x19], x17
   st1 {v20.h}[4], [x13], x17
   st1 {v20.h}[5], [x14], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.4h}, [x18], x17
+  st1 {v21.4h}, [x19], x17
   st1 {v21.h}[4], [x13], x17
   st1 {v21.h}[5], [x14], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.4h}, [x18], x17
+  st1 {v22.4h}, [x19], x17
   st1 {v22.h}[4], [x13], x17
   st1 {v22.h}[5], [x14], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.4h}, [x18], x17
+  st1 {v23.4h}, [x19], x17
   st1 {v23.h}[4], [x13], x17
   st1 {v23.h}[5], [x14], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4h}, [x18], x17
+  st1 {v24.4h}, [x19], x17
   st1 {v24.h}[4], [x13], x17
   st1 {v24.h}[5], [x14], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.4h}, [x18], x17
+  st1 {v25.4h}, [x19], x17
   st1 {v25.h}[4], [x13], x17
   st1 {v25.h}[5], [x14], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.4h}, [x18], x17
+  st1 {v26.4h}, [x19], x17
   st1 {v26.h}[4], [x13], x17
   st1 {v26.h}[5], [x14], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.4h}, [x18], x17
+  st1 {v27.4h}, [x19], x17
   st1 {v27.h}[4], [x13], x17
   st1 {v27.h}[5], [x14], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.4h}, [x18], x17
+  st1 {v28.4h}, [x19], x17
   st1 {v28.h}[4], [x13], x17
   st1 {v28.h}[5], [x14], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.4h}, [x18], x17
+  st1 {v29.4h}, [x19], x17
   st1 {v29.h}[4], [x13], x17
   st1 {v29.h}[5], [x14], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.4h}, [x18], x17
+  st1 {v30.4h}, [x19], x17
   st1 {v30.h}[4], [x13], x17
   st1 {v30.h}[5], [x14], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.4h}, [x18], x17
+  st1 {v31.4h}, [x19], x17
   st1 {v31.h}[4], [x13], x17
   st1 {v31.h}[5], [x14], x17
   b WriteEnd
 Write7:
-  add x13, x18, #8
-  add x14, x18, #10
-  add x16, x18, #12
-  st1 {v16.4h}, [x18], x17
+  add x13, x19, #8
+  add x14, x19, #10
+  add x16, x19, #12
+  st1 {v16.4h}, [x19], x17
   st1 {v16.h}[4], [x13], x17
   st1 {v16.h}[5], [x14], x17
   st1 {v16.h}[6], [x16], x17
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.4h}, [x18], x17
+  st1 {v17.4h}, [x19], x17
   st1 {v17.h}[4], [x13], x17
   st1 {v17.h}[5], [x14], x17
   st1 {v17.h}[6], [x16], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.4h}, [x18], x17
+  st1 {v18.4h}, [x19], x17
   st1 {v18.h}[4], [x13], x17
   st1 {v18.h}[5], [x14], x17
   st1 {v18.h}[6], [x16], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.4h}, [x18], x17
+  st1 {v19.4h}, [x19], x17
   st1 {v19.h}[4], [x13], x17
   st1 {v19.h}[5], [x14], x17
   st1 {v19.h}[6], [x16], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.4h}, [x18], x17
+  st1 {v20.4h}, [x19], x17
   st1 {v20.h}[4], [x13], x17
   st1 {v20.h}[5], [x14], x17
   st1 {v20.h}[6], [x16], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.4h}, [x18], x17
+  st1 {v21.4h}, [x19], x17
   st1 {v21.h}[4], [x13], x17
   st1 {v21.h}[5], [x14], x17
   st1 {v21.h}[6], [x16], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.4h}, [x18], x17
+  st1 {v22.4h}, [x19], x17
   st1 {v22.h}[4], [x13], x17
   st1 {v22.h}[5], [x14], x17
   st1 {v22.h}[6], [x16], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.4h}, [x18], x17
+  st1 {v23.4h}, [x19], x17
   st1 {v23.h}[4], [x13], x17
   st1 {v23.h}[5], [x14], x17
   st1 {v23.h}[6], [x16], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.4h}, [x18], x17
+  st1 {v24.4h}, [x19], x17
   st1 {v24.h}[4], [x13], x17
   st1 {v24.h}[5], [x14], x17
   st1 {v24.h}[6], [x16], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.4h}, [x18], x17
+  st1 {v25.4h}, [x19], x17
   st1 {v25.h}[4], [x13], x17
   st1 {v25.h}[5], [x14], x17
   st1 {v25.h}[6], [x16], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.4h}, [x18], x17
+  st1 {v26.4h}, [x19], x17
   st1 {v26.h}[4], [x13], x17
   st1 {v26.h}[5], [x14], x17
   st1 {v26.h}[6], [x16], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.4h}, [x18], x17
+  st1 {v27.4h}, [x19], x17
   st1 {v27.h}[4], [x13], x17
   st1 {v27.h}[5], [x14], x17
   st1 {v27.h}[6], [x16], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.4h}, [x18], x17
+  st1 {v28.4h}, [x19], x17
   st1 {v28.h}[4], [x13], x17
   st1 {v28.h}[5], [x14], x17
   st1 {v28.h}[6], [x16], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.4h}, [x18], x17
+  st1 {v29.4h}, [x19], x17
   st1 {v29.h}[4], [x13], x17
   st1 {v29.h}[5], [x14], x17
   st1 {v29.h}[6], [x16], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.4h}, [x18], x17
+  st1 {v30.4h}, [x19], x17
   st1 {v30.h}[4], [x13], x17
   st1 {v30.h}[5], [x14], x17
   st1 {v30.h}[6], [x16], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.4h}, [x18], x17
+  st1 {v31.4h}, [x19], x17
   st1 {v31.h}[4], [x13], x17
   st1 {v31.h}[5], [x14], x17
   st1 {v31.h}[6], [x16], x17
@@ -809,52 +810,52 @@ WriteC8:
   st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
   b WriteEnd
 Write8:
-  st1 {v16.8h}, [x18], x17
+  st1 {v16.8h}, [x19], x17 // store 8 halfwords at [x19], then x19 += x17; x19 takes over from x18 as the write pointer
   cmp w10, #1
   beq WriteEnd
-  st1 {v17.8h}, [x18], x17
+  st1 {v17.8h}, [x19], x17
   cmp w10, #2
   beq WriteEnd
-  st1 {v18.8h}, [x18], x17
+  st1 {v18.8h}, [x19], x17
   cmp w10, #3
   beq WriteEnd
-  st1 {v19.8h}, [x18], x17
+  st1 {v19.8h}, [x19], x17
   cmp w10, #4
   beq WriteEnd
-  st1 {v20.8h}, [x18], x17
+  st1 {v20.8h}, [x19], x17
   cmp w10, #5
   beq WriteEnd
-  st1 {v21.8h}, [x18], x17
+  st1 {v21.8h}, [x19], x17
   cmp w10, #6
   beq WriteEnd
-  st1 {v22.8h}, [x18], x17
+  st1 {v22.8h}, [x19], x17
   cmp w10, #7
   beq WriteEnd
-  st1 {v23.8h}, [x18], x17
+  st1 {v23.8h}, [x19], x17
   cmp w10, #8
   beq WriteEnd
-  st1 {v24.8h}, [x18], x17
+  st1 {v24.8h}, [x19], x17
   cmp w10, #9
   beq WriteEnd
-  st1 {v25.8h}, [x18], x17
+  st1 {v25.8h}, [x19], x17
   cmp w10, #10
   beq WriteEnd
-  st1 {v26.8h}, [x18], x17
+  st1 {v26.8h}, [x19], x17
   cmp w10, #11
   beq WriteEnd
-  st1 {v27.8h}, [x18], x17
+  st1 {v27.8h}, [x19], x17
   cmp w10, #12
   beq WriteEnd
-  st1 {v28.8h}, [x18], x17
+  st1 {v28.8h}, [x19], x17
   cmp w10, #13
   beq WriteEnd
-  st1 {v29.8h}, [x18], x17
+  st1 {v29.8h}, [x19], x17
   cmp w10, #14
   beq WriteEnd
-  st1 {v30.8h}, [x18], x17
+  st1 {v30.8h}, [x19], x17
   cmp w10, #15
   beq WriteEnd
-  st1 {v31.8h}, [x18], x17
+  st1 {v31.8h}, [x19], x17 // 16th and last store of this path; falls through to WriteEnd
 
 WriteEnd:
   subs w10, w10, #16 // lhs row - 8
@@ -871,8 +872,9 @@ NoDstStep:
   bgt L1
 
 End1:
-  sub sp, sp, #128
+  sub sp, sp, #144 // rewind by the 144 bytes the three restores below walk back up (64 + 64 + 16); NOTE(review): the prologue is outside this hunk - confirm it reserves 144 and stores x19/x20
   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
+  ldp x19, x20, [sp], #16 // restore callee-saved x19/x20 (x19 is advanced by the Write8 stores)
   ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
index 1d2eb479bc..38699e37b8 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
@@ -21,30 +21,31 @@
 // x9: writeMode
 
 asm_function MatmulFp16Neon64Opt
-    sub sp, sp, #80
+    sub sp, sp, #96 // save area: 64 bytes for v8-v11 + 16 for x19/x20 + 16 for x21/x22
     st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     stp x19, x20, [sp], #16
+    stp x21, x22, [sp], #16 // x21 replaces x18 as scratch; this store brings sp back to its entry value. NOTE(review): the save area now sits below sp - confirm nothing (e.g. a signal frame) can overwrite it
 
     ldr x8, [sp]
     ldr x9, [sp, #8]
 
-    mov x18, #32 // sizeof(float16_t) * 16
-    mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
+    mov x21, #32 // sizeof(float16_t) * 16
+    mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
     cbnz x9, NoC8Steps
     mov x11, x2
-    mov x18, #16
-    mul x16, x6, x18 // row * 8 * sizeof(float16_t)
+    mov x21, #16 // 8 * sizeof(float16_t)
+    mul x16, x6, x21 // row * 8 * sizeof(float16_t)
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
-    mov x18, #2
+    mov x21, #2 // sizeof(float16_t)
     mul x15, x7, x8
-    mul x15, x15, x18 // kernel_size * col *sizeof(float16_t)
-    mov x18, #16
-    mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t)
+    mul x15, x15, x21 // kernel_size * col *sizeof(float16_t)
+    mov x21, #16 // 8 * sizeof(float16_t)
+    mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t)
 NoWinoSteps:
-    mov x18, #2
-    mul x8, x8, x18
+    mov x21, #2 // sizeof(float16_t)
+    mul x8, x8, x21 // x8 *= 2: scale x8 by the float16_t size
 
 LoopRowStart:
     cmp x6, #1
@@ -1221,9 +1222,9 @@ LoopRow:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
-        mov x18, #2
-        mul x18, x18, x7
-        sub x11, x11, x18
+        mov x21, #2 // sizeof(float16_t)
+        mul x21, x21, x7 // x21 = 2 * x7
+        sub x11, x11, x21 // x11 -= 2 * x7 before x2 is reloaded from it
         mov x2, x11
         b NoDstStep
     C8DstStep:
@@ -1233,8 +1234,9 @@ LoopColEnd:
         subs x6, x6, #16
         bgt LoopRowStart
 
-    sub sp, sp, #80
+    sub sp, sp, #96 // rewind to the start of the 96-byte save area written by the prologue
     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     ldp x19, x20, [sp], #16
+    ldp x21, x22, [sp], #16 // restore x21/x22; sp is back at its entry value for ret
     ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
index daaed9163a..029365b0a9 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
@@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16
         mov x14, x1  // mat_b
         LoopN:
             mov x16, x0  // mat_a_m
-            sub x18, x5, x15   // ni
+            sub x22, x5, x15   // ni; NOTE(review): x22 is callee-saved under AAPCS64 - confirm the prologue (outside this hunk) saves and restores it
             sub x19, x17, x3   // mi
-            mul x18, x18, x17  // ni * m
+            mul x22, x22, x17  // ni * m
             mov x11, x6 // in_channel
-            add x18, x18, x19  // (ni * m) + mi
-            mul x18, x18, x13   // x18 * channel_in * 2
-            add x20, x2, x18   // dst + offset
+            add x22, x22, x19  // (ni * m) + mi
+            mul x22, x22, x13   // x22 * channel_in * 2
+            add x20, x2, x22   // dst + offset
             cmp x11, #32
             bge LoopC32
             cmp x11, #16
diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
index df1d88750e..ccb782881d 100644
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
@@ -9,8 +9,8 @@
 
 asm_function WinogradTransLeftFp16
 
-sub sp, sp, #32
-stp x19, x20, [sp], #32
+sub sp, sp, #16 // save area is exactly one register pair (was 32 for the same single pair)
+stp x19, x20, [sp], #16 // save callee-saved x19/x20 (x20 now stands in for x18); sp returns to its entry value. NOTE(review): the saved pair sits below sp - confirm nothing can overwrite it
 
 mov x8, #8 // 4 * sizeof(float16)
 mul x8, x6, x8
@@ -46,16 +46,16 @@ LoopH:
             ld1 {v0.h}[2], [x17], x10
             ld1 {v0.h}[3], [x17], x10
             mov x11, x6
-            mov x18, x17
-            add x18, x14, x7
-            add x16, x18, x7
+            mov x20, x17 // NOTE(review): x20 is overwritten by the next instruction before it is read
+            add x20, x14, x7 // x20 = x14 + x7
+            add x16, x20, x7 // x16 = x20 + x7
             add x19, x16, x7
 
             LoopLength4:
                 ld1 {v16.4h}, [x2]
                 ld1 {v20.4h}, [x14], #8
                 fmla v16.4h, v20.4h, v0.h[0]
-                ld1 {v21.4h}, [x18], #8
+                ld1 {v21.4h}, [x20], #8 // load 4 halfwords via x20, then advance it by 8 bytes
                 fmul v17.4h, v21.4h, v0.h[1]
                 ld1 {v20.4h}, [x16], #8
                 fmla v16.4h, v20.4h, v0.h[2]
@@ -81,14 +81,14 @@ LoopH:
             ld1 {v0.h}[1], [x17], x10
             ld1 {v0.h}[2], [x17], x10
             mov x11, x6
-            mov x18, x17
-            add x18, x14, x7
-            add x16, x18, x7
+            mov x20, x17 // NOTE(review): x20 is overwritten by the next instruction before it is read
+            add x20, x14, x7 // x20 = x14 + x7
+            add x16, x20, x7 // x16 = x20 + x7
             LoopLength3:
                 ld1 {v16.4h}, [x2]
                 ld1 {v20.4h}, [x14], #8
                 fmla v16.4h, v20.4h, v0.h[0]
-                ld1 {v21.4h}, [x18], #8
+                ld1 {v21.4h}, [x20], #8 // load 4 halfwords via x20, then advance it by 8 bytes
                 fmul v17.4h, v21.4h, v0.h[1]
                 ld1 {v20.4h}, [x16], #8
                 fmla v16.4h, v20.4h, v0.h[2]
@@ -132,6 +132,6 @@ LoopH:
     subs x4, x4, #1
     bne LoopH
 
-    sub sp, sp, #32
-    ldp x19, x20, [sp], #32
+    sub sp, sp, #16 // rewind to the 16-byte slot written by the prologue
+    ldp x19, x20, [sp], #16 // restore x19/x20; sp is back at its entry value for ret
     ret
diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
index c889803691..73c1e517d7 100644
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
@@ -9,6 +9,9 @@
 
 asm_function WinogradTransRightFp16
 
+sub sp, sp, #16 // reserve one 16-byte slot for the x19/x20 pair
+stp x19, x20, [sp], #16 // save callee-saved x19 (new stand-in for x18) with x20 as its pair; sp returns to its entry value. NOTE(review): the saved pair sits below sp - confirm nothing can overwrite it
+
 mov x8, #8 // 4 * sizeof(float16)
 mul x8, x6, x8
 mul x9, x5, x8 // step for S
@@ -34,7 +37,7 @@ LoopH:
             cmp x12, #4
             blt LoopKStart3
             mov x16, x15
-            mov x18, x4
+            mov x19, x4 // x19 = x4; replaced inside LoopK4 by x16 + x8
             LoopK4:
                 ld1 {v0.h}[0], [x13], x10
                 ld1 {v0.h}[1], [x13], x10
@@ -45,7 +48,7 @@ LoopH:
 
                 add x14, x17, x8
                 add x16, x14, x8
-                add x18, x16, x8
+                add x19, x16, x8 // x19 = x16 + x8, i.e. x17 + 3 * x8
 
                 LoopLength4:
                     ld1 {v16.4h}, [x2]
@@ -55,7 +58,7 @@ LoopH:
                     fmul v17.4h, v21.4h, v0.h[1]
                     ld1 {v20.4h}, [x16], #8
                     fmla v16.4h, v20.4h, v0.h[2]
-                    ld1 {v21.4h}, [x18], #8
+                    ld1 {v21.4h}, [x19], #8 // load 4 halfwords via x19, then advance it by 8 bytes
                     fmla v17.4h, v21.4h, v0.h[3]
 
                     fadd v17.4h, v16.4h, v17.4h
@@ -64,7 +67,7 @@ LoopH:
                     bne LoopLength4
                 sub x2, x2, x8
                 sub x12, x12, #4
-                mov x17, x18
+                mov x17, x19 // next LoopK4 pass starts from where x19 ended
 
                 cmp x12, #4
                 bge LoopK4
@@ -98,7 +101,7 @@ LoopH:
                     bne LoopLength3
                 sub x2, x2, x8
                 sub x12, x12, #3
-                mov x17, x18
+                mov x17, x19 // NOTE(review): no write to x19 is visible in this hunk's K3 path - confirm it holds the intended pointer here
                 cmp x12, #3
                 bge LoopK3
 
@@ -132,4 +135,7 @@ LoopH:
     subs x4, x4, #1
     bne LoopH
 
+    sub sp, sp, #16 // rewind to the 16-byte slot written by the prologue
+    ldp x19, x20, [sp], #16 // restore x19/x20; sp is back at its entry value for ret
+
     ret
\ No newline at end of file
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
index 38a38433b1..5bc1e5095c 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
@@ -66,7 +66,7 @@ L2:
   cmp w16, #0
   beq End2
 
-  mov x18, x1     // reload b ptr
+  mov x28, x1     // reload b ptr; NOTE(review): x28 is callee-saved under AAPCS64 - confirm the prologue (outside this hunk) saves and restores it
   mov x19, x7     // reload bias ptr
   mov w20, w5     // reload depth
   dup v16.4s, wzr
@@ -91,7 +91,7 @@ L3:
 
 LoopD16:
   ld1 {v0.16b, v1.16b}, [x17], #32
-  ld1 {v2.16b, v3.16b}, [x18], #32
+  ld1 {v2.16b, v3.16b}, [x28], #32 // load 32 bytes through the b ptr, then advance it
 
   sdot v16.4s, v2.16b, v0.4b[0]
   sdot v18.4s, v2.16b, v0.4b[1]
@@ -104,7 +104,7 @@ LoopD16:
   sdot v28.4s, v2.16b, v1.4b[2]
   sdot v30.4s, v2.16b, v1.4b[3]
 
-  ld1 {v6.16b, v7.16b}, [x18], #32
+  ld1 {v6.16b, v7.16b}, [x28], #32 // next 32 bytes of b
   sdot v17.4s, v3.16b, v0.4b[0]
   sdot v19.4s, v3.16b, v0.4b[1]
   sdot v21.4s, v3.16b, v0.4b[2]
@@ -126,7 +126,7 @@ LoopD16:
   sdot v28.4s, v6.16b, v5.4b[2]
   sdot v30.4s, v6.16b, v5.4b[3]
 
-  ld1 {v10.16b, v11.16b}, [x18], #32
+  ld1 {v10.16b, v11.16b}, [x28], #32 // next 32 bytes of b
   sdot v17.4s, v7.16b, v4.4b[0]
   sdot v19.4s, v7.16b, v4.4b[1]
   sdot v21.4s, v7.16b, v4.4b[2]
@@ -148,7 +148,7 @@ LoopD16:
   sdot v28.4s, v10.16b, v9.4b[2]
   sdot v30.4s, v10.16b, v9.4b[3]
 
-  ld1 {v14.16b, v15.16b}, [x18], #32
+  ld1 {v14.16b, v15.16b}, [x28], #32 // next 32 bytes of b
   sdot v17.4s, v11.16b, v8.4b[0]
   sdot v19.4s, v11.16b, v8.4b[1]
   sdot v21.4s, v11.16b, v8.4b[2]
@@ -187,7 +187,7 @@ LoopD4:
   beq End3
 
   ld1 {v0.16b, v1.16b}, [x17], #32
-  ld1 {v2.16b, v3.16b}, [x18], #32
+  ld1 {v2.16b, v3.16b}, [x28], #32 // load 32 bytes through the b ptr, then advance it
 
   sdot v16.4s, v2.16b, v0.4b[0]
   sdot v18.4s, v2.16b, v0.4b[1]
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
index fc3ef28b86..95f30fe123 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
@@ -30,7 +30,7 @@
 // x28: filter_zp
 
 asm_function MatmulInt8DpOpt
-  sub sp, sp, #208
+  sub sp, sp, #224 // save area grows by 16 bytes to hold the extra x29/x30 pair stored below
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   stp x19, x20, [sp], #16
@@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt
   stp x23, x24, [sp], #16
   stp x25, x26, [sp], #16
   stp x27, x28, [sp], #16
+  stp x29, x30, [sp], #16 // x29 becomes the bias pointer below, so it is saved (x30 fills the pair). NOTE(review): x29 is the frame pointer under AAPCS64; using it as a general register can break frame-pointer unwinding - confirm this is acceptable on all targets
 
   ldr w8, [sp]
   ldr w9, [sp, #8]
@@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt
 LoopRow:
     mov x16, x1 // reload rhs ptr
     mov x17, x4 // reload rhs col
-    mov x18, x7 // reload bias ptr
+    mov x29, x7 // reload bias ptr
     mov x25, x6 // reload input_sum ptr
     mov x27, x2 // reload dst ptr
     ldr x28, [sp, #64] // reload filter_zp
@@ -113,7 +114,7 @@ LoopRow:
 
         Bias:
             cbz x7, NoReadBias
-            ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64
+            ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64 // read 16 bias words, then advance the bias ptr by 64 bytes
             add v16.4s, v16.4s, v0.4s
             add v17.4s, v17.4s, v1.4s
             add v18.4s, v18.4s, v2.4s
@@ -423,8 +424,8 @@ LoopRow:
 
         BiasHalf:
             cbz x7, NoReadBiasHalf
-            ld1 {v0.4s, v1.4s}, [x18]
-            add x18, x18, #64
+            ld1 {v0.4s, v1.4s}, [x29] // read 8 bias words; no post-increment here
+            add x29, x29, #64 // advance the bias ptr by 64 bytes, the same step as the full-width Bias path
             add v16.4s, v16.4s, v0.4s
             add v17.4s, v17.4s, v1.4s
             add v20.4s, v20.4s, v0.4s
@@ -612,8 +613,8 @@ LoopRow:
 
         BiasQuarter:
             cbz x7, NoReadBiasQuarter
-            ld1 {v0.4s}, [x18]
-            add x18, x18, #64
+            ld1 {v0.4s}, [x29] // read 4 bias words; no post-increment here
+            add x29, x29, #64 // advance the bias ptr by 64 bytes, the same step as the full-width Bias path
             add v16.4s, v16.4s, v0.4s
             add v20.4s, v20.4s, v0.4s
             add v24.4s, v24.4s, v0.4s
@@ -1072,7 +1073,7 @@ LoopColEnd:
     b LoopRow
 
 LoopRowEnd:
-  sub sp, sp, #208
+  sub sp, sp, #224 // rewind to the start of the 224-byte save area written by the prologue
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   ldp x19, x20, [sp], #16
@@ -1080,5 +1081,6 @@ LoopRowEnd:
   ldp x23, x24, [sp], #16
   ldp x25, x26, [sp], #16
   ldp x27, x28, [sp], #16
+  ldp x29, x30, [sp], #16 // restore x29 (used as the bias ptr) and x30 before ret
   ret
 #endif
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
index 03342a3986..e769ae4185 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
@@ -20,9 +20,10 @@
 // x7: bias
 
 asm_function MatMulOptR4Int8Neon64
-  sub sp, sp, #128
+  sub sp, sp, #144 // save area: 2 x 64 bytes for v8-v15 + 16 bytes for x19/x20
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  stp x19, x20, [sp], #16 // save callee-saved x19 (new b ptr register) with x20 as its pair; sp returns to its entry value. NOTE(review): the save area sits below sp - confirm nothing can overwrite it
 
   mov w15, #0       // b col index
   mov w16, #0       // a row index
@@ -40,7 +41,7 @@ L2:
   cmp w16, w3
   beq End2
 
-  mov x18, x1     // reload b ptr
+  mov x19, x1     // reload b ptr
   mov x10, x7    // reload bias ptr
   mov w11, w5     // reload depth
   dup v16.4s, wzr
@@ -67,10 +68,10 @@ L3:
   ld1 {v1.16b}, [x17], #16
   ld1 {v2.16b}, [x17], #16
   ld1 {v3.16b}, [x17], #16
-  ld1 {v4.16b}, [x18], #16
-  ld1 {v5.16b}, [x18], #16
-  ld1 {v6.16b}, [x18], #16
-  ld1 {v7.16b}, [x18], #16
+  ld1 {v4.16b}, [x19], #16 // four consecutive 16-byte loads through the b ptr (64 bytes in total)
+  ld1 {v5.16b}, [x19], #16
+  ld1 {v6.16b}, [x19], #16
+  ld1 {v7.16b}, [x19], #16
 
   sdot v16.4s, v4.16b, v0.16b
   sdot v17.4s, v5.16b, v0.16b
@@ -135,8 +136,9 @@ End2:
   b L1
 
 End1:
-  sub sp, sp, #128
+  sub sp, sp, #144 // rewind to the start of the 144-byte save area written by the prologue
   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+  ldp x19, x20, [sp], #16 // restore x19/x20; sp is back at its entry value for ret
   ret
 #endif