add fp32 deconv kernels

pull/7527/head
lixian 4 years ago
parent ba6023b87d
commit aa94e5a91e

@@ -0,0 +1,198 @@
#ifdef ENABLE_ARM32
.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif
TiledC4MatmulFp32:
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
//r0: dst
//r1: src
//r2: weight
//r3: cal_num
//r4: ic4 (5th arg, loaded from the stack)
//r5: oc4 (6th arg, loaded from the stack)
push {r4-r8, lr}
ldr r4, [sp, #24]
ldr r5, [sp, #28]
//convert the dst step to bytes (multiply by sizeof(float))
mov r8, #4
mul r3, r8, r3
vpush {q4-q7}
LoopOc:
mov r6, r1
mov r8, r0
subs r7, r4, #1
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
vld1.32 {q4, q5}, [r2]!
vld1.32 {q6, q7}, [r2]!
vmul.f32 q8, q4, d0[0]
vmul.f32 q9, q4, d2[0]
vmul.f32 q10, q4, d4[0]
vmul.f32 q11, q4, d6[0]
vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]
vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]
vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vmla.f32 q11, q7, d7[1]
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
vmul.f32 q12, q4, d0[0]
vmul.f32 q13, q4, d2[0]
vmul.f32 q14, q4, d4[0]
vmul.f32 q15, q4, d6[0]
vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]
vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vmla.f32 q15, q6, d7[0]
vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vmla.f32 q15, q7, d7[1]
beq LoopIcEnd
subs r7, r7, #1
vld1.32 {q4, q5}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
vmla.f32 q8, q4, d0[0]
vmla.f32 q9, q4, d2[0]
beq LoopIcEndHalf
LoopIc:
vmla.f32 q10, q4, d4[0]
vmla.f32 q11, q4, d6[0]
vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vld1.32 {q6, q7}, [r2]!
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]
vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]
vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q11, q7, d7[1]
vld1.32 {q2, q3}, [r1]!
vmla.f32 q12, q4, d0[0]
vmla.f32 q13, q4, d2[0]
vmla.f32 q14, q4, d4[0]
vmla.f32 q15, q4, d6[0]
vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]
vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vld1.32 {q4, q5}, [r2]!
vmla.f32 q15, q6, d7[0]
vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q15, q7, d7[1]
vld1.32 {q2, q3}, [r1]!
vmla.f32 q8, q4, d0[0]
vmla.f32 q9, q4, d2[0]
subs r7, r7, #1
bne LoopIc
LoopIcEndHalf:
vmla.f32 q10, q4, d4[0]
vmla.f32 q11, q4, d6[0]
vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vld1.32 {q6, q7}, [r2]!
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]
vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]
vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q11, q7, d7[1]
vld1.32 {q2, q3}, [r1]!
vmla.f32 q12, q4, d0[0]
vmla.f32 q13, q4, d2[0]
vmla.f32 q14, q4, d4[0]
vmla.f32 q15, q4, d6[0]
vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]
vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vmla.f32 q15, q6, d7[0]
vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vmla.f32 q15, q7, d7[1]
LoopIcEnd:
vst1.32 {q8, q9}, [r0]!
vst1.32 {q10, q11}, [r0]!
vst1.32 {q12, q13}, [r0]!
vst1.32 {q14, q15}, [r0]!
mov r1, r6
subs r5, r5, #1
add r0, r8, r3
bne LoopOc
vpop {q4-q7}
pop {r4-r8, pc}
#endif
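
For reference, the scalar path this NEON kernel replaces is kept behind #ifndef ENABLE_ARM later in this commit; the sketch below condenses it to show the data layout the assembly assumes. It is a sketch, not the committed source: DECONV_WINOGRAD_DEFAULT_TILE is taken to be 8 here, matching the eight accumulator quads (q8..q15) above, and the Ref suffix is added only to avoid clashing with the real symbol.

#include <stddef.h>

#define DECONV_WINOGRAD_DEFAULT_TILE 8 /* assumed: matches the 8 accumulators (q8..q15) above */

/* Scalar sketch of the NEON kernel: dst is [oc4][tile][4], src is [ic4][tile][4],
 * weight is [oc4][ic4][4][4]; cal_num is the per-oc4 stride of dst in floats. */
void TiledC4MatmulFp32Ref(float *dst, const float *src, const float *weight,
                          size_t cal_num, size_t ic4, size_t oc4) {
  int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
  for (size_t dz = 0; dz < oc4; ++dz) {
    float *dst_z = dst + dz * cal_num;
    const float *weight_dz = weight + dz * ic4 * 16;
    for (int dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) {
      float *dst_x = dst_z + dx * 4;
      dst_x[0] = dst_x[1] = dst_x[2] = dst_x[3] = 0.0f;
      const float *src_dx = src + 4 * dx;
      for (size_t sz = 0; sz < ic4; ++sz) {
        const float *src_z = src_dx + sz * src_depth_step;
        const float *weight_z = weight_dz + sz * 16;
        for (int i = 0; i < 4; ++i) {    /* input channel within the C4 block */
          for (int j = 0; j < 4; ++j) {  /* output channel within the C4 block */
            dst_x[j] += src_z[i] * weight_z[4 * i + j];
          }
        }
      }
    }
  }
}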

@@ -0,0 +1,218 @@
#ifdef ENABLE_ARM32
.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif
//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h (5th arg, loaded from the stack)
//r5: k (6th arg, loaded from the stack)
//r6: length (7th arg, loaded from the stack)
WinogradTransLeft:
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]
mov r8, #16 // 4 * sizeof(float)
mul r8, r6, r8
mul r9, r3, r8
sub r9, r9, r8
add r7, r9, r8 // step for S
mov r10, #4
mul r10, r4, r10 // step for B
LoopH:
push {r0, r3}
LoopW:
push {r0, r1}
vmov.i32 q14, #0
mov r11, r6
InitZero:
vst1.32 {q14}, [r2]!
subs r11, r11, #1
bne InitZero
sub r2, r2, r8
mov r12, r5
LoopKStart7:
cmp r12, #7
blt LoopKStart4
push {r3-r7}
LoopK7:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
vld1.32 {d2[0]}, [r1], r10
vld1.32 {d2[1]}, [r1], r10
vld1.32 {d3[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r7
add r3, r1, r7
add r4, r3, r7
add r5, r4, r7
add r6, r5, r7
add r7, r6, r7
LoopLength7:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vld1.32 {q12}, [r5]!
vmla.f32 q8, q12, d2[0]
vld1.32 {q13}, [r6]!
vmla.f32 q9, q13, d2[1]
vld1.32 {q12}, [r7]!
vmla.f32 q8, q12, d3[0]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength7
sub r2, r2, r8
sub r12, r12, #7
add r0, r7, r9
vmov.32 r1, d30[0]
cmp r12, #7
bge LoopK7
pop {r3-r7}
LoopKStart4:
cmp r12, #4
blt LoopKStart3
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK4:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r7
add r3, r1, r7
add r4, r3, r7
LoopLength4:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength4
sub r2, r2, r8
sub r12, r12, #4
add r0, r4, r9
vmov.32 r1, d30[0]
cmp r12, #4
bge LoopK4
vmov.32 r3, d30[1]
vmov.32 r4, d31[0]
LoopKStart3:
cmp r12, #3
blt LoopKStart
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK3:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r7
add r3, r1, r7
LoopLength3:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength3
sub r2, r2, r8
sub r12, r12, #3
add r0, r3, r9
vmov.32 r1, d30[0]
cmp r12, #3
bge LoopK3
vmov.32 r3, d30[1]
vmov.32 r4, d31[0]
LoopKStart:
cmp r12, #0
beq LoopKEnd
LoopK:
vld1.32 {d30[0]}, [r1], r10
vdup.32 q15, d30[0]
mov r11, r6
LoopLength:
vld1.32 {q0}, [r2]
vld1.32 {q1}, [r0]!
vmla.f32 q0, q1, q15
vst1.32 {q0}, [r2]!
subs r11, r11, #1
bne LoopLength
subs r12, r12, #1
sub r2, r2, r8
add r0, r0, r9
bne LoopK
LoopKEnd:
pop {r0, r1}
subs r3, r3, #1
add r0, r0, r8
add r2, r2, r8
bne LoopW
pop {r0, r3}
add r1, r1, #4 //sizeof(float)
subs r4, r4, #1
bne LoopH
pop {r4-r11, pc}
#endif
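
The plain-C reference for this routine (renamed from WinogradMatrixProductLeft further down in this commit) is only partially visible in this diff, so the loop nest below is reconstructed from the assembly above and should be read as an approximation rather than the committed code. Each matrix element is a block of 4 * length floats; B supplies the scalar coefficients, read down a column with a stride of h.

#include <stddef.h>
#include <string.h>

/* Reconstructed scalar sketch (assumption, not the committed reference):
 * S is k*w blocks, B is k*h scalars, M is h*w blocks; each block is 4*length floats. */
void WinogradTransLeftRef(const float *S, const float *B, float *M,
                          size_t w, size_t h, size_t k, size_t length) {
  int unitStep = 4 * (int)length;
  for (size_t y = 0; y < h; ++y) {
    float *dstY = M + y * w * unitStep;
    for (size_t x = 0; x < w; ++x) {
      float *dstX = dstY + x * unitStep;
      const float *srcX = S + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float));
      for (size_t i = 0; i < k; ++i) {
        float b = B[i * h + y];                    /* B read with a stride of h floats */
        const float *srcI = srcX + i * w * unitStep;
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += b * srcI[j];
        }
      }
    }
  }
}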

@@ -0,0 +1,208 @@
#ifdef ENABLE_ARM32
.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif
//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h (5th arg, loaded from the stack)
//r5: k (6th arg, loaded from the stack)
//r6: length (7th arg, loaded from the stack)
WinogradTransRight:
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]
mov r8, #16 // 4 * sizeof(float)
mul r8, r6, r8
mul r9, r5, r8 // step for S
mov r10, #4
mul r10, r4, r10 // step for B
LoopH:
push {r1, r3}
LoopW:
push {r0, r1}
vmov.i32 q14, #0
mov r11, r6
InitZero:
vst1.32 {q14}, [r2]!
subs r11, r11, #1
bne InitZero
sub r2, r2, r8
mov r12, r5
LoopKStart7:
cmp r12, #7
blt LoopKStart4
push {r3-r7}
LoopK7:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
vld1.32 {d2[0]}, [r1], r10
vld1.32 {d2[1]}, [r1], r10
vld1.32 {d3[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r8
add r3, r1, r8
add r4, r3, r8
add r5, r4, r8
add r6, r5, r8
add r7, r6, r8
LoopLength7:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vld1.32 {q12}, [r5]!
vmla.f32 q8, q12, d2[0]
vld1.32 {q13}, [r6]!
vmla.f32 q9, q13, d2[1]
vld1.32 {q12}, [r7]!
vmla.f32 q8, q12, d3[0]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength7
sub r2, r2, r8
sub r12, r12, #7
mov r0, r7
vmov.32 r1, d30[0]
cmp r12, #7
bge LoopK7
pop {r3-r7}
LoopKStart4:
cmp r12, #4
blt LoopKStart3
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK4:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r8
add r3, r1, r8
add r4, r3, r8
LoopLength4:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength4
sub r2, r2, r8
sub r12, r12, #4
mov r0, r4
vmov.32 r1, d30[0]
cmp r12, #4
bge LoopK4
vmov.32 r3, d30[1]
vmov.32 r4, d31[0]
LoopKStart3:
cmp r12, #3
blt LoopKStart
vmov.32 d30[1], r3
LoopK3:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1
add r1, r0, r8
add r3, r1, r8
LoopLength3:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength3
sub r2, r2, r8
sub r12, r12, #3
mov r0, r3
vmov.32 r1, d30[0]
cmp r12, #3
bge LoopK3
vmov.32 r3, d30[1]
LoopKStart:
cmp r12, #0
beq LoopKEnd
LoopK:
vld1.32 {d30[0]}, [r1], r10
vdup.32 q15, d30[0]
mov r11, r6
LoopLength:
vld1.32 {q0}, [r2]
vld1.32 {q1}, [r0]!
vmla.f32 q0, q1, q15
vst1.32 {q0}, [r2]!
subs r11, r11, #1
bne LoopLength
subs r12, r12, #1
sub r2, r2, r8
bne LoopK
LoopKEnd:
pop {r0, r1}
subs r3, r3, #1
add r2, r2, r8
add r1, r1, #4 //sizeof(float)
bne LoopW
pop {r1, r3}
add r0, r0, r9
subs r4, r4, #1
bne LoopH
pop {r4-r11, pc}
#endif
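
WinogradTransRight follows the same pattern with S laid out the other way around: the assembly steps S by k blocks per output row (r9 = k * 4 * length floats, the "step for S") and reads B down column x. The sketch below is again a reconstruction from the assembly, not the committed scalar reference, with a Ref suffix to keep it distinct from the real symbol.

#include <stddef.h>
#include <string.h>

/* Reconstructed scalar sketch (assumption): S is h*k blocks, B is k*h scalars,
 * M is h*w blocks; each block is 4*length floats. Consecutive k-blocks of S are
 * contiguous, so the same source row is reused for every x of an output row. */
void WinogradTransRightRef(const float *S, const float *B, float *M,
                           size_t w, size_t h, size_t k, size_t length) {
  int unitStep = 4 * (int)length;
  for (size_t y = 0; y < h; ++y) {
    float *dstY = M + y * w * unitStep;
    const float *srcY = S + y * k * unitStep;
    for (size_t x = 0; x < w; ++x) {
      float *dstX = dstY + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float));
      for (size_t i = 0; i < k; ++i) {
        float b = B[i * h + x];                    /* B column x, stride h floats */
        const float *srcI = srcY + i * unitStep;
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += b * srcI[j];
        }
      }
    }
  }
}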

File diff suppressed because it is too large

@@ -0,0 +1,147 @@
#ifdef __aarch64__
.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif
WinogradTransLeft:
//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//x0: S
//x1: B
//x2: M
//x3: w
//x4: h
//x5: k
//x6: length
sub sp, sp, #32
stp x19, x20, [sp], #32
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8
mul x9, x3, x8
sub x9, x9, x8
add x7, x9, x8 // step for S
mov x10, #4
mul x10, x4, x10 // step for B
LoopH:
mov x13, x0
mov x15, x3
LoopW:
mov x14, x13
mov x17, x1
dup v30.4s, wzr
mov x11, x6
InitZero:
st1 {v30.4s}, [x2], #16
subs x11, x11, #1
bne InitZero
sub x2, x2, x8
mov x12, x5
LoopKStart4:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x19, x4
LoopK4:
ld1 {v0.s}[0], [x17], x10
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
ld1 {v0.s}[3], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
add x19, x16, x7
LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
add x14, x19, x9
cmp x12, #4
bge LoopK4
LoopKStart3:
cmp x12, #3
blt LoopKStart
mov x16, x15
LoopK3:
ld1 {v0.s}[0], [x17], x10
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
add x14, x16, x9
cmp x12, #3
bge LoopK3
LoopKStart:
cmp x12, #0
beq LKEnd
LoopK:
ld1r {v31.4s}, [x17], x10
mov x11, x6
LoopLength:
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x14], #16
fmla v0.4s, v1.4s, v31.4s
st1 {v0.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength
subs x12, x12, #1
sub x2, x2, x8
add x14, x14, x9
bne LoopK
LKEnd:
subs x15, x15, #1
add x13, x13, x8
add x2, x2, x8
bne LoopW
add x1, x1, #4 //sizeof(float)
subs x4, x4, #1
bne LoopH
sub sp, sp, #32
ldp x19, x20, [sp], #32
ret
#endif

@@ -0,0 +1,144 @@
#ifdef __aarch64__
.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif
WinogradTransRight:
//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//x0: S
//x1: B
//x2: M
//x3: w
//x4: h
//x5: k
//x6: length
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8
mul x9, x5, x8 // step for S
mov x10, #4
mul x10, x4, x10 // step for B
LoopH:
mov x7, x1
mov x15, x3
LoopW:
mov x17, x0
mov x13, x7
dup v30.4s, wzr
mov x11, x6
InitZero:
st1 {v30.4s}, [x2], #16
subs x11, x11, #1
bne InitZero
sub x2, x2, x8
mov x12, x5
LoopKStart4:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x18, x4
LoopK4:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
ld1 {v0.s}[2], [x13], x10
ld1 {v0.s}[3], [x13], x10
mov x11, x6
mov x14, x13
add x14, x17, x8
add x16, x14, x8
add x18, x16, x8
LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x17], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x14], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x18], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x18
cmp x12, #4
bge LoopK4
LoopKStart3:
cmp x12, #3
blt LoopKStart
mov x16, x15
LoopK3:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
ld1 {v0.s}[2], [x13], x10
mov x11, x6
mov x14, x13
add x14, x17, x8
add x16, x14, x8
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x17], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x14], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
mov x17, x18
cmp x12, #3
bge LoopK3
LoopKStart:
cmp x12, #0
beq LoopKEnd
LoopK:
ld1r {v31.4s}, [x13], x10
mov x11, x6
LoopLength:
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x17], #16
fmla v0.4s, v1.4s, v31.4s
st1 {v0.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength
subs x12, x12, #1
sub x2, x2, x8
bne LoopK
LoopKEnd:
subs x15, x15, #1
add x2, x2, x8
add x7, x7, #4 //sizeof(float)
bne LoopW
add x0, x0, x9
subs x4, x4, #1
bne LoopH
ret
#endif

@@ -68,7 +68,8 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi
return;
}
void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
#ifndef ENABLE_ARM
void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float *dstY = M + y * w * unitStep;
@@ -91,7 +92,7 @@ void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t
}
// M = S * B , M = w*h * l, S = k*h * l, B = w*k
void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
void WinogradTransRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float *dstY = M + y * w * unitStep;
@@ -113,6 +114,7 @@ void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t
}
}
}
#endif
union float32_bits {
unsigned int u;

@@ -32,8 +32,8 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi
void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
size_t plane_size, size_t plane_stride, size_t relu_type);
void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradTransRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
float ShortToFloat32(uint16_t src_value);

@@ -130,21 +130,21 @@ void DeConvWgInputPack(float *src_ptr, float *dst_ptr, int channel, int stride)
return;
}
void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, size_t width, size_t weight_depth_offset) {
#ifndef ENABLE_ARM
void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) {
int dx, sz, dz;
int src_depth_step = 4 * width;
for (dz = 0; dz < dst_depth_quad; ++dz) {
float *dst_z = dst + dz * dst_step;
const float *weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset);
for (dx = 0; dx < width; ++dx) {
int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
for (dz = 0; dz < oc4; ++dz) {
float *dst_z = dst + dz * cal_num;
const float *weight_dz = weight + dz * ic4 * 16;
for (dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) {
float *dst_x = dst_z + dx * 4;
dst_x[0] = 0.0f;
dst_x[1] = 0.0f;
dst_x[2] = 0.0f;
dst_x[3] = 0.0f;
const float *src_dx = src + 4 * dx;
for (sz = 0; sz < src_depth_quad; ++sz) {
for (sz = 0; sz < ic4; ++sz) {
const float *src_z = src_dx + sz * src_depth_step;
const float *weight_z = weight_dz + sz * 16;
for (int i = 0; i < 4; ++i) {
@@ -156,12 +156,7 @@ void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size
}
}
}
void MSGemmFloatUnit_4(float *dstOrigin, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, size_t weight_depth_offset) {
MSGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, DECONV_WINOGRAD_DEFAULT_TILE,
weight_depth_offset);
}
#endif
void DeConvWgMerge(const float *src, float *dst, size_t src_stride, size_t dst_stride, size_t count) {
for (int i = 0; i < count; ++i) {
@@ -179,10 +174,10 @@ void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *
int unit_size, int w_start, int h_start, ConvParameter *conv_param, DeConvParam *deconv_param) {
int winograd_plane = unit_size * unit_size;
if (!transfered[unit_size]) {
WinogradMatrixProductLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradMatrixProductRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
transfered[unit_size] = true;
}
@@ -190,14 +185,14 @@ void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *
float *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
float *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
float *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
MSGemmFloatUnit_4(dst, src, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM,
deconv_param->oc_div4_, 0);
TiledC4MatmulFp32(dst, src, weight, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, deconv_param->ic_div4_,
deconv_param->oc_div4_);
}
WinogradMatrixProductLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradMatrixProductRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
// Add to dest
for (int uhi = 0; uhi < unit_size; uhi++) {
@@ -223,7 +218,7 @@ void _deConvCommon(float *tile_in, float *tile_out, float *weight, float *tmp_bu
for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
float *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
MSGemmFloatUnit_4(tmp_buf, src_in, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * 4, count, 0);
TiledC4MatmulFp32(tmp_buf, src_in, weight, DECONV_WINOGRAD_DEFAULT_TILE * 4, deconv_param->ic_div4_, count);
for (int uhi = 0; uhi < h_size; uhi++) {
for (int uwi = 0; uwi < w_size; uwi++) {

@@ -34,6 +34,7 @@ void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_ind
ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);
void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param,
int calculate_count, int tile_index);
void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4);
#ifdef __cplusplus
}

@@ -254,7 +254,7 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvolutionWinogradCPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
}

@@ -258,10 +258,10 @@ int DeConvolutionWinogradCPUKernel::InitDataParam() {
}
/* bias */
auto bias_tensor = in_tensors_.at(kBiasIndex);
bias_data_ = malloc(deconv_param_->oc_up4_ * sizeof(float));
memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float));
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
memcpy(bias_data_, bias_tensor->data_c(), conv_param_->output_channel_ * sizeof(float));
}
return RET_OK;
