diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.c b/mindspore/lite/nnacl/fp32/matmul_fp32.c index 4a4c85ba4e..cc0d492b3b 100644 --- a/mindspore/lite/nnacl/fp32/matmul_fp32.c +++ b/mindspore/lite/nnacl/fp32/matmul_fp32.c @@ -84,136 +84,142 @@ void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col) return; } +#ifdef ENABLE_ARM64 +void RowMajor2Col12Major_arm64(const float *src_c, float *dst_c, size_t col) { + size_t stride = col * sizeof(float); + asm volatile( + "mov x10, %[src_c]\n" + "mov x11, %[dst_c]\n" + + "ld1 {v0.4s}, [x10], %[stride]\n" + "ld1 {v1.4s}, [x10], %[stride]\n" + "ld1 {v2.4s}, [x10], %[stride]\n" + "ld1 {v3.4s}, [x10], %[stride]\n" + + "ld1 {v4.4s}, [x10], %[stride]\n" + "ld1 {v5.4s}, [x10], %[stride]\n" + "ld1 {v6.4s}, [x10], %[stride]\n" + "ld1 {v7.4s}, [x10], %[stride]\n" + + "zip1 v12.4s, v0.4s, v1.4s\n" + "zip2 v13.4s, v0.4s, v1.4s\n" + "zip1 v14.4s, v2.4s, v3.4s\n" + "zip2 v15.4s, v2.4s, v3.4s\n" + + "ld1 {v8.4s}, [x10], %[stride]\n" + "ld1 {v9.4s}, [x10], %[stride]\n" + "ld1 {v10.4s}, [x10], %[stride]\n" + "ld1 {v11.4s}, [x10], %[stride]\n" + + "zip1 v16.4s, v4.4s, v5.4s\n" + "zip2 v17.4s, v4.4s, v5.4s\n" + "zip1 v18.4s, v6.4s, v7.4s\n" + "zip2 v19.4s, v6.4s, v7.4s\n" + + "trn1 v20.2d, v12.2d, v14.2d\n" + "trn2 v23.2d, v12.2d, v14.2d\n" + "trn1 v26.2d, v13.2d, v15.2d\n" + "trn2 v29.2d, v13.2d, v15.2d\n" + + "trn1 v21.2d, v16.2d, v18.2d\n" + "trn2 v24.2d, v16.2d, v18.2d\n" + "trn1 v27.2d, v17.2d, v19.2d\n" + "trn2 v30.2d, v17.2d, v19.2d\n" + + "zip1 v12.4s, v8.4s, v9.4s\n" + "zip2 v13.4s, v8.4s, v9.4s\n" + "zip1 v14.4s, v10.4s, v11.4s\n" + "zip2 v15.4s, v10.4s, v11.4s\n" + + "trn1 v22.2d, v12.2d, v14.2d\n" + "trn2 v25.2d, v12.2d, v14.2d\n" + "trn1 v28.2d, v13.2d, v15.2d\n" + "trn2 v31.2d, v13.2d, v15.2d\n" + + "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n" + "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n" + "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n" + + : + : [ dst_c ] "r"(dst_c), [ src_c ] 
"r"(src_c), [ stride ] "r"(stride) + : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + return; +} +#endif +#ifdef ENABLE_ARM32 +void RowMajor2Col12Major_arm32(const float *src_c, float *dst_c, size_t col) { + size_t stride = col * sizeof(float); + asm volatile( + "mov r10, %[src_c]\n" + "mov r12, %[dst_c]\n" + + "vld1.32 {q0}, [r10], %[stride]\n" + "vld1.32 {q3}, [r10], %[stride]\n" + "vld1.32 {q10}, [r10], %[stride]\n" + "vld1.32 {q13}, [r10], %[stride]\n" + + "vtrn.32 d0, d6\n" + "vtrn.32 d1, d7\n" + "vtrn.32 d20, d26\n" + "vtrn.32 d21, d27\n" + + "vld1.32 {q1}, [r10], %[stride]\n" + "vld1.32 {q8}, [r10], %[stride]\n" + "vld1.32 {q11}, [r10], %[stride]\n" + "vld1.32 {q14}, [r10], %[stride]\n" + + "vswp d1, d20\n" + "vswp d7, d26\n" + + "vld1.32 {q2}, [r10], %[stride]\n" + "vld1.32 {q9}, [r10], %[stride]\n" + "vld1.32 {q12}, [r10], %[stride]\n" + "vld1.32 {q15}, [r10], %[stride]\n" + + "vtrn.32 d2, d16\n" + "vtrn.32 d3, d17\n" + "vtrn.32 d22, d28\n" + "vtrn.32 d23, d29\n" + + "vswp d3, d22\n" + "vswp d17, d28\n" + + "vtrn.32 d4, d18\n" + "vtrn.32 d5, d19\n" + "vtrn.32 d24, d30\n" + "vtrn.32 d25, d31\n" + + "vswp d5, d24\n" + "vswp d19, d30\n" + + "vst1.32 {q0, q1}, [r12]!\n" + "vst1.32 {q2, q3}, [r12]!\n" + "vst1.32 {q8, q9}, [r12]!\n" + "vst1.32 {q10, q11}, [r12]!\n" + "vst1.32 {q12, q13}, [r12]!\n" + "vst1.32 {q14, q15}, [r12]!\n" + + : + : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) + : "r10", "r12", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + return; +} +#endif void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) { - size_t row_up_12 = UP_ROUND(row, C12NUM); - size_t row12 = row / C12NUM * C12NUM; - size_t col4 = col / C4NUM * C4NUM; const float *src_r = src_ptr; float *dst_r = 
dst_ptr; - size_t ri = 0; - for (; ri < row12; ri += C12NUM) { + for (; ri < (row / C12NUM * C12NUM); ri += C12NUM) { size_t ci = 0; - for (; ci < col4; ci += C4NUM) { + for (; ci < (col / C4NUM * C4NUM); ci += C4NUM) { const float *src_c = src_r + ci; float *dst_c = dst_r + ci * C12NUM; - - /* 12x4 row-major to col-major */ #ifdef ENABLE_ARM64 - size_t stride = col * sizeof(float); - asm volatile( - "mov x10, %[src_c]\n" - "mov x11, %[dst_c]\n" - - "ld1 {v0.4s}, [x10], %[stride]\n" - "ld1 {v1.4s}, [x10], %[stride]\n" - "ld1 {v2.4s}, [x10], %[stride]\n" - "ld1 {v3.4s}, [x10], %[stride]\n" - - "ld1 {v4.4s}, [x10], %[stride]\n" - "ld1 {v5.4s}, [x10], %[stride]\n" - "ld1 {v6.4s}, [x10], %[stride]\n" - "ld1 {v7.4s}, [x10], %[stride]\n" - - "zip1 v12.4s, v0.4s, v1.4s\n" - "zip2 v13.4s, v0.4s, v1.4s\n" - "zip1 v14.4s, v2.4s, v3.4s\n" - "zip2 v15.4s, v2.4s, v3.4s\n" - - "ld1 {v8.4s}, [x10], %[stride]\n" - "ld1 {v9.4s}, [x10], %[stride]\n" - "ld1 {v10.4s}, [x10], %[stride]\n" - "ld1 {v11.4s}, [x10], %[stride]\n" - - "zip1 v16.4s, v4.4s, v5.4s\n" - "zip2 v17.4s, v4.4s, v5.4s\n" - "zip1 v18.4s, v6.4s, v7.4s\n" - "zip2 v19.4s, v6.4s, v7.4s\n" - - "trn1 v20.2d, v12.2d, v14.2d\n" - "trn2 v23.2d, v12.2d, v14.2d\n" - "trn1 v26.2d, v13.2d, v15.2d\n" - "trn2 v29.2d, v13.2d, v15.2d\n" - - "trn1 v21.2d, v16.2d, v18.2d\n" - "trn2 v24.2d, v16.2d, v18.2d\n" - "trn1 v27.2d, v17.2d, v19.2d\n" - "trn2 v30.2d, v17.2d, v19.2d\n" - - "zip1 v12.4s, v8.4s, v9.4s\n" - "zip2 v13.4s, v8.4s, v9.4s\n" - "zip1 v14.4s, v10.4s, v11.4s\n" - "zip2 v15.4s, v10.4s, v11.4s\n" - - "trn1 v22.2d, v12.2d, v14.2d\n" - "trn2 v25.2d, v12.2d, v14.2d\n" - "trn1 v28.2d, v13.2d, v15.2d\n" - "trn2 v31.2d, v13.2d, v15.2d\n" - - "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n" - "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n" - "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n" - - : - : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) - : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", 
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31"); + RowMajor2Col12Major_arm64(src_c, dst_c, col); #elif ENABLE_ARM32 - size_t stride = col * sizeof(float); - asm volatile( - "mov r10, %[src_c]\n" - "mov r12, %[dst_c]\n" - - "vld1.32 {q0}, [r10], %[stride]\n" - "vld1.32 {q3}, [r10], %[stride]\n" - "vld1.32 {q10}, [r10], %[stride]\n" - "vld1.32 {q13}, [r10], %[stride]\n" - - "vtrn.32 d0, d6\n" - "vtrn.32 d1, d7\n" - "vtrn.32 d20, d26\n" - "vtrn.32 d21, d27\n" - - "vld1.32 {q1}, [r10], %[stride]\n" - "vld1.32 {q8}, [r10], %[stride]\n" - "vld1.32 {q11}, [r10], %[stride]\n" - "vld1.32 {q14}, [r10], %[stride]\n" - - "vswp d1, d20\n" - "vswp d7, d26\n" - - "vld1.32 {q2}, [r10], %[stride]\n" - "vld1.32 {q9}, [r10], %[stride]\n" - "vld1.32 {q12}, [r10], %[stride]\n" - "vld1.32 {q15}, [r10], %[stride]\n" - - "vtrn.32 d2, d16\n" - "vtrn.32 d3, d17\n" - "vtrn.32 d22, d28\n" - "vtrn.32 d23, d29\n" - - "vswp d3, d22\n" - "vswp d17, d28\n" - - "vtrn.32 d4, d18\n" - "vtrn.32 d5, d19\n" - "vtrn.32 d24, d30\n" - "vtrn.32 d25, d31\n" - - "vswp d5, d24\n" - "vswp d19, d30\n" - - "vst1.32 {q0, q1}, [r12]!\n" - "vst1.32 {q2, q3}, [r12]!\n" - "vst1.32 {q8, q9}, [r12]!\n" - "vst1.32 {q10, q11}, [r12]!\n" - "vst1.32 {q12, q13}, [r12]!\n" - "vst1.32 {q14, q15}, [r12]!\n" - - : - : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) - : "r10", "r12", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + RowMajor2Col12Major_arm32(src_c, dst_c, col); #elif ENABLE_SSE __m128 src1 = _mm_loadu_ps(src_c); __m128 src2 = _mm_loadu_ps(src_c + col); @@ -288,24 +294,145 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_ src_r += C12NUM * col; dst_r += C12NUM * col; } - - for (; ri < row; ri++) { + for (; ri < row; ri++, dst_r++, src_r += col) { for (size_t i = 0; i < col; i++) { dst_r[i * C12NUM] = 
src_r[i]; } - src_r += col; - dst_r += 1; } - - for (; ri < row_up_12; ri++) { + for (; ri < UP_ROUND(row, C12NUM); ri++, dst_r++) { for (size_t i = 0; i < col; i++) { dst_r[i * C12NUM] = 0; } - dst_r += 1; } return; } +#ifdef ENABLE_ARM64 +void RowMajor2Col8Major_arm64(const float *src_c, float *dst_c, size_t col) { + size_t stride = col * sizeof(float); + asm volatile( + "mov x10, %[src_c]\n" + "mov x11, %[dst_c]\n" + + "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n" + "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n" + "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n" + "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n" + + "zip1 v8.4s, v0.4s, v2.4s\n" + "zip2 v9.4s, v0.4s, v2.4s\n" + "zip1 v10.4s, v4.4s, v6.4s\n" + "zip2 v11.4s, v4.4s, v6.4s\n" + + "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n" + "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n" + "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n" + "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n" + + "zip1 v12.4s, v1.4s, v3.4s\n" + "zip2 v13.4s, v1.4s, v3.4s\n" + "zip1 v14.4s, v5.4s, v7.4s\n" + "zip2 v15.4s, v5.4s, v7.4s\n" + + "trn1 v0.2d, v8.2d, v10.2d\n" + "trn2 v1.2d, v8.2d, v10.2d\n" + "trn1 v2.2d, v9.2d, v11.2d\n" + "trn2 v3.2d, v9.2d, v11.2d\n" + + "zip1 v24.4s, v16.4s, v18.4s\n" + "zip2 v25.4s, v16.4s, v18.4s\n" + "zip1 v26.4s, v20.4s, v22.4s\n" + "zip2 v27.4s, v20.4s, v22.4s\n" + + "trn1 v4.2d, v12.2d, v14.2d\n" + "trn2 v5.2d, v12.2d, v14.2d\n" + "trn1 v6.2d, v13.2d, v15.2d\n" + "trn2 v7.2d, v13.2d, v15.2d\n" + + "zip1 v28.4s, v17.4s, v19.4s\n" + "zip2 v29.4s, v17.4s, v19.4s\n" + "zip1 v30.4s, v21.4s, v23.4s\n" + "zip2 v31.4s, v21.4s, v23.4s\n" + + "trn1 v16.2d, v24.2d, v26.2d\n" + "trn2 v17.2d, v24.2d, v26.2d\n" + "trn1 v18.2d, v25.2d, v27.2d\n" + "trn2 v19.2d, v25.2d, v27.2d\n" + + "trn1 v20.2d, v28.2d, v30.2d\n" + "trn2 v21.2d, v28.2d, v30.2d\n" + "trn1 v22.2d, v29.2d, v31.2d\n" + "trn2 v23.2d, v29.2d, v31.2d\n" + + "st1 {v0.4s}, [x11], #16\n" + "st1 {v16.4s}, [x11], #16\n" + "st1 {v1.4s}, [x11], #16\n" + "st1 {v17.4s}, [x11], #16\n" + "st1 {v2.4s}, [x11], 
#16\n" + "st1 {v18.4s}, [x11], #16\n" + "st1 {v3.4s}, [x11], #16\n" + "st1 {v19.4s}, [x11], #16\n" + "st1 {v4.4s}, [x11], #16\n" + "st1 {v20.4s}, [x11], #16\n" + "st1 {v5.4s}, [x11], #16\n" + "st1 {v21.4s}, [x11], #16\n" + "st1 {v6.4s}, [x11], #16\n" + "st1 {v22.4s}, [x11], #16\n" + "st1 {v7.4s}, [x11], #16\n" + "st1 {v23.4s}, [x11], #16\n" + + : + : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) + : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + return; +} +#endif +#ifdef ENABLE_ARM32 +void RowMajor2Col8Major_arm32(const float *src_c, float *dst_c, size_t col) { + size_t stride = col * sizeof(float); + asm volatile( + "mov r10, %[src_c]\n" + "mov r11, %[dst_c]\n" + + "vld1.32 {q0}, [r10], %[stride]\n" + "vld1.32 {q2}, [r10], %[stride]\n" + "vld1.32 {q4}, [r10], %[stride]\n" + "vld1.32 {q6}, [r10], %[stride]\n" + + "vtrn.32 d0, d4\n" + "vtrn.32 d1, d5\n" + "vtrn.32 d8, d12\n" + "vtrn.32 d9, d13\n" + + "vld1.32 {q1}, [r10], %[stride]\n" + "vld1.32 {q3}, [r10], %[stride]\n" + "vld1.32 {q5}, [r10], %[stride]\n" + "vld1.32 {q7}, [r10], %[stride]\n" + + "vswp d1, d8\n" + "vswp d5, d12\n" + + "vtrn.32 d2, d6\n" + "vtrn.32 d3, d7\n" + "vtrn.32 d10, d14\n" + "vtrn.32 d11, d15\n" + + "vswp d3, d10\n" + "vswp d7, d14\n" + + "vst1.32 {q0, q1}, [r11]!\n" + "vst1.32 {q2, q3}, [r11]!\n" + "vst1.32 {q4, q5}, [r11]!\n" + "vst1.32 {q6, q7}, [r11]!\n" + + : + : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) + : "r10", "r11", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + return; +} +#endif void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) { size_t row8 = row / C8NUM * C8NUM; #ifdef ENABLE_ARM64 @@ -326,127 +453,10 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t float *dst_c = dst_r + 
ci * C8NUM; #ifdef ENABLE_ARM64 - /* 8x8 row-major to col-major */ - size_t stride = col * sizeof(float); - asm volatile( - "mov x10, %[src_c]\n" - "mov x11, %[dst_c]\n" - - "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n" - "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n" - "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n" - "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n" - - "zip1 v8.4s, v0.4s, v2.4s\n" - "zip2 v9.4s, v0.4s, v2.4s\n" - "zip1 v10.4s, v4.4s, v6.4s\n" - "zip2 v11.4s, v4.4s, v6.4s\n" - - "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n" - "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n" - "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n" - "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n" - - "zip1 v12.4s, v1.4s, v3.4s\n" - "zip2 v13.4s, v1.4s, v3.4s\n" - "zip1 v14.4s, v5.4s, v7.4s\n" - "zip2 v15.4s, v5.4s, v7.4s\n" - - "trn1 v0.2d, v8.2d, v10.2d\n" - "trn2 v1.2d, v8.2d, v10.2d\n" - "trn1 v2.2d, v9.2d, v11.2d\n" - "trn2 v3.2d, v9.2d, v11.2d\n" - - "zip1 v24.4s, v16.4s, v18.4s\n" - "zip2 v25.4s, v16.4s, v18.4s\n" - "zip1 v26.4s, v20.4s, v22.4s\n" - "zip2 v27.4s, v20.4s, v22.4s\n" - - "trn1 v4.2d, v12.2d, v14.2d\n" - "trn2 v5.2d, v12.2d, v14.2d\n" - "trn1 v6.2d, v13.2d, v15.2d\n" - "trn2 v7.2d, v13.2d, v15.2d\n" - - "zip1 v28.4s, v17.4s, v19.4s\n" - "zip2 v29.4s, v17.4s, v19.4s\n" - "zip1 v30.4s, v21.4s, v23.4s\n" - "zip2 v31.4s, v21.4s, v23.4s\n" - - "trn1 v16.2d, v24.2d, v26.2d\n" - "trn2 v17.2d, v24.2d, v26.2d\n" - "trn1 v18.2d, v25.2d, v27.2d\n" - "trn2 v19.2d, v25.2d, v27.2d\n" - - "trn1 v20.2d, v28.2d, v30.2d\n" - "trn2 v21.2d, v28.2d, v30.2d\n" - "trn1 v22.2d, v29.2d, v31.2d\n" - "trn2 v23.2d, v29.2d, v31.2d\n" - - "st1 {v0.4s}, [x11], #16\n" - "st1 {v16.4s}, [x11], #16\n" - "st1 {v1.4s}, [x11], #16\n" - "st1 {v17.4s}, [x11], #16\n" - "st1 {v2.4s}, [x11], #16\n" - "st1 {v18.4s}, [x11], #16\n" - "st1 {v3.4s}, [x11], #16\n" - "st1 {v19.4s}, [x11], #16\n" - "st1 {v4.4s}, [x11], #16\n" - "st1 {v20.4s}, [x11], #16\n" - "st1 {v5.4s}, [x11], #16\n" - "st1 {v21.4s}, [x11], #16\n" - "st1 {v6.4s}, [x11], #16\n" - 
"st1 {v22.4s}, [x11], #16\n" - "st1 {v7.4s}, [x11], #16\n" - "st1 {v23.4s}, [x11], #16\n" - - : - : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) - : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31"); + RowMajor2Col8Major_arm64(src_c, dst_c, col); #elif ENABLE_ARM32 - /* 8x4 row-major to col-major */ - size_t stride = col * sizeof(float); - asm volatile( - "mov r10, %[src_c]\n" - "mov r11, %[dst_c]\n" - - "vld1.32 {q0}, [r10], %[stride]\n" - "vld1.32 {q2}, [r10], %[stride]\n" - "vld1.32 {q4}, [r10], %[stride]\n" - "vld1.32 {q6}, [r10], %[stride]\n" - - "vtrn.32 d0, d4\n" - "vtrn.32 d1, d5\n" - "vtrn.32 d8, d12\n" - "vtrn.32 d9, d13\n" - - "vld1.32 {q1}, [r10], %[stride]\n" - "vld1.32 {q3}, [r10], %[stride]\n" - "vld1.32 {q5}, [r10], %[stride]\n" - "vld1.32 {q7}, [r10], %[stride]\n" - - "vswp d1, d8\n" - "vswp d5, d12\n" - - "vtrn.32 d2, d6\n" - "vtrn.32 d3, d7\n" - "vtrn.32 d10, d14\n" - "vtrn.32 d11, d15\n" - - "vswp d3, d10\n" - "vswp d7, d14\n" - - "vst1.32 {q0, q1}, [r11]!\n" - "vst1.32 {q2, q3}, [r11]!\n" - "vst1.32 {q4, q5}, [r11]!\n" - "vst1.32 {q6, q7}, [r11]!\n" - - : - : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) - : "r10", "r11", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + RowMajor2Col8Major_arm32(src_c, dst_c, col); #elif ENABLE_SSE - /* 8x4 row-major to col-major */ __m128 src1 = _mm_loadu_ps(src_c); __m128 src2 = _mm_loadu_ps(src_c + col); __m128 src3 = _mm_loadu_ps(src_c + 2 * col); @@ -492,12 +502,16 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t src_r += C8NUM * col; dst_r += C8NUM * col; } - for (; ri < row; ri++) { + for (; ri < row; ri++, src_r += col, dst_r++) { for (size_t i = 0; i < col; i++) { dst_r[i * C8NUM] = src_r[i]; } - src_r += col; - dst_r += 1; + } + + for (; ri < 
UP_ROUND(row, C8NUM); ri++, dst_r++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C8NUM] = 0; + } } return; } @@ -538,6 +552,14 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_ src_r += col; dst_r += 1; } + + size_t total_row = UP_ROUND(row, C16NUM); + for (; ri < total_row; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C16NUM] = 0; + } + dst_r += 1; + } return; } @@ -555,7 +577,6 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t const float *src_c = src_r + ci; float *dst_c = dst_r + ci * C6NUM; - /* 6x8 row-major to col-major */ #ifdef ENABLE_AVX __m256 src0 = _mm256_loadu_ps(src_c); __m256 src1 = _mm256_loadu_ps(src_c + col); @@ -642,19 +663,19 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t } void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) { - size_t row8 = row / C4NUM * C4NUM; + size_t total_row = UP_ROUND(row, C4NUM); + size_t row4 = row / C4NUM * C4NUM; size_t col4 = col / C4NUM * C4NUM; const float *src_r = src_ptr; float *dst_r = dst_ptr; size_t ri = 0; - for (; ri < row8; ri += C4NUM) { + for (; ri < row4; ri += C4NUM) { size_t ci = 0; for (; ci < col4; ci += C4NUM) { const float *src_c = src_r + ci; float *dst_c = dst_r + ci * C4NUM; - /* 4x4 row-major to col-major */ #ifdef ENABLE_ARM32 size_t stride = col * 4; asm volatile( @@ -727,9 +748,31 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t src_r += col; dst_r += 1; } + + for (; ri < total_row; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C4NUM] = 0; + } + dst_r += 1; + } return; } +#ifndef ENABLE_ARM +void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col) { + for (int ci = 0; ci < col; ci++) { + float value = 0; + for (int di = 0; di < depth; di++) { + value += a[di] * b[ci * depth + di]; + } + if (bias != NULL) value += bias[ci]; + if 
(act_type == ActType_Relu6) value = MSMIN(6.0f, value); + if (act_type == ActType_Relu || act_type == ActType_Relu6) value = MSMAX(0.0f, value); + c[ci] = value; + } + return; +} +#endif void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, int col, int stride, int out_type) { if (out_type == OutType_Nhwc) { @@ -744,9 +787,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A size_t bi = c8div * deep * 8 + d * 8 + c8mod; value = value + a[ai] * b[bi]; } - if (bias != NULL) value += bias[c]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); + ADD_BIAS(value, bias, c) + DO_RELU(value, act_type) + DO_RELU6(value, act_type) dst[ci] = value; } } @@ -764,9 +807,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod; value = value + a[ai] * b[bi]; } - if (bias != NULL) value += bias[c]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); + ADD_BIAS(value, bias, c) + DO_RELU(value, act_type) + DO_RELU6(value, act_type) dst[ci] = value; } } @@ -783,79 +826,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A size_t bi = c8div * deep * 8 + d * 8 + c8mod; value = value + a[ai] * b[bi]; } - if (bias != NULL) value += bias[j]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); - dst[ci] = value; - } - } - } - return; -} - -void MatMul6x16(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, - int col, int stride, int out_type) { - if (out_type == OutType_Nhwc) { - for (int r = 0; r < row; r++) { - for (int c = 0; c < col; c++) { - int r6div = r / C6NUM, r6mod = r % C6NUM; - int c16div = c / C16NUM, c16mod = c % C16NUM; - 
size_t ci = r * stride + c; - float value = 0; - for (int d = 0; d < deep; d++) { - size_t ai = r6div * deep * C6NUM + d * C6NUM + r6mod; - size_t bi = c16div * deep * C16NUM + d * C16NUM + c16mod; - value = value + a[ai] * b[bi]; - } - if (bias != NULL) value += bias[c]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); - dst[ci] = value; - } - } - } else { - for (int i = 0; i < row; ++i) { - int dst_r_offset = i * col * stride; - int r6div = i / C6NUM, r6mod = i % C6NUM; - for (int j = 0; j < col; ++j) { - int b16div = j / C16NUM, b16mod = j % C16NUM; - int c8div = j / C8NUM, c8mod = j % C8NUM; - size_t ci = dst_r_offset + c8div * C8NUM * stride + c8mod; - float value = 0; - for (int d = 0; d < deep; ++d) { - size_t ai = r6div * deep * C6NUM + d * C6NUM + r6mod; - size_t bi = b16div * deep * C16NUM + d * C16NUM + b16mod; - value = value + a[ai] * b[bi]; - } - if (bias != NULL) value += bias[j]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); - dst[ci] = value; - } - } - } - return; -} - -void MatMul4x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, - int col, int stride, int out_type) { - if (out_type == OutType_C8) { - int col_8 = UP_ROUND(col, C8NUM); - int row_4 = UP_ROUND(row, C4NUM); - for (int r = 0; r < row_4; r++) { - for (int c = 0; c < col_8; c++) { - int r4div = r / C4NUM, r4mod = r % C4NUM; - int c8div = c / C8NUM, c8mod = c % C8NUM; - size_t ci = (c8div * C8NUM * row_4 + r * C8NUM + c8mod); - float value = 0; - for (int d = 0; d < deep; d++) { - size_t ai = r4div * deep * C4NUM + d * C4NUM + r4mod; - size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod; - value = value + a[ai] * b[bi]; - } - if (bias != NULL) value += bias[c]; - if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); - if (act_type != ActType_No) value = MSMAX(0.0f, value); + ADD_BIAS(value, 
bias, j) + DO_RELU(value, act_type) + DO_RELU6(value, act_type) dst[ci] = value; } } @@ -895,44 +868,3 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type); #endif } - -void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) { -#ifdef ENABLE_ARM - MatVecMulFp32(a, b, c, bias, (int)act_type, depth, col); -#endif -} - -#ifdef ENABLE_NNACL_INFER_SHAPE -static void SwapDims(int *dims, int index1, int index2) { - int tmp = dims[index1]; - dims[index1] = dims[index2]; - dims[index2] = tmp; -} - -int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format, - int *in_datatype, int *out_datatype, OpParameter *param) { - *out_datatype = in_datatype[0]; - *out_format = in_format[0]; - if (dim_size[0] < 2 || dim_size[1] < 2) { - return NNACL_PARAM_INVALID; - } - - for (int i = 0; i < dim_size[0] - 2; ++i) { - if (in_shape[0][i] != in_shape[1][i]) { - return NNACL_PARAM_INVALID; - } - } - MatMulParameter *matmul_param = (MatMulParameter *)param; - if (matmul_param->a_transpose_) { - SwapDims(in_shape[0], dim_size[0] - 1, dim_size[0] - 2); - } - if (matmul_param->b_transpose_) { - SwapDims(in_shape[1], dim_size[1] - 1, dim_size[1] - 2); - } - for (int i = 0; i < dim_size[0] - 1; ++i) { - out_shape[i] = in_shape[0][i]; - } - out_shape[dim_size[0] - 1] = in_shape[1][dim_size[1] - 1]; - return NNACL_OK; -} -#endif diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.h b/mindspore/lite/nnacl/fp32/matmul_fp32.h index b2864b7921..4a629eeadd 100644 --- a/mindspore/lite/nnacl/fp32/matmul_fp32.h +++ b/mindspore/lite/nnacl/fp32/matmul_fp32.h @@ -23,12 +23,23 @@ #include "nnacl/matmul_parameter.h" #include "nnacl/op_base.h" +#define ADD_BIAS(value, bias, c) \ + if (bias != NULL) value = value + bias[c]; + +#define DO_RELU(value, act_type) \ + if (act_type == ActType_Relu) value = 
MSMAX(0.0f, value); + +#define DO_RELU6(value, act_type) \ + if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); \ + if (act_type == ActType_Relu6) value = MSMAX(0.0f, value); + #ifdef __cplusplus extern "C" { #endif void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, int col, size_t stride, int out_type); -void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col); +void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col); + void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col); void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col); @@ -40,9 +51,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col); void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col); void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col); -#ifdef ENABLE_ARM -void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col); -#endif + #ifdef ENABLE_ARM64 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino); @@ -67,10 +76,6 @@ void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bi #endif #endif -#ifdef ENABLE_NNACL_INFER_SHAPE -int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format, - int *in_datatype, int *out_datatype, OpParameter *param); -#endif #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/matmul_parameter.h 
b/mindspore/lite/nnacl/matmul_parameter.h index ad4ca3ea87..9681080fef 100644 --- a/mindspore/lite/nnacl/matmul_parameter.h +++ b/mindspore/lite/nnacl/matmul_parameter.h @@ -44,14 +44,11 @@ typedef struct MatMulParameter { int col_; int row_4_; int row_6_; - int row_8_; int row_12_; int row_16_; int row_align_; - int col_2_; int col_4_; int col_8_; - int col_16_; int col_align_; int deep_; int deep_4_; @@ -61,8 +58,6 @@ typedef struct MatMulParameter { bool b_transpose_; /* true : col-major */ bool a_const_; bool b_const_; - bool a_init_shape_; - bool b_init_shape_; ActType act_type_; } MatMulParameter; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc index b2a76cec3d..badde5e6d3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc @@ -62,11 +62,8 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() { matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; matmul_param_->col_ = conv_param_->output_channel_; matmul_param_->deep_ = conv_param_->input_channel_; - matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM); - matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM); - matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM); matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_); - matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); + matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_); matmul_param_->act_type_ = conv_param_->act_type_; return; } @@ -76,20 +73,6 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { auto input_channel = filter_tensor->Channel(); auto output_channel = filter_tensor->Batch(); -#ifdef ENABLE_AVX - row_tile_ = C6NUM; - col_tile_ = C16NUM; -#elif defined(ENABLE_SSE) - row_tile_ = C4NUM; - col_tile_ = C8NUM; -#elif defined(ENABLE_ARM32) - row_tile_ = 
C12NUM; - col_tile_ = C4NUM; -#else - row_tile_ = C12NUM; - col_tile_ = C8NUM; -#endif - if (in_tensors_.size() == 3) { int size = UP_ROUND(output_channel, col_tile_) * sizeof(float); int weight_size = output_channel * sizeof(float); @@ -146,6 +129,19 @@ int Convolution1x1CPUKernel::InitConv1x1Param() { } int Convolution1x1CPUKernel::Init() { +#ifdef ENABLE_AVX + row_tile_ = C6NUM; + col_tile_ = C16NUM; +#elif defined(ENABLE_SSE) + row_tile_ = C4NUM; + col_tile_ = C8NUM; +#elif defined(ENABLE_ARM32) + row_tile_ = C12NUM; + col_tile_ = C4NUM; +#else + row_tile_ = C12NUM; + col_tile_ = C8NUM; +#endif matmul_param_ = new (std::nothrow) MatMulParameter; if (matmul_param_ == nullptr) { MS_LOG(ERROR) << "Memory allocation failed"; @@ -270,20 +266,6 @@ void Convolution1x1CPUKernel::PackWeight() { auto input_channel = filter_tensor->Channel(); auto output_channel = filter_tensor->Batch(); -#ifdef ENABLE_AVX - row_tile_ = C6NUM; - col_tile_ = C16NUM; -#elif defined(ENABLE_SSE) - row_tile_ = C4NUM; - col_tile_ = C8NUM; -#elif defined(ENABLE_ARM32) - row_tile_ = C12NUM; - col_tile_ = C4NUM; -#else - row_tile_ = C12NUM; - col_tile_ = C8NUM; -#endif - int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float); int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float); memset(reinterpret_cast(weight_ptr_) + down_size, 0, size - down_size); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc index e460d81271..5d6fa3f621 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc @@ -21,228 +21,56 @@ using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_INVALID_OP_ATTR; -using mindspore::lite::RET_MEMORY_FAILED; -using mindspore::lite::RET_NULL_PTR; 
using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_FullConnection; namespace mindspore::kernel { -FullconnectionCPUKernel::~FullconnectionCPUKernel() { - FreeBuf(); - return; -} - -void FullconnectionCPUKernel::FreeBuf() { - if (a_pack_ptr_ != nullptr) { - free(a_pack_ptr_); - a_pack_ptr_ = nullptr; - } - if (b_pack_ptr_ != nullptr) { - free(b_pack_ptr_); - b_pack_ptr_ = nullptr; - } - if (bias_ptr_ != nullptr) { - free(bias_ptr_); - bias_ptr_ = nullptr; - } -} - -int FullconnectionCPUKernel::ReSize() { - FreeBuf(); - int row = 1; - for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) { - row *= (out_tensors_.at(0)->shape())[i]; - } - fc_param_->row_ = row; - fc_param_->col_ = out_tensors_.at(0)->shape().back(); - fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1); - -#ifdef ENABLE_AVX - int col_tile = C16NUM; -#elif defined(ENABLE_ARM32) - int col_tile = C4NUM; -#else - int col_tile = C8NUM; -#endif - fc_param_->row_12_ = UP_ROUND(fc_param_->row_, C12NUM); - fc_param_->col_align_ = UP_ROUND(fc_param_->col_, col_tile); - fc_param_->row_6_ = UP_ROUND(fc_param_->row_, C6NUM); - fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM); - - thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_align_, col_tile)); - thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_align_, col_tile), thread_count_); +int FullconnectionCPUKernel::Init() { + MatmulFp32BaseCPUKernel::InitParameter(); -#ifdef ENABLE_ARM - if (fc_param_->row_ == 1) { - is_vector_input_ = true; - } else { - is_vector_input_ = false; - } -#endif - if (in_tensors_.size() == 3) { - int col_tmp = is_vector_input_ ? 
fc_param_->col_ : fc_param_->col_align_; - bias_ptr_ = reinterpret_cast(malloc(col_tmp * sizeof(float))); - if (bias_ptr_ == nullptr) { - MS_LOG(ERROR) << "malloc bias_ptr_ failed"; - return RET_ERROR; - } - memcpy(bias_ptr_, in_tensors_[2]->MutableData(), fc_param_->col_ * sizeof(float)); + if (params_->a_const_ == true) { + auto a_shape = in_tensors_.at(0)->shape(); + params_->row_ = a_shape[0]; + params_->deep_ = a_shape[1]; } -#ifdef ENABLE_AVX - int row_tmp = is_vector_input_ ? 1 : fc_param_->row_6_; -#elif defined(ENABLE_SSE) - int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_; -#else - int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_; -#endif - a_pack_ptr_ = reinterpret_cast(malloc(row_tmp * fc_param_->deep_ * sizeof(float))); - if (a_pack_ptr_ == nullptr) { - return RET_MEMORY_FAILED; + if (params_->b_const_ == true) { + auto b_shape = in_tensors_.at(1)->shape(); + params_->col_ = b_shape[0]; + params_->deep_ = b_shape[1]; } - memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float)); - int col_tmp = is_vector_input_ ? 
fc_param_->col_ : fc_param_->col_align_; - b_pack_ptr_ = reinterpret_cast(malloc(col_tmp * fc_param_->deep_ * sizeof(float))); - if (b_pack_ptr_ == nullptr) { - FreeBuf(); - return RET_MEMORY_FAILED; - } - memset(b_pack_ptr_, 0, col_tmp * fc_param_->deep_ * sizeof(float)); + params_->batch = 1; + params_->a_transpose_ = false; + params_->b_transpose_ = true; - fc_param_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr); - fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); - if (fc_param_->a_const_) { - InitMatrixA(reinterpret_cast(in_tensors_.at(0)->MutableData()), a_pack_ptr_); - a_ptr_ = a_pack_ptr_; - } - if (fc_param_->b_const_) { - InitMatrixB(reinterpret_cast(in_tensors_.at(1)->MutableData()), b_pack_ptr_); - b_ptr_ = b_pack_ptr_; + auto ret = MatmulFp32BaseCPUKernel::Init(); + if (ret != RET_OK) { + return ret; } - return RET_OK; -} -int FullconnectionCPUKernel::Init() { if (!InferShapeDone()) { return RET_OK; } return ReSize(); } -void FullconnectionCPUKernel::InitMatrixA(const float *src_ptr, float *dst_ptr) { - if (is_vector_input_) { - memcpy(dst_ptr, src_ptr, fc_param_->deep_ * sizeof(float)); - return; - } - -#ifdef ENABLE_AVX - RowMajor2Col6Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); -#elif defined(ENABLE_SSE) - RowMajor2Col4Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); -#else - RowMajor2Col12Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); -#endif -} - -void FullconnectionCPUKernel::InitMatrixB(const float *src_ptr, float *dst_ptr) { - if (is_vector_input_) { - memcpy(dst_ptr, src_ptr, fc_param_->col_ * fc_param_->deep_ * sizeof(float)); - return; - } -#ifdef ENABLE_AVX - RowMajor2Col16Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); -#elif defined(ENABLE_ARM32) - RowMajor2Col4Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); -#else - RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); -#endif -} - -int FcFp32MatmulRun(void *cdata, int 
task_id) { - auto fc = reinterpret_cast(cdata); - auto error_code = fc->DoMatmul(task_id); - if (error_code != RET_OK) { - MS_LOG(ERROR) << "FcFp32MatmulRun error task_id[" << task_id << "] error_code[" << error_code << "]"; - return RET_ERROR; - } - return RET_OK; -} - -int FullconnectionCPUKernel::DoMatmul(int task_id) { -#ifdef ENABLE_AVX - int col_tile = C16NUM; -#elif defined(ENABLE_ARM32) - int col_tile = C4NUM; -#else - int col_tile = C8NUM; -#endif - int cur_oc = MSMIN(thread_stride_ * col_tile, fc_param_->col_ - task_id * thread_stride_ * col_tile); - if (cur_oc <= 0) { - return RET_OK; - } - - auto b = b_ptr_ + task_id * thread_stride_ * col_tile * fc_param_->deep_; - auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * col_tile; - auto c = c_ptr_ + task_id * thread_stride_ * col_tile; - if (is_vector_input_) { - MatVecMul(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc); - } else { - MatMulOpt(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, - OutType_Nhwc); +int FullconnectionCPUKernel::ReSize() { + int row = 1; + for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) { + row *= (out_tensors_.at(0)->shape())[i]; } + params_->row_ = row; + params_->col_ = out_tensors_.at(0)->shape().back(); + params_->deep_ = (in_tensors_.at(1)->shape()).at(1); - return RET_OK; + return MatmulFp32BaseCPUKernel::ReSize(); } int FullconnectionCPUKernel::Run() { - auto a_ptr = reinterpret_cast(in_tensors_.at(0)->data_c()); - auto b_ptr = reinterpret_cast(in_tensors_.at(1)->data_c()); - c_ptr_ = reinterpret_cast(out_tensors_.at(0)->data_c()); - - if (!fc_param_->a_const_) { - if (is_vector_input_) { - a_ptr_ = a_ptr; - } else { - InitMatrixA(a_ptr, a_pack_ptr_); - a_ptr_ = a_pack_ptr_; - } - } - if (!fc_param_->b_const_) { - if (is_vector_input_) { - b_ptr_ = b_ptr; - } else { - InitMatrixB(b_ptr, b_pack_ptr_); - b_ptr_ = b_pack_ptr_; - } - } - 
ParallelLaunch(this->context_->thread_pool_, FcFp32MatmulRun, this, thread_count_); - + MatmulFp32BaseCPUKernel::Run(); return RET_OK; } -kernel::LiteKernel *CpuFullConnectionFp32KernelCreator(const std::vector &inputs, - const std::vector &outputs, - OpParameter *opParameter, const lite::InnerContext *ctx, - const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - MS_ASSERT(opParameter != nullptr); - MS_ASSERT(desc.type == schema::PrimitiveType_FullConnection); - auto kernel = new (std::nothrow) FullconnectionCPUKernel(opParameter, inputs, outputs, ctx, primitive); - if (!kernel) { - MS_LOG(ERROR) << "kernel is nullptr."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " - << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); - delete kernel; - return nullptr; - } - return kernel; -} -REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, CpuFullConnectionFp32KernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h index c4ef67b33f..450efff2fb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h @@ -21,43 +21,19 @@ #include "include/context.h" #include "include/errorcode.h" #include "nnacl/fp32/matmul_fp32.h" -#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h" -using mindspore::lite::InnerContext; namespace mindspore::kernel { -class FullconnectionCPUKernel : public LiteKernel { +class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel { public: FullconnectionCPUKernel(OpParameter *parameter, const std::vector &inputs, - const 
std::vector &outputs, const InnerContext *ctx, + const std::vector &outputs, const mindspore::lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive) { - fc_param_ = reinterpret_cast(op_parameter_); - } - ~FullconnectionCPUKernel() override; - + : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~FullconnectionCPUKernel() = default; int Init() override; int ReSize() override; int Run() override; - - public: - int DoMatmul(int task_id); - void FreeBuf(); - - private: - void InitMatrixA(const float *src_ptr, float *dst_ptr); - void InitMatrixB(const float *src_ptr, float *dst_ptr); - - private: - MatMulParameter *fc_param_ = nullptr; - float *a_pack_ptr_ = nullptr; - float *b_pack_ptr_ = nullptr; - float *c_ptr_ = nullptr; - float *bias_ptr_ = nullptr; - float *a_ptr_ = nullptr; - float *b_ptr_ = nullptr; - bool is_vector_input_ = false; - int thread_count_ = 1; - int thread_stride_ = 0; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc index fe735320ea..0239e53fc9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc @@ -17,47 +17,53 @@ #include "src/runtime/kernel/arm/fp32/matmul_fp32.h" #include "include/errorcode.h" #include "nnacl/fp32/matmul_fp32.h" -#include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_INPUT_TENSOR_ERROR; -using mindspore::lite::RET_MEMORY_FAILED; -using mindspore::lite::RET_OK; - using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_MatMul; namespace mindspore::kernel { -MatmulCPUKernel::~MatmulCPUKernel() { - if 
(a_pack_ptr_ != nullptr) { - free(a_pack_ptr_); - a_pack_ptr_ = nullptr; - } - if (b_pack_ptr_ != nullptr) { - free(b_pack_ptr_); - b_pack_ptr_ = nullptr; +int MatmulCPUKernel::Init() { + MatmulFp32BaseCPUKernel::InitParameter(); + + if (params_->a_const_ == true) { + auto a_shape = in_tensors_.at(0)->shape(); + int batch = 1; + for (size_t i = 0; i < a_shape.size() - 2; ++i) { + batch *= a_shape[i]; + } + params_->batch = batch; + params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; + params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; } - if (bias_ptr_ != nullptr) { - free(bias_ptr_); - bias_ptr_ = nullptr; + + if (params_->b_const_ == true) { + auto b_shape = in_tensors_.at(1)->shape(); + int batch = 1; + for (size_t i = 0; i < b_shape.size() - 2; ++i) { + batch *= b_shape[i]; + } + params_->batch = batch; + params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; + params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; } -} -void MatmulCPUKernel::FreeTmpBuffer() { - if (a_pack_ptr_ != nullptr) { - params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); - a_pack_ptr_ = nullptr; + auto ret = MatmulFp32BaseCPUKernel::Init(); + if (ret != RET_OK) { + return ret; } - if (b_pack_ptr_ != nullptr) { - params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); - b_pack_ptr_ = nullptr; + + if (!InferShapeDone()) { + return RET_OK; } + return ReSize(); } -int MatmulCPUKernel::MallocMatrixABuffer() { +int MatmulCPUKernel::ReSize() { auto a_shape = in_tensors_.at(0)->shape(); + auto b_shape = in_tensors_.at(1)->shape(); int batch = 1; MS_ASSERT(a_shape.size() >= 2); for (size_t i = 0; i < a_shape.size() - 2; ++i) { @@ -65,307 +71,34 @@ int MatmulCPUKernel::MallocMatrixABuffer() { } params_->batch = batch; params_->row_ = params_->a_transpose_ ? 
a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; -#ifdef ENABLE_ARM - if (params_->a_init_shape_ && params_->row_ == 1) { - is_vector_a_ = true; - } else { - is_vector_a_ = false; - } -#endif - params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; -#ifdef ENABLE_AVX - params_->row_align_ = UP_ROUND(params_->row_, C6NUM); -#elif defined(ENABLE_SSE) - params_->row_align_ = UP_ROUND(params_->row_, C4NUM); -#else - params_->row_align_ = UP_ROUND(params_->row_, C12NUM); -#endif - - int row_tmp = is_vector_a_ ? 1 : params_->row_align_; - if (params_->a_const_) { - a_pack_ptr_ = reinterpret_cast(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); - } else { - a_pack_ptr_ = - reinterpret_cast(context_->allocator->Malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); - } - if (a_pack_ptr_ == nullptr) { - FreeTmpBuffer(); - return RET_MEMORY_FAILED; - } - - return RET_OK; -} - -int MatmulCPUKernel::MallocMatrixBBuffer() { - auto b_shape = in_tensors_.at(1)->shape(); - if (b_shape.empty()) { - return RET_OK; - } - int batch = 1; - MS_ASSERT(b_shape.size() >= 2); - for (size_t i = 0; i < b_shape.size() - 2; ++i) { - batch *= b_shape[i]; - } - params_->batch = batch; params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; - params_->col_align_ = UP_ROUND(params_->col_, col_tile_); - params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; - - int col_tmp = is_vector_a_ ? 
params_->col_ : params_->col_align_; - if (params_->b_const_) { - b_pack_ptr_ = reinterpret_cast(malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); - } else { - b_pack_ptr_ = - reinterpret_cast(context_->allocator->Malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); - } - if (b_pack_ptr_ == nullptr) { - FreeTmpBuffer(); - return RET_MEMORY_FAILED; - } - - thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_)); - thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_); - return RET_OK; -} + params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; -int MatmulCPUKernel::InitBias() { - auto b_shape = in_tensors_.at(1)->shape(); - auto c_shape = out_tensors_.at(0)->shape(); - params_->col_ = params_->b_const_ - ? (params_->b_transpose_ ? b_shape.at(b_shape.size() - 2) : b_shape.at(b_shape.size() - 1)) - : (c_shape.at(c_shape.size() - 1)); - params_->col_align_ = UP_ROUND(params_->col_, col_tile_); - auto col_tmp = is_vector_a_ ? 
params_->col_ : params_->col_align_; - if (bias_ptr_ == nullptr) { - bias_ptr_ = reinterpret_cast(malloc(col_tmp * sizeof(float))); - if (bias_ptr_ == nullptr) { - FreeTmpBuffer(); - return RET_MEMORY_FAILED; - } - } - memset(bias_ptr_, 0, col_tmp * sizeof(float)); - if (in_tensors_.size() == 3) { - memcpy(bias_ptr_, in_tensors_[2]->data_c(), in_tensors_[2]->ElementsNum() * sizeof(float)); - } - return RET_OK; + return MatmulFp32BaseCPUKernel::ReSize(); } -int MatmulCPUKernel::ReSize() { - if (!params_->b_const_) { - free(bias_ptr_); - bias_ptr_ = nullptr; - auto ret = InitBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 init bias failed"; +int MatmulCPUKernel::Run() { + if (IsTrain()) { + if (RET_OK != InitBufferA()) { return RET_ERROR; } - } - return RET_OK; -} - -void MatmulCPUKernel::InitMatrixA(const float *src_ptr, float *dst_ptr) { - if (is_vector_a_) { - memcpy(dst_ptr, src_ptr, params_->batch * params_->deep_ * sizeof(float)); - return; - } + InitMatrixA(reinterpret_cast(in_tensors_.at(0)->data_c())); - for (int i = 0; i < params_->batch; i++) { - const float *src = src_ptr + i * params_->deep_ * params_->row_; - float *dst = dst_ptr + i * params_->deep_ * params_->row_align_; -#ifdef ENABLE_AVX - if (params_->a_transpose_) { - RowMajor2Row6Major(src, dst, params_->deep_, params_->row_); - } else { - RowMajor2Col6Major(src, dst, params_->row_, params_->deep_); - } -#elif defined(ENABLE_SSE) - if (params_->a_transpose_) { - RowMajor2Row4Major(src, dst, params_->deep_, params_->row_); - } else { - RowMajor2Col4Major(src, dst, params_->row_, params_->deep_); - } -#else - if (params_->a_transpose_) { - RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); - } else { - RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); - } -#endif - } - return; -} - -void MatmulCPUKernel::InitMatrixB(const float *src_ptr, float *dst_ptr) { - if (is_vector_a_) { - if (params_->b_transpose_) { - memcpy(dst_ptr, src_ptr, params_->batch * 
params_->col_ * params_->deep_ * sizeof(float)); - } else { - for (int i = 0; i < params_->batch; i++) { - const float *src = src_ptr + i * params_->deep_ * params_->col_; - float *dst = dst_ptr + i * params_->deep_ * params_->col_; - RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); - } - } - return; - } - - for (int i = 0; i < params_->batch; i++) { - const float *src = src_ptr + i * params_->deep_ * params_->col_; - float *dst = dst_ptr + i * params_->deep_ * params_->col_align_; -#ifdef ENABLE_AVX - if (params_->b_transpose_) { - RowMajor2Col16Major(src, dst, params_->col_, params_->deep_); - } else { - RowMajor2Row16Major(src, dst, params_->deep_, params_->col_); - } -#elif defined(ENABLE_ARM32) - if (params_->b_transpose_) { - RowMajor2Col4Major(src, dst, params_->col_, params_->deep_); - } else { - RowMajor2Row4Major(src, dst, params_->deep_, params_->col_); - } -#else - if (params_->b_transpose_) { - RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); - } else { - RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); - } -#endif - } - return; -} - -int MatmulCPUKernel::Init() { -#ifdef ENABLE_AVX - col_tile_ = C16NUM; -#elif defined(ENABLE_ARM32) - col_tile_ = C4NUM; -#else - col_tile_ = C8NUM; -#endif - params_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr); - params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); - if (params_->a_const_) { - auto ret = MallocMatrixABuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix A buffer failed"; - return RET_ERROR; - } - InitMatrixA(reinterpret_cast(in_tensors_.at(0)->data_c()), a_pack_ptr_); - a_ptr_ = a_pack_ptr_; - } - if (params_->b_const_) { - auto ret = MallocMatrixBBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix B buffer failed"; - return RET_ERROR; - } - InitMatrixB(reinterpret_cast(in_tensors_.at(1)->data_c()), b_pack_ptr_); - b_ptr_ = b_pack_ptr_; - // init bias - ret = InitBias(); - if (ret != RET_OK) { - 
MS_LOG(ERROR) << "Matmul fp32 init bias failed"; + if (RET_OK != InitBufferB()) { return RET_ERROR; } - } - return RET_OK; -} + InitMatrixB(reinterpret_cast(in_tensors_.at(1)->data_c())); -int MatmulCPUKernel::RunImpl(int task_id) { - int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_); - if (cur_oc <= 0) { - return RET_OK; + FreeBiasBuf(); + InitBiasData(); } - auto b = cur_b_ptr_ + task_id * thread_stride_ * col_tile_ * params_->deep_; - auto c = cur_c_ptr_ + task_id * thread_stride_ * col_tile_; - auto bias = bias_ptr_ ? bias_ptr_ + task_id * thread_stride_ * col_tile_ : NULL; - MS_ASSERT(cur_a_ptr_); - MS_ASSERT(b); - MS_ASSERT(c); - if (is_vector_a_) { - MatVecMul(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, cur_oc); - } else { - MatMulOpt(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); - } - return RET_OK; -} - -int MatmulFloatRun(void *cdata, int task_id) { - auto op = reinterpret_cast(cdata); - auto error_code = op->RunImpl(task_id); - if (error_code != RET_OK) { - MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; - return RET_ERROR; - } - return RET_OK; -} -int MatmulCPUKernel::Run() { - auto a_src = reinterpret_cast(in_tensors_.at(0)->data_c()); - auto b_src = reinterpret_cast(in_tensors_.at(1)->data_c()); - auto c_src = reinterpret_cast(out_tensors_.at(0)->data_c()); + MatmulFp32BaseCPUKernel::Run(); - if (!params_->a_const_ || IsTrain()) { - if (a_pack_ptr_ != nullptr) { - params_->a_const_ ? 
free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); - a_pack_ptr_ = nullptr; - } - auto ret = MallocMatrixABuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix a buffer failed"; - return RET_ERROR; - } - if (is_vector_a_) { - a_ptr_ = a_src; - } else { - InitMatrixA(a_src, a_pack_ptr_); - a_ptr_ = a_pack_ptr_; - } - } - if (!params_->b_const_ || IsTrain()) { - if (b_pack_ptr_ != nullptr) { - params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); - b_pack_ptr_ = nullptr; - } - auto ret = MallocMatrixBBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix b buffer failed"; - return RET_ERROR; - } - if (is_vector_a_ && params_->b_transpose_) { - b_ptr_ = b_src; - } else { - InitMatrixB(b_src, b_pack_ptr_); - b_ptr_ = b_pack_ptr_; - } - } if (IsTrain()) { - InitBias(); - } - for (int i = 0; i < params_->batch; ++i) { - if (is_vector_a_) { - cur_a_ptr_ = a_ptr_ + i * params_->deep_; - cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_; - cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; - } else { - cur_a_ptr_ = a_ptr_ + i * params_->row_align_ * params_->deep_; - cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_align_; - cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; - } - auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulFloatRun, this, thread_count_); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 run function MatmulFloatRun failed"; - FreeTmpBuffer(); - return RET_ERROR; - } - } - if (!params_->a_const_ || IsTrain()) { - params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); + context_->allocator->Free(a_pack_ptr_); + context_->allocator->Free(b_pack_ptr_); a_pack_ptr_ = nullptr; - } - if (!params_->b_const_ || IsTrain()) { - params_->b_const_ ? 
free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); b_pack_ptr_ = nullptr; } return RET_OK; @@ -376,61 +109,24 @@ int MatmulCPUKernel::Eval() { auto a_src = reinterpret_cast(in_tensors_.at(0)->data_c()); auto b_src = reinterpret_cast(in_tensors_.at(1)->data_c()); LiteKernel::Eval(); + if (params_->a_const_) { - if (a_pack_ptr_ == nullptr) { - auto ret = MallocMatrixABuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix a buffer failed"; - return RET_ERROR; - } - } - if (is_vector_a_) { - a_ptr_ = a_src; - } else { - InitMatrixA(a_src, a_pack_ptr_); - a_ptr_ = a_pack_ptr_; + if (RET_OK != InitBufferA()) { + return RET_ERROR; } + InitMatrixA(a_src); } if (params_->b_const_) { - if (b_pack_ptr_ == nullptr) { - auto ret = MallocMatrixBBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Matmul fp32 malloc matrix b buffer failed"; - return RET_ERROR; - } - } - if (is_vector_a_ && params_->b_transpose_) { - b_ptr_ = b_src; - } else { - InitMatrixB(b_src, b_pack_ptr_); - b_ptr_ = b_pack_ptr_; + if (RET_OK != InitBufferB()) { + return RET_ERROR; } + InitMatrixB(b_src); } - InitBias(); - return RET_OK; -} -kernel::LiteKernel *CpuMatmulFp32KernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - MS_ASSERT(opParameter != nullptr); - MS_ASSERT(desc.type == schema::PrimitiveType_MatMul); - auto kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel is nullptr."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " - << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); - delete kernel; - return nullptr; - } - return kernel; + FreeBiasBuf(); + InitBiasData(); + return 
RET_OK; } -REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulFp32KernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, LiteKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h index efcf5cd8fe..6a9bb0305a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h @@ -19,47 +19,20 @@ #include #include "nnacl/matmul_parameter.h" -#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h" namespace mindspore::kernel { -class MatmulCPUKernel : public LiteKernel { +class MatmulCPUKernel : public MatmulFp32BaseCPUKernel { public: explicit MatmulCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive) { - params_ = reinterpret_cast(op_parameter_); - } - ~MatmulCPUKernel() override; + : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~MatmulCPUKernel() = default; int Init() override; int ReSize() override; int Run() override; - int RunImpl(int task_id); int Eval() override; - - private: - int MallocMatrixABuffer(); - int MallocMatrixBBuffer(); - int InitBias(); - void InitMatrixA(const float *src_ptr, float *dst_ptr); - void InitMatrixB(const float *src_ptr, float *dst_ptr); - void FreeTmpBuffer(); - - private: - MatMulParameter *params_ = nullptr; - float *a_pack_ptr_ = nullptr; - float *b_pack_ptr_ = nullptr; - float *bias_ptr_ = nullptr; - float *a_ptr_ = nullptr; - float *b_ptr_ = nullptr; - float *cur_a_ptr_ = nullptr; - float *cur_b_ptr_ = nullptr; - float *cur_c_ptr_ = nullptr; - bool is_vector_a_ = false; - int col_tile_ = 0; - int thread_stride_ = 0; - int thread_count_ = 0; }; } // namespace mindspore::kernel - 
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc new file mode 100644 index 0000000000..77bf5b28f1 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc @@ -0,0 +1,299 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h" +#include "nnacl/fp32/matmul_fp32.h" + +namespace mindspore::kernel { +int MatmulBaseFloatRun(void *cdata, int task_id) { + auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata); + auto error_code = op->FloatRun(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() { + FreeResizeBufA(); + FreeResizeBufB(); + FreeBiasBuf(); + return; +} + +void MatmulFp32BaseCPUKernel::InitParameter() { + params_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr); + params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); + +#ifdef ENABLE_AVX + row_tile_ = C6NUM; + col_tile_ = C16NUM; +#elif defined(ENABLE_ARM32) + row_tile_ = C12NUM; + col_tile_ = C4NUM; +#elif defined(ENABLE_SSE) + row_tile_ = C4NUM; + col_tile_ = C8NUM; +#else + row_tile_ = C12NUM; + col_tile_ = C8NUM; +#endif + return; +} 
+ +void MatmulFp32BaseCPUKernel::ResizeParameter() { + if (params_->row_ == 1 && params_->b_const_ == false) { + vec_matmul_ = true; + } + params_->row_align_ = vec_matmul_ ? 1 : UP_ROUND(params_->row_, row_tile_); + params_->col_align_ = vec_matmul_ ? params_->col_ : UP_ROUND(params_->col_, col_tile_); + return; +} + +int MatmulFp32BaseCPUKernel::InitBufferA() { + if (a_pack_ptr_ != nullptr) { + return RET_OK; + } + a_pack_ptr_ = + reinterpret_cast<float *>(malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float))); + if (a_pack_ptr_ == nullptr) { + MS_LOG(ERROR) << "malloc a_pack_ptr_ failed"; + return RET_ERROR; + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::InitBufferB() { + if (b_pack_ptr_ != nullptr) { + return RET_OK; + } + b_pack_ptr_ = + reinterpret_cast<float *>(malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float))); + if (b_pack_ptr_ == nullptr) { + MS_LOG(ERROR) << "malloc b_pack_ptr_ failed"; + return RET_ERROR; + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::InitBiasData() { + if (in_tensors_.size() == 3) { + auto bias_tensor = in_tensors_[2]; + int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C16NUM); + bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float))); + if (bias_ptr_ == nullptr) { + MS_LOG(ERROR) << "malloc bias_ptr_ failed"; + return RET_ERROR; + } + memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float)); + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) { + if (vec_matmul_) { + memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float)); + return RET_OK; + } + + for (int i = 0; i < params_->batch; i++) { + const float *src = src_ptr + i * params_->deep_ * params_->row_; + float *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; +#ifdef ENABLE_AVX + if (params_->a_transpose_) { + RowMajor2Row6Major(src, dst, params_->deep_, params_->row_); + } else { + RowMajor2Col6Major(src, dst, 
params_->row_, params_->deep_); + } +#elif defined(ENABLE_SSE) + if (params_->a_transpose_) { + RowMajor2Row4Major(src, dst, params_->deep_, params_->row_); + } else { + RowMajor2Col4Major(src, dst, params_->row_, params_->deep_); + } +#else + if (params_->a_transpose_) { + RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); + } else { + RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); + } +#endif + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) { + if (vec_matmul_) { + if (params_->b_transpose_) { + memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float)); + } else { + for (int i = 0; i < params_->batch; i++) { + const float *src = src_ptr + i * params_->deep_ * params_->col_; + float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_; + RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); + } + } + return RET_OK; + } + + for (int i = 0; i < params_->batch; i++) { + const float *src = src_ptr + i * params_->deep_ * params_->col_; + float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; +#ifdef ENABLE_AVX + if (params_->b_transpose_) { + RowMajor2Col16Major(src, dst, params_->col_, params_->deep_); + } else { + RowMajor2Row16Major(src, dst, params_->deep_, params_->col_); + } +#elif defined(ENABLE_ARM32) + if (params_->b_transpose_) { + RowMajor2Col4Major(src, dst, params_->col_, params_->deep_); + } else { + RowMajor2Row4Major(src, dst, params_->deep_, params_->col_); + } +#else + if (params_->b_transpose_) { + RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); + } else { + RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); + } +#endif + } + return RET_OK; +} + +void MatmulFp32BaseCPUKernel::FreeBiasBuf() { + if (bias_ptr_ != nullptr) { + free(bias_ptr_); + bias_ptr_ = nullptr; + } + return; +} + +void MatmulFp32BaseCPUKernel::FreeResizeBufA() { + if (a_pack_ptr_ != nullptr) { + context_->allocator->Free(a_pack_ptr_); + 
a_pack_ptr_ = nullptr; + } + return; +} + +void MatmulFp32BaseCPUKernel::FreeResizeBufB() { + if (b_pack_ptr_ != nullptr) { + context_->allocator->Free(b_pack_ptr_); + b_pack_ptr_ = nullptr; + } + return; +} + +int MatmulFp32BaseCPUKernel::FloatRun(int task_id) { + int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_); + if (cur_oc <= 0) { + return RET_OK; + } + + auto b = batch_b_ptr_ + task_id * thread_stride_ * col_tile_ * params_->deep_; + auto c = batch_c_ptr_ + task_id * thread_stride_ * col_tile_; + auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * col_tile_; + if (vec_matmul_) { + MatVecMulFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); + } else { + MatMulOpt(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_, + OutType_Nhwc); + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::Init() { + ResizeParameter(); + + auto ret = InitBiasData(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "InitBiasData failed"; + return ret; + } + + if (params_->a_const_ == true) { + if (RET_OK != InitBufferA()) { + return RET_ERROR; + } + InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c())); + } + + if (params_->b_const_ == true) { + if (RET_OK != InitBufferB()) { + return RET_ERROR; + } + InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c())); + } + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::ReSize() { + ResizeParameter(); + + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_)); + thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_); + return RET_OK; +} + +int MatmulFp32BaseCPUKernel::Run() { + auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); + auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); + c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); + + if (params_->a_const_ == false) { + if (RET_OK != InitBufferA()) { + 
return RET_ERROR; + } + InitMatrixA(a_ptr); + } + if (params_->b_const_ == false) { + if (RET_OK != InitBufferB()) { + FreeResizeBufA(); + return RET_ERROR; + } + InitMatrixB(b_ptr); + } + + for (int i = 0; i < params_->batch; ++i) { + if (vec_matmul_) { + batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_; + batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_; + batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; + } else { + batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_; + batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; + batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; + } + auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatmulBaseFloatRun failed"; + return ret; + } + } + + if (params_->a_const_ == false) { + FreeResizeBufA(); + } + + if (params_->b_const_ == false) { + FreeResizeBufB(); + } + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h new file mode 100644 index 0000000000..188863a5e0 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h @@ -0,0 +1,77 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ + +#include <vector> +#include "src/lite_kernel.h" +#include "nnacl/matmul_parameter.h" +#include "include/errorcode.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_MEMORY_FAILED; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +class MatmulFp32BaseCPUKernel : public LiteKernel { + public: + MatmulFp32BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, + const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx, + const mindspore::lite::PrimitiveC *primitive) + : LiteKernel(parameter, inputs, outputs, ctx, primitive) { + params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); + vec_matmul_ = false; + } + ~MatmulFp32BaseCPUKernel(); + int Init() override; + int ReSize() override; + int Run() override; + + public: + int FloatRun(int task_id); + + protected: + int InitBufferA(); + int InitBufferB(); + int InitMatrixA(const float *src_ptr); + int InitMatrixB(const float *src_ptr); + void FreeBiasBuf(); + int InitBiasData(); + void InitParameter(); + + private: + void ResizeParameter(); + void FreeResizeBufA(); + void FreeResizeBufB(); + + protected: + MatMulParameter *params_ = nullptr; + float *a_pack_ptr_ = nullptr; + float *b_pack_ptr_ = nullptr; + float *c_ptr_ = nullptr; + float *bias_ptr_ = nullptr; + float *batch_a_ptr_ = nullptr; + float *batch_b_ptr_ = nullptr; + float *batch_c_ptr_ = nullptr; + int col_tile_ = 0; + int row_tile_ = 0; + int thread_stride_ = 0; + int thread_count_ = 0; + bool vec_matmul_ = false; +}; +} // namespace mindspore::kernel +#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc index 0aefe0b25f..3e7d28b46f 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc @@ -318,12 +318,7 @@ int Convolution1x1Int8CPUKernel::InitParam() { matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; matmul_param_->deep_ = conv_param_->input_channel_; matmul_param_->col_ = conv_param_->output_channel_; - matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM); - matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM); - matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); - matmul_param_->col_16_ = UP_ROUND(matmul_param_->col_, C16NUM); matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM); - matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM); matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM); matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc index ba8cbcf420..1eb5441363 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc @@ -156,12 +156,8 @@ void FullconnectionInt8CPUKernel::InitParam() { fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1); fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM); - fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM); - fc_param_->col_2_ = UP_ROUND(fc_param_->col_, C2NUM); fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM); fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); - fc_param_->col_16_ = UP_ROUND(fc_param_->col_, C16NUM); - fc_param_->deep_4_ = UP_ROUND(fc_param_->deep_, C4NUM); fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM); thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));