!7457 [MSLITE] Support Fp32 Matrix-Vector Multiplication for FC/MATMUL Ops
Merge pull request !7457 from zhanyuan/tmppull/7457/MERGE
commit
7747f4c471
@ -0,0 +1,203 @@
|
||||
#ifdef __aarch64__
    .text
    .align 5
    .global MatVecMulFp32Neon64
#ifndef __APPLE__
    .type MatVecMulFp32Neon64, %function
#endif

//---------------------------------------------------------------------------
// void MatVecMulFp32Neon64(const float *a, const float *b, float *c,
//                          const float *bias, int act_type, int depth, int col)
//
// c[j] = act( dot(a[0..depth), b_colj[0..depth)) + bias[j] ) for j in [0, col).
// b stores each output column as a contiguous run of `depth` floats
// (column-blocked / depth-major), so column j starts at b + j*depth.
//
// In:    x0 = a     (lhs vector, depth floats)
//        x1 = b     (rhs, col blocks of depth floats each)
//        x2 = c     (output, col floats)
//        x3 = bias  (col floats; NULL means no bias)
//        w4 = act_type (1: ReLU, 3: ReLU6, anything else: none)
//        w5 = depth (assumed > 0 — TODO confirm caller guarantees this)
//        w6 = col   (assumed > 0)
// Clobbers: x7-x15, v0-v17, flags. v8-v15 (callee-saved per AAPCS64) are
//        saved in the prologue and restored before returning.
//---------------------------------------------------------------------------
MatVecMulFp32Neon64:
    // Save callee-saved v8-v15. sp stays 128 below its entry value for the
    // whole function so the save area is protected: AArch64 Linux has no
    // red zone and data below sp may be clobbered by signal handlers.
    sub sp, sp, #128
    mov x14, sp
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x14], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x14], #64

    mov w14, #4                       // sizeof(float)
    mul w8, w14, w5                   // x8  = depth * 4: byte stride of one rhs column
    lsl w13, w8, #2                   // x13 = byte stride of a 4-column block

Loop:
    mov x15, x0                       // reload a ptr
    mov x7, x1                        // reload b ptr (current column block)
    mov w9, w5                        // reload depth counter
    cmp w6, #4
    blt Loop1x1                       // fewer than 4 columns left -> scalar path

// ---- 1x4 path: produce four output columns per pass -----------------------
Loop1x4:
    movi v10.4s, #0                   // dot accumulator, column 0
    movi v11.4s, #0                   // column 1
    movi v12.4s, #0                   // column 2
    movi v13.4s, #0                   // column 3
    movi v14.4s, #0                   // depth-tail accumulator, one lane per column

    add x10, x7, x8                   // b column 1
    add x11, x10, x8                  // b column 2
    add x12, x11, x8                  // b column 3

Depth8_1x4:
    cmp w9, #8
    blt Depth4_1x4

    ld1 {v0.4s, v1.4s}, [x15], #32    // a[k .. k+7]
    ld1 {v2.4s, v3.4s}, [x7], #32     // b col0
    ld1 {v4.4s, v5.4s}, [x10], #32    // b col1
    fmla v10.4s, v0.4s, v2.4s
    fmla v10.4s, v1.4s, v3.4s
    fmla v11.4s, v0.4s, v4.4s
    fmla v11.4s, v1.4s, v5.4s
    ld1 {v6.4s, v7.4s}, [x11], #32    // b col2
    ld1 {v8.4s, v9.4s}, [x12], #32    // b col3
    fmla v12.4s, v0.4s, v6.4s
    fmla v12.4s, v1.4s, v7.4s
    fmla v13.4s, v0.4s, v8.4s
    fmla v13.4s, v1.4s, v9.4s
    sub w9, w9, #8
    cbz w9, End1x4
    b Depth8_1x4

Depth4_1x4:
    cmp w9, #4
    blt Depth1_1x4

    ld1 {v0.4s}, [x15], #16
    ld1 {v1.4s}, [x7], #16
    ld1 {v2.4s}, [x10], #16
    ld1 {v3.4s}, [x11], #16
    ld1 {v4.4s}, [x12], #16
    fmla v10.4s, v1.4s, v0.4s
    fmla v11.4s, v2.4s, v0.4s
    fmla v12.4s, v3.4s, v0.4s
    fmla v13.4s, v4.4s, v0.4s
    sub w9, w9, #4
    cbz w9, End1x4
    b Depth8_1x4

Depth1_1x4:
    ld1 {v0.s}[0], [x15], #4          // a[k]
    ld1 {v1.s}[0], [x7], #4           // gather b[k] of the four columns
    ld1 {v1.s}[1], [x10], #4
    ld1 {v1.s}[2], [x11], #4
    ld1 {v1.s}[3], [x12], #4

    fmla v14.4s, v1.4s, v0.s[0]       // lane i of v14 += a[k] * b_coli[k]
    sub w9, w9, #1
    cbz w9, End1x4
    b Depth1_1x4

End1x4:
    // Horizontal reduce: v17 = {sum(v10), sum(v11), sum(v12), sum(v13)}
    faddp v15.4s, v10.4s, v11.4s
    faddp v16.4s, v12.4s, v13.4s
    faddp v17.4s, v15.4s, v16.4s
    fadd v14.4s, v14.4s, v17.4s       // add depth-tail lanes

    cbz x3, Act1x4
    ld1 {v15.4s}, [x3], #16
    fadd v14.4s, v14.4s, v15.4s       // add bias

Act1x4:
    cmp w4, #3
    beq Relu6_1x4
    cmp w4, #1
    beq Relu1x4
    b Write1x4

Relu6_1x4:
    // BUGFIX: was `movi v15.4s, #0x46, lsl #8` = 0x00004600 per lane, a
    // float32 denormal (~6e-41), which clamped every Relu6 output to ~0.
    fmov v15.4s, #6.0                 // upper clamp
    fmin v14.4s, v14.4s, v15.4s       // falls through to the lower clamp

Relu1x4:
    movi v15.4s, #0
    fmax v14.4s, v14.4s, v15.4s

Write1x4:
    st1 {v14.4s}, [x2], #16
    sub w6, w6, #4
    cbz w6, End
    add x1, x1, x13                   // advance b by 4 columns
    b Loop

// ---- 1x1 path: produce one output column per pass --------------------------
Loop1x1:
    movi v4.4s, #0                    // vector dot accumulator
    movi v5.4s, #0                    // depth-tail accumulator (only lane 0 used)

Depth8_1x1:
    cmp w9, #8
    blt Depth4_1x1

    ld1 {v0.4s, v1.4s}, [x15], #32
    ld1 {v2.4s, v3.4s}, [x7], #32
    fmla v4.4s, v2.4s, v0.4s
    fmla v4.4s, v3.4s, v1.4s
    sub w9, w9, #8
    cbz w9, End1x1
    b Depth8_1x1

Depth4_1x1:
    cmp w9, #4
    blt Depth1_1x1

    ld1 {v0.4s}, [x15], #16
    ld1 {v1.4s}, [x7], #16
    fmla v4.4s, v1.4s, v0.4s
    sub w9, w9, #4
    cbz w9, End1x1
    b Depth8_1x1

Depth1_1x1:
    ld1 {v0.s}[0], [x15], #4
    ld1 {v1.s}[0], [x7], #4
    fmla v5.4s, v1.4s, v0.s[0]        // lanes 1-3 hold garbage; only lane 0 is stored
    sub w9, w9, #1
    cbz w9, End1x1
    b Depth1_1x1

End1x1:
    faddp v6.4s, v4.4s, v4.4s         // reduce v4 into lane 0
    faddp v7.4s, v6.4s, v6.4s
    fadd v7.4s, v7.4s, v5.4s          // + depth-tail (lane 0)

    cbz x3, Act1x1
    ld1 {v8.s}[0], [x3], #4
    fadd v7.4s, v7.4s, v8.4s          // add bias (lane 0)

Act1x1:
    cmp w4, #3
    beq Relu6_1x1
    cmp w4, #1
    beq Relu1x1
    b Write1x1

Relu6_1x1:
    // BUGFIX: same denormal-constant bug as Relu6_1x4 (see above).
    fmov v8.4s, #6.0
    fmin v7.4s, v7.4s, v8.4s          // falls through to the lower clamp

Relu1x1:
    movi v8.4s, #0
    fmax v7.4s, v7.4s, v8.4s

Write1x1:
    st1 {v7.s}[0], [x2], #4
    sub w6, w6, #1
    cbz w6, End
    add x1, x1, x8                    // advance b by 1 column
    b Loop

End:
    // Restore v8-v15, then release the frame.
    mov x14, sp
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x14], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x14], #64
    add sp, sp, #128
    ret
#endif
|
Loading…
Reference in new issue