!10998 [MSLITE] fp32 matmul base

From: @ling_qiao_min
Reviewed-by: 
Signed-off-by:
pull/10998/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit f2b4db1cc8

File diff suppressed because it is too large Load Diff

@ -23,12 +23,23 @@
#include "nnacl/matmul_parameter.h"
#include "nnacl/op_base.h"
/* Adds the per-channel bias to an accumulator when a bias buffer is present.
 * NOTE: each macro below is braced so it expands to exactly ONE statement;
 * the original DO_RELU6 expanded to two independent `if` statements, which
 * silently drops the second clamp in any unbraced loop/if body. The braced-if
 * form (rather than do/while(0)) keeps call sites that omit a trailing
 * semicolon compiling, since the original expansions already ended in `;`. */
#define ADD_BIAS(value, bias, c)     \
  if ((bias) != NULL) {              \
    (value) = (value) + (bias)[c];   \
  }
/* Applies ReLU (max(0, value)) when act_type requests it. */
#define DO_RELU(value, act_type)               \
  if ((act_type) == ActType_Relu) {            \
    (value) = MSMAX(0.0f, (value));            \
  }
/* Applies ReLU6: clamps value into [0, 6] when act_type requests it. */
#define DO_RELU6(value, act_type)              \
  if ((act_type) == ActType_Relu6) {           \
    (value) = MSMIN(6.0f, (value));            \
    (value) = MSMAX(0.0f, (value));            \
  }
#ifdef __cplusplus
extern "C" {
#endif
void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
int col, size_t stride, int out_type);
void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col);
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col);
@ -40,9 +51,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
#ifdef ENABLE_ARM
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
#endif
#ifdef ENABLE_ARM64
void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
int col, size_t stride, size_t writeNhwc, size_t WriteWino);
@ -67,10 +76,6 @@ void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bi
#endif
#endif
#ifdef ENABLE_NNACL_INFER_SHAPE
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
int *in_datatype, int *out_datatype, OpParameter *param);
#endif
#ifdef __cplusplus
}
#endif

@ -44,14 +44,11 @@ typedef struct MatMulParameter {
int col_;
int row_4_;
int row_6_;
int row_8_;
int row_12_;
int row_16_;
int row_align_;
int col_2_;
int col_4_;
int col_8_;
int col_16_;
int col_align_;
int deep_;
int deep_4_;
@ -61,8 +58,6 @@ typedef struct MatMulParameter {
bool b_transpose_; /* true : col-major */
bool a_const_;
bool b_const_;
bool a_init_shape_;
bool b_init_shape_;
ActType act_type_;
} MatMulParameter;

@ -62,11 +62,8 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
matmul_param_->act_type_ = conv_param_->act_type_;
return;
}
@ -76,20 +73,6 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
if (in_tensors_.size() == 3) {
int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
int weight_size = output_channel * sizeof(float);
@ -146,6 +129,19 @@ int Convolution1x1CPUKernel::InitConv1x1Param() {
}
int Convolution1x1CPUKernel::Init() {
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
matmul_param_ = new (std::nothrow) MatMulParameter;
if (matmul_param_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
@ -270,20 +266,6 @@ void Convolution1x1CPUKernel::PackWeight() {
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
row_tile_ = C6NUM;
col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
row_tile_ = C4NUM;
col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
row_tile_ = C12NUM;
col_tile_ = C4NUM;
#else
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);

@ -21,43 +21,19 @@
#include "include/context.h"
#include "include/errorcode.h"
#include "nnacl/fp32/matmul_fp32.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class FullconnectionCPUKernel : public LiteKernel {
class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel {
public:
FullconnectionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~FullconnectionCPUKernel() override;
: MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~FullconnectionCPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
public:
int DoMatmul(int task_id);
void FreeBuf();
private:
void InitMatrixA(const float *src_ptr, float *dst_ptr);
void InitMatrixB(const float *src_ptr, float *dst_ptr);
private:
MatMulParameter *fc_param_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
float *c_ptr_ = nullptr;
float *bias_ptr_ = nullptr;
float *a_ptr_ = nullptr;
float *b_ptr_ = nullptr;
bool is_vector_input_ = false;
int thread_count_ = 1;
int thread_stride_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_

File diff suppressed because it is too large Load Diff

@ -19,47 +19,20 @@
#include <vector>
#include "nnacl/matmul_parameter.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
namespace mindspore::kernel {
class MatmulCPUKernel : public LiteKernel {
class MatmulCPUKernel : public MatmulFp32BaseCPUKernel {
public:
explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~MatmulCPUKernel() override;
: MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~MatmulCPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int Eval() override;
private:
int MallocMatrixABuffer();
int MallocMatrixBBuffer();
int InitBias();
void InitMatrixA(const float *src_ptr, float *dst_ptr);
void InitMatrixB(const float *src_ptr, float *dst_ptr);
void FreeTmpBuffer();
private:
MatMulParameter *params_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
float *bias_ptr_ = nullptr;
float *a_ptr_ = nullptr;
float *b_ptr_ = nullptr;
float *cur_a_ptr_ = nullptr;
float *cur_b_ptr_ = nullptr;
float *cur_c_ptr_ = nullptr;
bool is_vector_a_ = false;
int col_tile_ = 0;
int thread_stride_ = 0;
int thread_count_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_

@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "include/errorcode.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
// Shared fp32 matmul base kernel: owns the packed A/B buffers, bias buffer,
// and tiling/threading parameters, and exposes the common Init/ReSize/Run
// pipeline. FullConnection and MatMul fp32 kernels derive from this class
// (the `protected` members/hooks are the extension points they use).
class MatmulFp32BaseCPUKernel : public LiteKernel {
 public:
  MatmulFp32BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
                          const mindspore::lite::PrimitiveC *primitive)
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    // op_parameter_ is set by the LiteKernel base; reinterpret it as the
    // matmul-specific parameter struct. vec_matmul_ needs no assignment here:
    // its in-class initializer below already sets it to false (the original
    // constructor re-assigned it redundantly).
    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
  }
  ~MatmulFp32BaseCPUKernel();
  int Init() override;
  int ReSize() override;
  int Run() override;

 public:
  // Per-thread entry point: computes this task's column strip of the output.
  int FloatRun(int task_id);

 protected:
  int InitBufferA();
  int InitBufferB();
  int InitMatrixA(const float *src_ptr);
  int InitMatrixB(const float *src_ptr);
  void FreeBiasBuf();
  int InitBiasData();
  void InitParameter();

 private:
  void ResizeParameter();
  void FreeResizeBufA();
  void FreeResizeBufB();

 protected:
  MatMulParameter *params_ = nullptr;
  float *a_pack_ptr_ = nullptr;  // packed (tiled) copy of input A
  float *b_pack_ptr_ = nullptr;  // packed (tiled) copy of input B
  float *c_ptr_ = nullptr;       // output buffer
  float *bias_ptr_ = nullptr;
  float *batch_a_ptr_ = nullptr;  // per-batch cursors into the packed buffers
  float *batch_b_ptr_ = nullptr;
  float *batch_c_ptr_ = nullptr;
  int col_tile_ = 0;
  int row_tile_ = 0;
  int thread_stride_ = 0;
  int thread_count_ = 0;
  bool vec_matmul_ = false;  // true when A degenerates to a vector (row_ == 1)
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_

@ -318,12 +318,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM);
matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->col_16_ = UP_ROUND(matmul_param_->col_, C16NUM);
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM);
matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);

@ -156,12 +156,8 @@ void FullconnectionInt8CPUKernel::InitParam() {
fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM);
fc_param_->col_2_ = UP_ROUND(fc_param_->col_, C2NUM);
fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM);
fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
fc_param_->col_16_ = UP_ROUND(fc_param_->col_, C16NUM);
fc_param_->deep_4_ = UP_ROUND(fc_param_->deep_, C4NUM);
fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM);
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));

Loading…
Cancel
Save