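The hunk below widens the ARM64 inner step of RowMajor2Col8Major from 8x4 to 8x8 tiles. For orientation, this scalar sketch shows the Col8Major layout the function produces (my reading of the code; the row/column tails that the real function also handles are elided): rows are packed in tiles of C8NUM = 8, and element (r, c) of a tile is stored at dst[c * 8 + r], so each source column becomes 8 consecutive floats.

/* Hypothetical scalar reference, not part of the patch. */
void RowMajor2Col8MajorRef(const float *src, float *dst, size_t row, size_t col) {
  for (size_t r = 0; r < row; ++r) {
    size_t tile = r / 8; /* which 8-row tile */
    size_t r_in = r % 8; /* row inside the tile */
    for (size_t c = 0; c < col; ++c) {
      /* tiles are laid out one after another, col * 8 floats each */
      dst[(tile * col + c) * 8 + r_in] = src[r * col + c];
    }
  }
}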
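Inside the assembly, each zip1/zip2 pair followed by trn1/trn2 on .2d lanes is a 4x4 transpose: zip interleaves the 32-bit lanes of two rows, and trn then pairs up the 64-bit halves across registers. A sketch of one such block with AArch64 NEON intrinsics (the helper name is mine; the patch itself stays in inline assembly):

#include <arm_neon.h>

/* Illustrative 4x4 transpose using the same shuffle pattern as the
 * assembly below. r[i] holds source row i of the 4x4 block. */
static inline void Transpose4x4(float32x4_t r[4]) {
  float32x4_t t0 = vzip1q_f32(r[0], r[1]); /* a0 b0 a1 b1 */
  float32x4_t t1 = vzip2q_f32(r[0], r[1]); /* a2 b2 a3 b3 */
  float32x4_t t2 = vzip1q_f32(r[2], r[3]); /* c0 d0 c1 d1 */
  float32x4_t t3 = vzip2q_f32(r[2], r[3]); /* c2 d2 c3 d3 */
  /* trn1/trn2 on 64-bit lanes recombine the halves into columns */
  r[0] = vreinterpretq_f32_f64(
      vtrn1q_f64(vreinterpretq_f64_f32(t0), vreinterpretq_f64_f32(t2))); /* a0 b0 c0 d0 */
  r[1] = vreinterpretq_f32_f64(
      vtrn2q_f64(vreinterpretq_f64_f32(t0), vreinterpretq_f64_f32(t2))); /* a1 b1 c1 d1 */
  r[2] = vreinterpretq_f32_f64(
      vtrn1q_f64(vreinterpretq_f64_f32(t1), vreinterpretq_f64_f32(t3))); /* a2 b2 c2 d2 */
  r[3] = vreinterpretq_f32_f64(
      vtrn2q_f64(vreinterpretq_f64_f32(t1), vreinterpretq_f64_f32(t3))); /* a3 b3 c3 d3 */
}

The new 8x8 kernel performs four of these 4x4 transposes and interleaves the st1 stores (v0, v16, v1, v17, ...) so each output column receives its upper and lower four elements back to back; loading two q-registers per row (ld1 {v0.4s, v1.4s}) is what lets one iteration cover eight columns instead of four.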
@@ -220,68 +220,104 @@ void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col)
 
 void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) {
   size_t row8 = row / C8NUM * C8NUM;
-  size_t col4 = col / C4NUM * C4NUM;
+#ifdef ENABLE_ARM64
+  size_t col_skip = col / C8NUM * C8NUM;
+  int skip_size = C8NUM;
+#else
+  size_t col_skip = col / C4NUM * C4NUM;
+  int skip_size = C4NUM;
+#endif
   float *src_r = src_ptr;
   float *dst_r = dst_ptr;
 
   size_t ri = 0;
   for (; ri < row8; ri += C8NUM) {
     size_t ci = 0;
-    for (; ci < col4; ci += C4NUM) {
+    for (; ci < col_skip; ci += skip_size) {
       float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
 
-      /* 8x4 row-major to col-major */
 #ifdef ENABLE_ARM64
+      /* 8x8 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
        "mov x10, %[src_c]\n"
        "mov x11, %[dst_c]\n"
 
-       "ld1 {v0.4s}, [x10], %[stride]\n"
-       "ld1 {v1.4s}, [x10], %[stride]\n"
-       "ld1 {v2.4s}, [x10], %[stride]\n"
-       "ld1 {v3.4s}, [x10], %[stride]\n"
-
-       "zip1 v4.4s, v0.4s, v1.4s\n"
-       "zip2 v5.4s, v0.4s, v1.4s\n"
-       "zip1 v6.4s, v2.4s, v3.4s\n"
-       "zip2 v7.4s, v2.4s, v3.4s\n"
-
-       "ld1 {v8.4s}, [x10], %[stride]\n"
-       "ld1 {v9.4s}, [x10], %[stride]\n"
-       "ld1 {v10.4s}, [x10], %[stride]\n"
-       "ld1 {v11.4s}, [x10], %[stride]\n"
-
-       "trn1 v0.2d, v4.2d, v6.2d\n"
-       "trn2 v1.2d, v4.2d, v6.2d\n"
-       "trn1 v2.2d, v5.2d, v7.2d\n"
-       "trn2 v3.2d, v5.2d, v7.2d\n"
-
-       "zip1 v12.4s, v8.4s, v9.4s\n"
-       "zip2 v13.4s, v8.4s, v9.4s\n"
-       "zip1 v14.4s, v10.4s, v11.4s\n"
-       "zip2 v15.4s, v10.4s, v11.4s\n"
-
-       "trn1 v8.2d, v12.2d, v14.2d\n"
-       "trn2 v9.2d, v12.2d, v14.2d\n"
-       "trn1 v10.2d, v13.2d, v15.2d\n"
-       "trn2 v11.2d, v13.2d, v15.2d\n"
-
-       "st1 {v0.4s}, [x11], #16\n"
-       "st1 {v8.4s}, [x11], #16\n"
-       "st1 {v1.4s}, [x11], #16\n"
-       "st1 {v9.4s}, [x11], #16\n"
-       "st1 {v2.4s}, [x11],#16\n"
-       "st1 {v10.4s}, [x11], #16\n"
-       "st1 {v3.4s}, [x11],#16\n"
-       "st1 {v11.4s}, [x11], #16\n"
+       "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
+       "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
+       "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
+       "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
+
+       "zip1 v8.4s, v0.4s, v2.4s\n"
+       "zip2 v9.4s, v0.4s, v2.4s\n"
+       "zip1 v10.4s, v4.4s, v6.4s\n"
+       "zip2 v11.4s, v4.4s, v6.4s\n"
+
+       "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
+       "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
+       "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
+       "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
+
+       "zip1 v12.4s, v1.4s, v3.4s\n"
+       "zip2 v13.4s, v1.4s, v3.4s\n"
+       "zip1 v14.4s, v5.4s, v7.4s\n"
+       "zip2 v15.4s, v5.4s, v7.4s\n"
+
+       "trn1 v0.2d, v8.2d, v10.2d\n"
+       "trn2 v1.2d, v8.2d, v10.2d\n"
+       "trn1 v2.2d, v9.2d, v11.2d\n"
+       "trn2 v3.2d, v9.2d, v11.2d\n"
+
+       "zip1 v24.4s, v16.4s, v18.4s\n"
+       "zip2 v25.4s, v16.4s, v18.4s\n"
+       "zip1 v26.4s, v20.4s, v22.4s\n"
+       "zip2 v27.4s, v20.4s, v22.4s\n"
+
+       "trn1 v4.2d, v12.2d, v14.2d\n"
+       "trn2 v5.2d, v12.2d, v14.2d\n"
+       "trn1 v6.2d, v13.2d, v15.2d\n"
+       "trn2 v7.2d, v13.2d, v15.2d\n"
+
+       "zip1 v28.4s, v17.4s, v19.4s\n"
+       "zip2 v29.4s, v17.4s, v19.4s\n"
+       "zip1 v30.4s, v21.4s, v23.4s\n"
+       "zip2 v31.4s, v21.4s, v23.4s\n"
+
+       "trn1 v16.2d, v24.2d, v26.2d\n"
+       "trn2 v17.2d, v24.2d, v26.2d\n"
+       "trn1 v18.2d, v25.2d, v27.2d\n"
+       "trn2 v19.2d, v25.2d, v27.2d\n"
+
+       "trn1 v20.2d, v28.2d, v30.2d\n"
+       "trn2 v21.2d, v28.2d, v30.2d\n"
+       "trn1 v22.2d, v29.2d, v31.2d\n"
+       "trn2 v23.2d, v29.2d, v31.2d\n"
+
+       "st1 {v0.4s}, [x11], #16\n"
+       "st1 {v16.4s}, [x11], #16\n"
+       "st1 {v1.4s}, [x11], #16\n"
+       "st1 {v17.4s}, [x11], #16\n"
+       "st1 {v2.4s}, [x11], #16\n"
+       "st1 {v18.4s}, [x11], #16\n"
+       "st1 {v3.4s}, [x11], #16\n"
+       "st1 {v19.4s}, [x11], #16\n"
+       "st1 {v4.4s}, [x11], #16\n"
+       "st1 {v20.4s}, [x11], #16\n"
+       "st1 {v5.4s}, [x11], #16\n"
+       "st1 {v21.4s}, [x11], #16\n"
+       "st1 {v6.4s}, [x11], #16\n"
+       "st1 {v22.4s}, [x11], #16\n"
+       "st1 {v7.4s}, [x11], #16\n"
+       "st1 {v23.4s}, [x11], #16\n"
 
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
-         "v15");
+         "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+         "v30", "v31");
 #elif ENABLE_ARM32
+      /* 8x4 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
        "mov r10, %[src_c]\n"