From f5c86d128346476a297539a077eaf012bfc0d3c0 Mon Sep 17 00:00:00 2001
From: lzk
Date: Thu, 17 Dec 2020 17:26:17 -0800
Subject: [PATCH] x86 sse optimize

---
 .../lite/nnacl/fp32/conv_depthwise_fp32.c   | 23 ++++++++--
 .../lite/nnacl/fp32/conv_depthwise_fp32.h   |  5 +++
 mindspore/lite/nnacl/pack.c                 | 41 +++++++++++++++++
 mindspore/lite/nnacl/pack.h                 |  4 ++
 .../arm/fp32/convolution_depthwise_fp32.cc  |  2 +-
 .../convolution_depthwise_indirect_fp32.cc  | 45 ++++++++++++++-----
 6 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
index 83467834bc..a5f7847ee3 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
@@ -587,10 +587,16 @@ bool CheckConvDwUseIndirectBuffer(const ConvParameter *conv_param) {
 
 void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, const ConvParameter *conv_param,
                            int step_h, int step_w) {
-  int ic_4 = UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM;
+#ifdef ENABLE_AVX
+  int div = C8NUM;
+#else
+  int div = C4NUM;
+#endif
+
+  int ic_div = UP_DIV(conv_param->input_channel_, div) * div;
   for (int b = 0; b < conv_param->output_batch_; b++) {
     float **indirect = indirect_buffer + b * conv_param->output_h_ * step_h;
-    float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_4;
+    float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_div;
     for (int oh = 0; oh < conv_param->output_h_; oh++) {
       for (int kh = 0; kh < conv_param->kernel_h_; kh++) {
         int ih = oh * conv_param->stride_h_ + kh * conv_param->dilation_h_ - conv_param->pad_u_;
@@ -600,7 +606,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
           int iw = ow * conv_param->stride_w_ + kw * conv_param->dilation_w_ - conv_param->pad_l_;
           int index = oh * step_h + ow * step_w * conv_param->kernel_h_ + kw * conv_param->kernel_h_ + kh;
           if (iw < conv_param->input_w_ && iw >= 0) {
-            indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_4;
+            indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_div;
           } else {
             indirect[index] = zero_ptr;
           }
@@ -619,7 +625,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
   }
 }
 
-#ifndef ENABLE_ARM64
+#if !defined(ENABLE_ARM64) && !defined(ENABLE_AVX)
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                            int output_width, int input_stride, bool relu, bool relu6, int kernel) {
   do {
@@ -674,6 +680,15 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
 }
 #endif
 
+#ifdef ENABLE_AVX
+void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
+                           int output_width, int input_stride, bool relu, bool relu6, int kernel) {
+  if (kernel == 9) {
+    ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, input_stride * sizeof(float *), relu, relu6);
+  }
+}
+#endif
+
 void ConvDwIndirection(float *output_data, float **indirect_buffer, const float *weight_data, const float *bias_data,
                        float *zero_ptr, const ConvParameter *conv_param, int task_id) {
   int step_w = conv_param->dilation_w_ == 1 ? conv_param->stride_w_ : conv_param->kernel_w_;
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
index 7aae3a51f2..f81077fce8 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
@@ -66,6 +66,11 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c
                            int output_width, size_t input_stride, size_t relu, size_t relu6);
 #endif
 
+#ifdef ENABLE_AVX
+void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, int channels,
+                      int output_width, size_t input_stride, size_t relu, size_t relu6);
+#endif
+
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                            int output_width, int input_stride, bool relu, bool relu6, int kernel);
 
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 245fbb11ca..13b5c9a13d 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -500,6 +500,30 @@ void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int c
   }
 }
 
+void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
+  int c8 = UP_DIV(channel, C8NUM);
+  int c8_channel = c8 * C8NUM;
+  int nhwc8_batch_unit_offset = c8 * C8NUM * plane;
+  int ic_remainder_ = channel % C8NUM;
+  if (ic_remainder_ != 0) {
+    int nhwc8_batch_offset = 0;
+    for (int b = 0; b < batch; b++) {
+      int batch_offset = b * channel * plane;
+      for (int i = 0; i < plane; i++) {
+        float *dst_per_plane = (float *)dst + nhwc8_batch_offset + i * c8_channel;
+        memcpy(dst_per_plane, (float *)src + batch_offset + i * channel, channel * sizeof(float));
+        for (int j = channel; j < c8_channel; ++j) {
+          dst_per_plane[j] = 0;
+        }
+      }
+      nhwc8_batch_offset += nhwc8_batch_unit_offset;
+    }
+  } else {
+    size_t ori_input_size = batch * plane * channel * sizeof(float);
+    memcpy((float *)dst, (float *)src, ori_input_size);
+  }
+}
+
 void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   int ic_remainder_ = channel % C4NUM;
@@ -600,6 +624,23 @@ void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, i
   }
 }
 
+void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel) {
+  int c8 = UP_DIV(channel, C8NUM);
+  for (int c = 0; c < c8; c++) {
+    int dst_off_c = c * C8NUM * height * width;
+    for (int i = 0; i < C8NUM; i++) {
+      int src_off_c = (c * C8NUM + i) * height * width;
+      for (int kh = 0; kh < height; kh++) {
+        int src_off_kh = src_off_c + kh * width;
+        for (int kw = 0; kw < width; kw++) {
+          int dst_off = dst_off_c + kw * height * C8NUM + kh * C8NUM + i;
+          ((float *)dst)[dst_off] = ((float *)src)[src_off_kh + kw];
+        }
+      }
+    }
+  }
+}
+
 void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   int c4_channel = c4 * C4NUM;
diff --git a/mindspore/lite/nnacl/pack.h b/mindspore/lite/nnacl/pack.h
index 33f6321e28..13288c50e7 100644
--- a/mindspore/lite/nnacl/pack.h
+++ b/mindspore/lite/nnacl/pack.h
@@ -64,6 +64,8 @@ void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
 
 void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
 
+void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
+
 void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel);
 
@@ -80,6 +82,8 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int
 
 void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, int width, int channel);
 
+void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel);
+
 void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWC4ToNHWCInt8(const void *src, void *dst, int batch, int plane, int channel);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
index 7958e09d82..305f2716c0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -147,7 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector
   conv_param->input_channel_ = inputs[kInputIndex]->Channel();
   conv_param->output_h_ = outputs[kOutputIndex]->Height();
   conv_param->output_w_ = outputs[kOutputIndex]->Width();
-#ifdef ENABLE_ARM64
+#if defined(ENABLE_ARM64) || defined(ENABLE_AVX)
   if (CheckConvDwUseIndirectBuffer(conv_param)) {
     kernel = new (std::nothrow)
       kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
index a3eabd9834..156ceed79a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
@@ -47,37 +47,47 @@ int ConvolutionDepthwiseIndirectCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
-  int C4 = UP_DIV(weight_tensor->Batch(), C4NUM);
-  int pack_weight_size = C4NUM * C4 * weight_tensor->Height() * weight_tensor->Width();
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
+  int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_AVX
+  PackDepthwiseIndirectWeightC8Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
+                                    weight_tensor->Batch());
+#else
   PackDepthwiseIndirectWeightC4Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
                                     weight_tensor->Batch());
+#endif
 
   auto bias_tensor = in_tensors_[kBiasIndex];
-  bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
+  bias_data_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, C4NUM * C4 * sizeof(float));
+  memset(bias_data_, 0, batch_flag * div_flag * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
     memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float));
   }
 
   // malloc zero ptr
-  zero_ptr_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
+  zero_ptr_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
   if (zero_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(zero_ptr_, 0, C4NUM * C4 * sizeof(float));
+  memset(zero_ptr_, 0, batch_flag * div_flag * sizeof(float));
   return RET_OK;
 }
@@ -139,8 +149,13 @@ int ConvDwIndirectRun(void *cdata, int task_id) {
 }
 
 int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
-  int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  int IC_DIV = UP_DIV(conv_param_->input_channel_, div_flag);
+  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * div_flag * IC_DIV;
   packed_input_ = reinterpret_cast<float *>(context_->allocator->Malloc(pack_input_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -152,14 +167,24 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_ptr = reinterpret_cast<float *>(input_tensor->data_c());
-  if (conv_param_->input_channel_ % C4NUM != 0) {
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  if (conv_param_->input_channel_ % div_flag != 0) {
     auto ret = MallocPackedInput();
     if (ret != 0) {
      MS_LOG(ERROR) << "Convolution depthwise fp32 indirect buffer MallocPackedInput failed.";
      return RET_ERROR;
    }
+#ifdef ENABLE_AVX
+    PackNHWCToNHWC8Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+#else
     PackNHWCToNHWC4Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+#endif
   } else {
     packed_input_ = input_ptr;
   }
@@ -174,7 +199,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
     MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-  if (conv_param_->input_channel_ % C4NUM != 0) {
+  if (conv_param_->input_channel_ % div_flag != 0) {
     context_->allocator->Free(packed_input_);
   }
   return RET_OK;
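
AVX registers hold eight packed floats, so under ENABLE_AVX the depthwise-indirect path above switches its channel tiling from C4 to C8: inputs are packed with PackNHWCToNHWC8Fp32, weights with PackDepthwiseIndirectWeightC8Fp32, and ConvDwInitIndirection strides by the padded channel count ic_div. The sketch below is a minimal standalone illustration of that padded NHWC8 layout; the helper name pack_nhwc_to_nhwc8, the local UP_DIV/C8NUM definitions, and the demo shape are assumptions for the example, not taken from the patch.

/* Minimal sketch of the NHWC -> NHWC8 packing layout (illustrative only). */
#include <stdio.h>
#include <string.h>

#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Copy each spatial position's channels and zero-pad them up to a multiple
 * of eight, mirroring the layout the patch's PackNHWCToNHWC8Fp32 produces. */
static void pack_nhwc_to_nhwc8(const float *src, float *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    for (int p = 0; p < plane; p++) {
      const float *s = src + (b * plane + p) * channel;
      float *d = dst + (b * plane + p) * c8_channel;
      memcpy(d, s, channel * sizeof(float));
      for (int c = channel; c < c8_channel; c++) {
        d[c] = 0.0f; /* padded lanes stay zero so 8-wide vector loads are always valid */
      }
    }
  }
}

int main(void) {
  /* 1 batch, 2 spatial positions, 5 channels -> each position padded to 8 channels. */
  float src[1 * 2 * 5] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  float dst[1 * 2 * 8];
  pack_nhwc_to_nhwc8(src, dst, 1, 2, 5);
  for (int p = 0; p < 2; p++) {
    for (int c = 0; c < 8; c++) {
      printf("%4.0f", dst[p * 8 + c]);
    }
    printf("\n");
  }
  return 0;
}

With the channel tail zero-padded, the 8-lane AVX kernel (ConvDwFp32Avx3x3) needs no per-channel remainder handling, which is why the kernel-side packing and the indirection-buffer stride both use the same rounded-up channel count.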