!10148 [ms][lite][cpu] x86 depthwise modify

From: @lzkcode
Reviewed-by: @zhang_xue_tong,@zhanghaibo5
Signed-off-by: @zhang_xue_tong
pull/10148/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit b859dbdc3e

@ -587,10 +587,16 @@ bool CheckConvDwUseIndirectBuffer(const ConvParameter *conv_param) {
void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, const ConvParameter *conv_param,
int step_h, int step_w) {
int ic_4 = UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM;
#ifdef ENABLE_AVX
int div = C8NUM;
#else
int div = C4NUM;
#endif
int ic_div = UP_DIV(conv_param->input_channel_, div) * div;
for (int b = 0; b < conv_param->output_batch_; b++) {
float **indirect = indirect_buffer + b * conv_param->output_h_ * step_h;
float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_4;
float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_div;
for (int oh = 0; oh < conv_param->output_h_; oh++) {
for (int kh = 0; kh < conv_param->kernel_h_; kh++) {
int ih = oh * conv_param->stride_h_ + kh * conv_param->dilation_h_ - conv_param->pad_u_;
@ -600,7 +606,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
int iw = ow * conv_param->stride_w_ + kw * conv_param->dilation_w_ - conv_param->pad_l_;
int index = oh * step_h + ow * step_w * conv_param->kernel_h_ + kw * conv_param->kernel_h_ + kh;
if (iw < conv_param->input_w_ && iw >= 0) {
indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_4;
indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_div;
} else {
indirect[index] = zero_ptr;
}
@ -619,7 +625,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
}
}
#ifndef ENABLE_ARM64
#if !defined(ENABLE_ARM64) && !defined(ENABLE_AVX)
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, int input_stride, bool relu, bool relu6, int kernel) {
do {
@ -674,6 +680,15 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
}
#endif
#ifdef ENABLE_AVX
// AVX dispatch for one output row of the indirect-buffer depthwise conv.
// Only the 9-tap (3x3) case has an AVX implementation (ConvDwFp32Avx3x3);
// any other kernel size is a no-op here — presumably guaranteed not to reach
// this path by the indirect-buffer eligibility check (TODO confirm).
// input_stride counts float* entries; the asm kernel expects bytes.
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                           int output_width, int input_stride, bool relu, bool relu6, int kernel) {
  if (kernel != 9) {
    return;
  }
  size_t stride_bytes = input_stride * sizeof(float *);
  ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, stride_bytes, relu, relu6);
}
#endif
void ConvDwIndirection(float *output_data, float **indirect_buffer, const float *weight_data, const float *bias_data,
float *zero_ptr, const ConvParameter *conv_param, int task_id) {
int step_w = conv_param->dilation_w_ == 1 ? conv_param->stride_w_ : conv_param->kernel_w_;

@ -66,6 +66,11 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c
int output_width, size_t input_stride, size_t relu, size_t relu6);
#endif
#ifdef ENABLE_AVX
void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, size_t input_stride, size_t relu, size_t relu6);
#endif
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, int input_stride, bool relu, bool relu6, int kernel);

@ -500,6 +500,30 @@ void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int c
}
}
/* Packs NHWC float data into NHWC8 layout: the channel dimension is padded
 * up to a multiple of C8NUM and the padding lanes are zero-filled.
 * When channel is already a multiple of C8NUM the two layouts coincide,
 * so the whole tensor is copied with a single memcpy. */
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  const float *src_f = (const float *)src;
  float *dst_f = (float *)dst;
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  if (channel % C8NUM == 0) {
    /* Fast path: no padding lanes, layouts are identical. */
    memcpy(dst_f, src_f, (size_t)batch * plane * channel * sizeof(float));
    return;
  }
  for (int b = 0; b < batch; b++) {
    const float *src_batch = src_f + b * plane * channel;
    float *dst_batch = dst_f + b * plane * c8_channel;
    for (int p = 0; p < plane; p++) {
      float *dst_plane = dst_batch + p * c8_channel;
      memcpy(dst_plane, src_batch + p * channel, channel * sizeof(float));
      /* Zero the channel tail so padded lanes contribute nothing downstream. */
      for (int c = channel; c < c8_channel; ++c) {
        dst_plane[c] = 0;
      }
    }
  }
}
void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) {
int c4 = UP_DIV(channel, C4NUM);
int ic_remainder_ = channel % C4NUM;
@ -600,6 +624,23 @@ void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, i
}
}
/* Packs depthwise weights from channel-major HxW planes into the C8-blocked,
 * kw/kh-interleaved layout consumed by the AVX indirect depthwise kernel:
 *   dst index = block * C8NUM*H*W + kw * H*C8NUM + kh * C8NUM + lane
 * so for a fixed (kw, kh) the 8 channel lanes of a block are contiguous.
 *
 * Fix vs. the original: when channel %% C8NUM != 0 the original read `src`
 * for channel indices >= channel — an out-of-bounds read (undefined
 * behavior) that also copied garbage into the padding lanes of `dst`.
 * Tail lanes are now zero-filled instead; valid lanes are packed exactly as
 * before, and zero weights in padding lanes leave kernel results unchanged
 * (the corresponding packed-input lanes are zero-padded as well). */
void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel) {
  const float *src_f = (const float *)src;
  float *dst_f = (float *)dst;
  int c8 = UP_DIV(channel, C8NUM);
  for (int c = 0; c < c8; c++) {
    int dst_off_c = c * C8NUM * height * width;
    for (int i = 0; i < C8NUM; i++) {
      int src_channel = c * C8NUM + i;
      int src_off_c = src_channel * height * width;
      for (int kh = 0; kh < height; kh++) {
        int src_off_kh = src_off_c + kh * width;
        for (int kw = 0; kw < width; kw++) {
          int dst_off = dst_off_c + kw * height * C8NUM + kh * C8NUM + i;
          /* Guard the channel tail: never read past the end of src. */
          dst_f[dst_off] = (src_channel < channel) ? src_f[src_off_kh + kw] : 0.0f;
        }
      }
    }
  }
}
void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) {
int c4 = UP_DIV(channel, C4NUM);
int c4_channel = c4 * C4NUM;

@ -64,6 +64,8 @@ void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel);
@ -80,6 +82,8 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int
void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, int width, int channel);
void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel);
void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWC4ToNHWCInt8(const void *src, void *dst, int batch, int plane, int channel);

@ -147,7 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *>
conv_param->input_channel_ = inputs[kInputIndex]->Channel();
conv_param->output_h_ = outputs[kOutputIndex]->Height();
conv_param->output_w_ = outputs[kOutputIndex]->Width();
#ifdef ENABLE_ARM64
#if defined(ENABLE_ARM64) || defined(ENABLE_AVX)
if (CheckConvDwUseIndirectBuffer(conv_param)) {
kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);

@ -47,37 +47,47 @@ int ConvolutionDepthwiseIndirectCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
int C4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * C4 * weight_tensor->Height() * weight_tensor->Width();
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
#ifdef ENABLE_AVX
PackDepthwiseIndirectWeightC8Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
weight_tensor->Batch());
#else
PackDepthwiseIndirectWeightC4Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
weight_tensor->Batch());
#endif
auto bias_tensor = in_tensors_[kBiasIndex];
bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
bias_data_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * C4 * sizeof(float));
memset(bias_data_, 0, batch_flag * div_flag * sizeof(float));
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float));
}
// malloc zero ptr
zero_ptr_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
zero_ptr_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
if (zero_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(zero_ptr_, 0, C4NUM * C4 * sizeof(float));
memset(zero_ptr_, 0, batch_flag * div_flag * sizeof(float));
return RET_OK;
}
@ -139,8 +149,13 @@ int ConvDwIndirectRun(void *cdata, int task_id) {
}
int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
int IC_DIV = UP_DIV(conv_param_->input_channel_, div_flag);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * div_flag * IC_DIV;
packed_input_ = reinterpret_cast<float *>(context_->allocator->Malloc(pack_input_size * sizeof(float)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
@ -152,14 +167,24 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
int ConvolutionDepthwiseIndirectCPUKernel::Run() {
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_ptr = reinterpret_cast<float *>(input_tensor->data_c());
if (conv_param_->input_channel_ % C4NUM != 0) {
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
if (conv_param_->input_channel_ % div_flag != 0) {
auto ret = MallocPackedInput();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp32 indirect buffer MallocPackedInput failed.";
return RET_ERROR;
}
#ifdef ENABLE_AVX
PackNHWCToNHWC8Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
#else
PackNHWCToNHWC4Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
#endif
} else {
packed_input_ = input_ptr;
}
@ -174,7 +199,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
if (conv_param_->input_channel_ % div_flag != 0) {
context_->allocator->Free(packed_input_);
}
return RET_OK;

Loading…
Cancel
Save