add fp32 sliding window kernel

5 years ago · a5bd254821
parent 7a8fbbbb4b
commit a5bd254821
6 changed files with 481 additions and 8 deletions
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -258,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
    kernel =
      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
  } else if (use_sw) {
-    // kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
-    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  }
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
@@ -18,7 +18,9 @@ ConvDwFp32Center:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit this many parameters
-    sub sp, sp, #48
+    sub sp, sp, #176
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    stp x19, x20, [sp], #16
    stp x21, x22, [sp], #16
    stp x23, x24, [sp], #16
@@ -287,7 +289,9 @@ ConvDwFp32Center:
        subs x4, x4, #1
        bne LoopH

-    sub sp, sp, #48
+    sub sp, sp, #176
+    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ldp x19, x20, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x23, x24, [sp], #16
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -19,7 +19,9 @@ ConvDwInt8Center:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit this many parameters
-    sub sp, sp, #48
+    sub sp, sp, #176
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    stp x19, x20, [sp], #16
    stp x21, x22, [sp], #16
    stp x23, x24, [sp], #16
@@ -631,7 +633,9 @@ ConvDwInt8Center:
        subs x4, x4, #1
        bne LoopH

-    sub sp, sp, #48
+    sub sp, sp, #176
+    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ldp x19, x20, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x23, x24, [sp], #16
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
@@ -71,6 +71,11 @@ void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_

 void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
                        size_t plane_size, size_t stride, size_t relu_type);
+
+void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
+                      size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
+                      size_t ic4, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
+                      size_t relu, size_t relu6);
 #endif

 #ifdef __cplusplus
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
@@ -16,6 +16,7 @@

 #include "nnacl/fp32/conv.h"
 #include <string.h>
+#include "nnacl/fp32/common_func.h"
 #include "nnacl/winograd_transform.h"

 void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
@@ -83,6 +84,7 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
  }  // height loop
 }

+#ifndef ENABLE_ARM64
 void SWCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width, int kernel_h,
              int kernel_w, int out_h_step, int block_channel, int ic4, int in_sh_step, int in_sw_step, int in_kh_step,
              int in_kw_step, bool is_relu, bool is_relu6) {
@@ -135,6 +137,7 @@ void SWCenter(float *dst, const float *src, const float *weight, const float *bi
    src_h += in_sh_step;
  }  // dst_height loop
 }
+#endif

 // fp32 sliding window
 void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *tmp_out_block,
@@ -172,11 +175,23 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
          src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
        float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
                       slidingWindow_param->left_ * slidingWindow_param->block_channel_;
+#ifdef ENABLE_ARM64
+        ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
+                         slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
+                         conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
+                         slidingWindow_param->block_channel_ * sizeof(float), ic4,
+                         slidingWindow_param->in_sh_step_ * sizeof(float),
+                         slidingWindow_param->in_sw_step_ * sizeof(float),
+                         slidingWindow_param->in_kh_step_ * sizeof(float),
+                         slidingWindow_param->in_kw_step_ * sizeof(float),
+                         conv_param->is_relu_, conv_param->is_relu6_);
+#else
        SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
-                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
-                 slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
+                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
+                 conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
                 slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
                 slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
+#endif
      }
    }  // output C4 loop
    src += slidingWindow_param->in_step_;