[MS][LITE] optimize arm cpu fp16 op: conv depthwise fp16

pull/4496/head
yangruoqi713 5 years ago
parent 6bf9732369
commit ac1d19ce57

@@ -0,0 +1,56 @@
#ifdef __aarch64__
.text
.align 5
.global ConvDwFp16Border
#ifndef __APPLE__
.type ConvDwFp16Border, %function
#endif
// void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
//                       size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
//                       size_t relu6)
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
// x8: kernel_w, x9: relu, x10: relu6
ConvDwFp16Border:
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved
    // whereas our coding style does not permit such a number of parameters
    ldr x8, [sp]                     // kernel_w: weight row stride in bytes
    ldr x9, [sp, #8]                 // relu flag
    ldr x10, [sp, #16]               // relu6 flag
    ld1 {v0.8h}, [x3]                // bias, used as the accumulator
    movi v1.8h, #0x46, lsl #8        // relu6 upper bound: 0x4600 is 6.0 in fp16
    dup v2.4s, wzr                   // relu lower bound: zero
    mov x13, x1
    mov x14, x2
    LoopH:
        mov x15, x13
        mov x16, x14
        mov x17, x5
        LoopW:
            ld1 {v3.8h}, [x15], x7   // 8 fp16 inputs, advance src by in_kw_step bytes
            ld1 {v4.8h}, [x16], #16  // 8 fp16 weights, advance 16 bytes
            fmla v0.8h, v3.8h, v4.8h
            subs x17, x17, #1
            bne LoopW
        subs x4, x4, #1
        add x13, x13, x6             // next input row: in_kh_step bytes
        add x14, x14, x8             // next weight row: kernel_w bytes
        bne LoopH
    cbnz x10, Relu6
    cbnz x9, Relu
    b Write
    Relu6:
        fmin v0.8h, v0.8h, v1.8h
    Relu:
        fmax v0.8h, v0.8h, v2.8h
    Write:
        st1 {v0.8h}, [x0]
        ret
#endif
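For readers who don't follow AArch64 assembly, here is a rough C paraphrase of what ConvDwFp16Border computes for one border output pixel. It is an editor's sketch, not part of the patch, and the helper name ConvDwFp16BorderRef is hypothetical. Note that height and width are the clipped kernel extents for this pixel, and that in_kh_step, in_kw_step and kernel_w are byte strides (the caller multiplies by sizeof(float16_t)), unlike the element strides taken by the C fallback further down.

// Editor's reference sketch of ConvDwFp16Border's semantics; not part of this commit.
#include <stddef.h>
#include <arm_neon.h>  // float16_t on AArch64 toolchains

static void ConvDwFp16BorderRef(float16_t *dst, const float16_t *src, const float16_t *weight,
                                const float16_t *bias, size_t height, size_t width, size_t in_kh_step,
                                size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) {
  float16_t acc[8];
  for (int c = 0; c < 8; c++) acc[c] = bias[c];            // v0 is initialized from the bias vector
  const char *src_kh = (const char *)src;
  const char *weight_kh = (const char *)weight;
  for (size_t kh = 0; kh < height; kh++) {                 // LoopH
    const char *src_kw = src_kh;
    const char *weight_kw = weight_kh;
    for (size_t kw = 0; kw < width; kw++) {                // LoopW
      const float16_t *s = (const float16_t *)src_kw;
      const float16_t *w = (const float16_t *)weight_kw;
      for (int c = 0; c < 8; c++) acc[c] += s[c] * w[c];   // fmla v0.8h, v3.8h, v4.8h
      src_kw += in_kw_step;                                // post-indexed load: [x15], x7
      weight_kw += 8 * sizeof(float16_t);                  // post-indexed load: [x16], #16
    }
    src_kh += in_kh_step;
    weight_kh += kernel_w;                                 // kernel_w is already a byte stride
  }
  for (int c = 0; c < 8; c++) {                            // Relu6 / Relu / Write
    if (relu6 != 0 && acc[c] > (float16_t)6.0f) acc[c] = (float16_t)6.0f;
    if ((relu != 0 || relu6 != 0) && acc[c] < (float16_t)0.0f) acc[c] = (float16_t)0.0f;
    dst[c] = acc[c];
  }
}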

@@ -28,6 +28,9 @@ extern "C" {
 #endif
 #ifdef ENABLE_ARM64
+void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
+                      size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
+                      size_t relu6);
 void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                       size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
                       size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,

@@ -20,7 +20,7 @@
 /*conv depthwise fp16 begin*/
 void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
-                              int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
+                              int height, int width, int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu,
                               bool is_relu6) {
   for (int c = 0; c < C8NUM; c++) {
     dst[c] = 0;
@@ -41,7 +41,7 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1
       weight_kw += C8NUM;
     } // kernel_w loop
     src_kh += in_kh_step;
-    weight_kh += kernel_w * C8NUM;
+    weight_kh += kernel_w_step;
   } // kernel_h loop
   for (int c = 0; c < C8NUM; c++) {
     dst[c] += bias[c];
@@ -69,11 +69,15 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
       const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
       const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;
+#ifdef ENABLE_ARM64
+      ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
+                       sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
+                       conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
+#else
       DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
-                               conv_param->is_relu6_);
+                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM,
+                               conv_param->is_relu_, conv_param->is_relu6_);
+#endif
       dst_kernel += sliding->block_channel_;
     } // width loop
     dst_h += sliding->out_h_step_;
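Since both the assembly kernel and the C fallback are exposed with the signatures shown above, a small spot check can compare them on a synthetic border pixel. This is an editor's sketch, not part of the patch: it assumes an AArch64 build of the nnacl fp16 sources is linked in, and it picks its own packed layout (a full 3x3 window, 8 channels) so the element strides for the C path and the byte strides for the assembly path describe the same data.

// Editor's spot check (hypothetical test harness, not part of this commit).
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <arm_neon.h>  // float16_t on AArch64 toolchains

void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                      size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
                      size_t relu6);
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                              int height, int width, int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu,
                              bool is_relu6);

int main(void) {
  enum { C8 = 8, KH = 3, KW = 3 };
  float16_t src[KH * KW * C8], weight[KH * KW * C8], bias[C8], out_asm[C8], out_c[C8];
  for (int i = 0; i < KH * KW * C8; i++) {
    src[i] = (float16_t)(0.01f * (float)i);
    weight[i] = (float16_t)(0.02f * (float)(i % 17));
  }
  for (int c = 0; c < C8; c++) bias[c] = (float16_t)(0.1f * (float)c);

  // C fallback: element strides (one kw step = C8 values, one weight row = KW * C8 values).
  DepthwiseBorderPixelFp16(out_c, src, weight, bias, KH, KW, KW * C8, C8, KW * C8, false, true);
  // Assembly kernel: the same strides expressed in bytes, relu6 enabled via the size_t flag.
  ConvDwFp16Border(out_asm, src, weight, bias, KH, KW, KW * C8 * sizeof(float16_t), C8 * sizeof(float16_t),
                   KW * C8 * sizeof(float16_t), 0, 1);

  for (int c = 0; c < C8; c++) {
    printf("c=%d asm=%f c_ref=%f\n", c, (float)out_asm[c], (float)out_c[c]);
  }
  return 0;
}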
