From 59d971044761057457ac215a473cf0a185b81398 Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Sun, 16 Aug 2020 12:06:06 +0800 Subject: [PATCH] [MS][LITE] optimize arm cpu fp32/fp16 op: add assembly file for deconv depthwise border --- .../nnacl/assembly/arm64/DeconvDwFp32Border.S | 39 +++++++++++++++++++ .../nnacl/assembly/opt/DeconvDwFp16Border.S | 39 +++++++++++++++++++ .../kernel/arm/nnacl/fp16/common_func.h | 2 + .../arm/nnacl/fp16/conv_depthwise_fp16.c | 13 +++++-- .../kernel/arm/nnacl/fp32/common_func.h | 4 ++ .../kernel/arm/nnacl/fp32/conv_depthwise.c | 13 +++++-- 6 files changed, 102 insertions(+), 8 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/DeconvDwFp32Border.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/DeconvDwFp16Border.S diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/DeconvDwFp32Border.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/DeconvDwFp32Border.S new file mode 100644 index 0000000000..dd65bcaed8 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/DeconvDwFp32Border.S @@ -0,0 +1,39 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global DeconvDwFp32Border +#ifndef __APPLE__ +.type DeconvDwFp32Border, %function +#endif + +// void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, +// size_t in_kh_step, size_t in_kw_step, size_t kernel_w) + +// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w; x5 ~ x7 are byte strides, x3/x4 must be >= 1 +DeconvDwFp32Border: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should also be preserved + // whereas our coding style does not permit such a number of parameters + ld1 {v1.4s}, [x1] + + mov x13, x0 + mov x14, x2 + LoopH: + mov x15, x13 + mov x16, x14 + 
mov x17, x4 + LoopW: + ld1 {v0.4s}, [x15] + ld1 {v2.4s}, [x16], #16 + fmla v0.4s, v1.4s, v2.4s + st1 {v0.4s}, [x15], x6 + subs x17, x17, #1 + bne LoopW + subs x3, x3, #1 + add x13, x13, x5 + add x14, x14, x7 + bne LoopH + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/DeconvDwFp16Border.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/DeconvDwFp16Border.S new file mode 100644 index 0000000000..73d5232233 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/DeconvDwFp16Border.S @@ -0,0 +1,39 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global DeconvDwFp16Border +#ifndef __APPLE__ +.type DeconvDwFp16Border, %function +#endif + +// void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, +// size_t in_kh_step, size_t in_kw_step, size_t kernel_w) + +// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w; x5 ~ x7 are byte strides, x3/x4 must be >= 1 +DeconvDwFp16Border: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should also be preserved + // whereas our coding style does not permit such a number of parameters + ld1 {v1.8h}, [x1] + + mov x13, x0 + mov x14, x2 + LoopH: + mov x15, x13 + mov x16, x14 + mov x17, x4 + LoopW: + ld1 {v0.8h}, [x15] + ld1 {v2.8h}, [x16], #16 + fmla v0.8h, v1.8h, v2.8h + st1 {v0.8h}, [x15], x6 + subs x17, x17, #1 + bne LoopW + subs x3, x3, #1 + add x13, x13, x5 + add x14, x14, x7 + bne LoopH + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h index 96ad7bde09..deb8e581c8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h @@ -35,6 +35,8 @@ void ConvDwFp16Center(float16_t 
*dst, const float16_t *src, const float16_t *wei size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, + size_t in_kh_step, size_t in_kw_step, size_t kernel_w); void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c index b41dd8e495..e5870f8f4b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c @@ -184,7 +184,7 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo /*deconv depthwise fp16 begin*/ void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, - int width, int in_kh_step, int in_kw_step, int kernel_w) { + int width, int in_kh_step, int in_kw_step, int kernel_w_step) { float16_t *dst_kh = dst; const float16_t *weight_kh = weight; for (int kh = 0; kh < height; kh++) { @@ -201,7 +201,7 @@ void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const weight_kw += C8NUM; } // kernel_w loop dst_kh += in_kh_step; - weight_kh += kernel_w * C8NUM; + weight_kh += kernel_w_step; } // kernel_h loop } @@ -224,9 +224,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; float16_t *dst_kernel = dst_w + 
start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; - +#ifdef ENABLE_ARM64 + DeconvDwFp16Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, + sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), + conv_param->kernel_w_ * C8NUM * sizeof(float16_t)); +#else DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_); + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM); +#endif src_kernel += sliding->block_channel_; } // width loop src_h += sliding->out_h_step_; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h index c254219003..fa6a8b1d75 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h @@ -61,6 +61,10 @@ void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_ void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); + +void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, + size_t in_kh_step, size_t in_kw_step, size_t kernel_w); + void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, size_t plane_size, size_t stride, size_t relu_type); #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c index a6806f1ba8..20b5ce7b6b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c @@ -634,7 +634,7 @@ void 
ConvDw3x3Fp32(float *output_data, const float *input_data, const float *wei /*deconv depthwise fp32 begin*/ void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weight, int height, int width, - int in_kh_step, int in_kw_step, int kernel_w) { + int in_kh_step, int in_kw_step, int kernel_w_step) { float *dst_kh = dst; const float *weight_kh = weight; for (int kh = 0; kh < height; kh++) { @@ -656,7 +656,7 @@ void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weigh weight_kw += C4NUM; } // kernel_w loop dst_kh += in_kh_step; - weight_kh += kernel_w * C4NUM; + weight_kh += kernel_w_step; } // kernel_h loop } @@ -678,9 +678,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; float *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; - +#ifdef ENABLE_ARM64 + DeconvDwFp32Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, + sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), + conv_param->kernel_w_ * C4NUM * sizeof(float)); +#else DeconvDepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_); + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM); +#endif src_kernel += sliding->block_channel_; } // width loop src_h += sliding->out_h_step_;