diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
new file mode 100644
index 0000000000..5bf15c59e9
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
@@ -0,0 +1,63 @@
+#ifdef ENABLE_ARM32
+
+.text
+.align 5
+.global ConvDwFp32Border
+#ifndef __APPLE__
+.type ConvDwFp32Border, %function
+#endif
+
+// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+//                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
+// r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
+// r8: kernel_w, r9: relu, r10: relu6
+ConvDwFp32Border:
+    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
+    push {r4-r12, lr}
+    vpush {q4-q7}
+    // sp stays below the 104-byte save area (10 core regs + q4-q7) so it remains protected; stack args start at [sp, #104]
+
+    ldr r4, [sp, #104] // height
+    ldr r5, [sp, #108] // width
+    ldr r6, [sp, #112] // in_kh_step
+    ldr r7, [sp, #116] // in_kw_step
+    ldr r8, [sp, #120] // kernel_w
+    ldr r9, [sp, #124] // relu
+    ldr r10, [sp, #128] // relu6
+
+    vld1.32 {q0}, [r3] // bias
+    vmov.i32 q1, #6 // relu6
+    vcvt.f32.s32 q1, q1
+    veor q2, q2, q2 // relu
+
+    LoopH:
+        mov r11, r1 // src pointer for this kernel row
+        mov r12, r2 // weight pointer for this kernel row
+        mov r14, r5 // taps left in this row
+        LoopW:
+            vld1.32 {q3}, [r11], r7 // load 4 src floats, then advance by in_kw_step
+            vld1.32 {q4}, [r12]! // load 4 weights, then advance by 16 bytes
+            vmla.f32 q0, q3, q4 // accumulate on top of bias
+            subs r14, r14, #1
+            bne LoopW
+        subs r4, r4, #1
+        add r1, r1, r6 // plain add keeps the flags from subs for the bne below
+        add r2, r2, r8
+        bne LoopH
+
+    cmp r10, #0
+    bne Relu6
+    cmp r9, #0
+    bne Relu
+    b Write
+    Relu6:
+        vmin.f32 q0, q0, q1 // clamp to <= 6.0, then fall through to the >= 0 clamp
+    Relu:
+        vmax.f32 q0, q0, q2
+    Write:
+        vst1.32 {q0}, [r0]
+
+    // sp still points at the save area, so the pops below restore registers directly
+    vpop {q4-q7}
+    pop {r4-r12, pc}
+#endif
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
index c8398ca03d..a90d2fa014 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
@@ -11,9 +11,9 @@
 // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
 //                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
 //                      size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
-// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: weight, #56: kernel_h, #60: kernel_w,
-// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step
-// #88: relu, #92: relu6
+// r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
+// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step
+// #40: relu, #44: relu6
 ConvDwFp32Center:
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
@@ -24,7 +24,7 @@ ConvDwFp32Center:
     vpush {q4-q7}
     add sp, sp, #112
 
-    ldr r4, [sp, #48]
+    ldr r4, [sp] // height
 
     vld1.32 {q13}, [r3]
     vmov.i32 q14, #6
@@ -32,22 +32,25 @@ ConvDwFp32Center:
     veor q15, q15, q15
 
     LoopH:
-        ldr r1, [sp, #4] // src_w
-        ldr r5, [sp, #52] // width
-        ldr r0, [sp] // dst_w
+        ldr r1, [sp, #-44] // src_w, src_h = src
+        ldr r5, [sp, #4] // width
+        ldr r0, [sp, #-48] // dst_w, dst_h = dst
         cmp r5, #4
         blt LoopW
         LoopW4:
-            ldr r11, [sp, #76] // in_sw_step
-            mov r8, r1 // src_kh
-            ldr r2, [sp, #8] // weight_kh
-            ldr r6, [sp, #56] // kernel_h
+            ldr r11, [sp, #28] // in_sw_step
+            mov r8, r1 // src_kh, src_w
+            ldr r2, [sp, #-40] // weight_kh, weight
+            ldr r6, [sp, #8] // kernel_h
             vmov q0, q13
+            vmov q1, q13
+            vmov q2, q13
+            vmov q3, q13
             LoopKh4:
-                ldr r12, [sp, #80] //in_kh_step
-                ldr r7, [sp, #60] // kernel_w
-                mov lr, r8 // src_kw
+                ldr r7, [sp, #12] // kernel_w
+                mov lr, r8 // src_kw, src_kh
                 LoopKw4:
+                    ldr r12, [sp, #36] //in_kw_step
                     mov r10, lr
                     vld1.32 {q12}, [r2]!
                     vld1.32 {q4}, [r10]
@@ -65,14 +68,14 @@ ConvDwFp32Center:
                     subs r7, r7, #1
                     add lr, lr, r12
                     bne LoopKw4
-                ldr r12, [sp, #80]
+                ldr r12, [sp, #32] // in_kh_step
                 add r8, r8, r12
                 subs r6, r6, #1
                 bne LoopKh4
-            ldr r12, [sp, #92]
+            ldr r12, [sp, #44]
             cmp r12, #0
             bne Relu64
-            ldr r12, [sp, #88]
+            ldr r12, [sp, #40]
             cmp r12, #0
             bne Relu4
             b Write4
@@ -87,7 +90,7 @@ ConvDwFp32Center:
             vmax.f32 q2, q2, q15
             vmax.f32 q3, q3, q15
         Write4:
-            ldr r12, [sp, #68]
+            ldr r12, [sp, #20] // block_channel
             vst1.32 {q0}, [r0]
             add r0, r0, r12
             vst1.32 {q1}, [r0]
@@ -98,36 +101,36 @@ ConvDwFp32Center:
             add r0, r0, r12
             mov r12, #4
             mul r11, r11, r12
-            add r1, r1, r11
+            add r1, r1, r11 // src_w += in_sw_step
             sub r5, r5, #4
             cmp r5, #0
             ble LoopWEnd
             cmp r5, #4
             bge LoopW
         LoopW:
-            mov r8, r1 // src_kh
-            ldr r2, [sp, #8] // weight_kh
-            ldr r6, [sp, #56] // kernel_h
-            vmov q0, q13
+            mov r8, r1 // src_kh, src_w
+            ldr r2, [sp, #-40] // weight_kh, weight
+            ldr r6, [sp, #8] // kernel_h
+            vmov q0, q13 // bias
             LoopKh:
-                ldr r12, [sp, #84] //in_kw_step
-                ldr r7, [sp, #60] // kernel_w
-                mov r10, r8 // src_kw
+                ldr r7, [sp, #12] // kernel_w
+                mov r10, r8 // src_kw, src_kh
                 LoopKw:
+                    ldr r12, [sp, #36] //in_kw_step
                     vld1.32 {q1}, [r10]
                     add r10, r10, r12
                     vld1.32 {q12}, [r2]!
                     vmla.f32 q0, q1, q12
                     subs r7, r7, #1
                     bne LoopKw
-                ldr r12, [sp, #80]
+                ldr r12, [sp, #32] // in_kh_step
                 add r8, r8, r12
                 subs r6, r6, #1
                 bne LoopKh
-            ldr r12, [sp, #92]
+            ldr r12, [sp, #44]
             cmp r12, #0
             bne Relu6
-            ldr r12, [sp, #88]
+            ldr r12, [sp, #40]
             cmp r12, #0
             bne Relu
             b Write
@@ -136,22 +139,24 @@ ConvDwFp32Center:
         Relu:
             vmax.f32 q0, q0, q15
         Write:
-            ldr r12, [sp, #68]
-            vst1.32 {q0}, [r0]
+            ldr r12, [sp, #20] // block_channel
+            vst1.32 {q0}, [r0] // dst_kw += block_channel
             add r0, r0, r12
-            ldr r12, [sp, #76]
-            add r1, r1, r12
+            ldr r12, [sp, #28] // in_sw_step
+            add r1, r1, r12 // src_w += in_sw_step
             subs r5, r5, #1
             bne LoopW
-        ldr r3, [sp, #64]
-        ldr r12, [sp]
+        ldr r3, [sp, #16] // out_h_step
+        ldr r12, [sp, #-48]
         add r12, r12, r3
-        str r12, [sp]
-        ldr r3, [sp, #72]
-        ldr r12, [sp, #4]
+        str r12, [sp, #-48]
+
+        ldr r3, [sp, #24] // in_sh_step
+        ldr r12, [sp, #-44] // src_h += in_sh_step
         add r12, r12, r3
-        str r12, [sp, #4]
-        subs r4, r4, #1
+        str r12, [sp, #-44]
+
+        subs r4, r4, #1 // height
         bne LoopH
     LoopWEnd:
     sub sp, sp, #112
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
new file mode 100644
index 0000000000..519a2da43a
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
@@ -0,0 +1,113 @@
+#ifdef ENABLE_ARM32
+
+.text
+.align 5
+.global ConvDwFp32Row
+#ifndef __APPLE__
+.type ConvDwFp32Row, %function
+#endif
+
+// void ConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr,
+//                    size_t num_pixels, size_t input_channel, size_t input_step)
+// r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels,
+// r4: input_channel, r5: input_step
+ConvDwFp32Row:
+    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
+
+    push {r4-r6, r8, r10, r11}
+    vpush {q4-q7}
+    // sp stays below the 88-byte save area (6 core regs + q4-q7) so it remains protected; stack args start at [sp, #88]
+    mov r11, r0 // r11: output write pointer, r0: output read pointer
+    ldr r4, [sp, #88] // input_channel
+    ldr r5, [sp, #92] // input_step
+    mov r6, #4
+    mul r5, r5, r6 // input_step * 4
+    cmp r3, #0
+    beq End
+
+    LoopNumPixel:
+        mov r6, r1 // input_ptr
+        mov r8, r2 // filter_ptr
+        mov r10, r4 // input_channel
+
+        LoopDepth16In:
+            cmp r10, #16
+            blt L4
+            sub r10, r10, #16
+
+            vld1.32 {q0, q1}, [r6]!
+            vld1.32 {q4, q5}, [r8]!
+            vld1.32 {q8, q9}, [r0]!
+
+            cmp r10, #16
+            blt LoopDepth16Out
+            LoopDepth16:
+                vmla.f32 q8, q0, q4
+                vmla.f32 q9, q1, q5
+                vst1.32 {q8, q9}, [r11]!
+
+                vld1.32 {q2, q3}, [r6]!
+                vld1.32 {q6, q7}, [r8]!
+                vld1.32 {q10, q11}, [r0]!
+                vmla.f32 q10, q2, q6
+                vmla.f32 q11, q3, q7
+                vst1.32 {q10, q11}, [r11]!
+
+                vld1.32 {q0, q1}, [r6]!
+                vld1.32 {q4, q5}, [r8]!
+                vld1.32 {q8, q9}, [r0]!
+
+                sub r10, r10, #16
+                cmp r10, #16
+                bge LoopDepth16
+
+            LoopDepth16Out:
+                vmla.f32 q8, q0, q4
+                vmla.f32 q9, q1, q5
+                vst1.32 {q8, q9}, [r11]!
+
+                vld1.32 {q2, q3}, [r6]!
+                vld1.32 {q6, q7}, [r8]!
+                vld1.32 {q10, q11}, [r0]!
+                vmla.f32 q10, q2, q6
+                vmla.f32 q11, q3, q7
+                vst1.32 {q10, q11}, [r11]!
+
+        L4:
+            cmp r10, #4
+            blt L0
+
+            LoopDepth4:
+                vld1.32 {q0}, [r6]!
+                vld1.32 {q4}, [r8]!
+                vld1.32 {q8}, [r0]!
+                vmla.f32 q8, q0, q4
+                vst1.32 {q8}, [r11]!
+                sub r10, r10, #4
+                cmp r10, #4
+                bge LoopDepth4
+
+        L0:
+            cmp r10, #0
+            beq Loop16LineEnd
+
+            LoopDepth0:
+                vld1.32 {s0}, [r6]!
+                vld1.32 {s1}, [r8]!
+                vld1.32 {s2}, [r0]!
+                vmla.f32 s2, s0, s1
+                vst1.32 {s2}, [r11]!
+                subs r10, r10, #1
+                bne LoopDepth0
+
+        Loop16LineEnd:
+            subs r3, r3, #1
+            add r1, r1, r5 // plain add keeps the flags from subs for the bne below
+            bne LoopNumPixel
+
+    End:
+    // sp still points at the save area, so the pops below restore registers directly
+    vpop {q4-q7}
+    pop {r4-r6, r8, r10, r11}
+    bx lr
+#endif
diff --git a/mindspore/lite/nnacl/fp32/common_func.h b/mindspore/lite/nnacl/fp32/common_func.h
index b92c942ecd..cb1fceeeac 100644
--- a/mindspore/lite/nnacl/fp32/common_func.h
+++ b/mindspore/lite/nnacl/fp32/common_func.h
@@ -40,6 +40,11 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
 void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
+void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
+                   size_t output_channel, size_t input_step);
+
+void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+                      size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
 #endif
 
 #ifdef ENABLE_ARM64
@@ -49,12 +54,6 @@ void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
 void Relu6(float *data, size_t element4);
 void Relu(float *data, size_t element4);
 
-void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
-                   size_t output_channel, size_t input_step);
-
-void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
-                      size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
-
 void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
 
@@ -70,5 +69,4 @@ void ConvSwFp32Center(float *dst, const float *src, const float *weight, const f
 #ifdef __cplusplus
 }
 #endif
-
 #endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise.c b/mindspore/lite/nnacl/fp32/conv_depthwise.c
index b80166190d..bf87f47d18 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise.c
@@ -21,7 +21,7 @@
 #include
 #endif
 
-#ifndef ENABLE_ARM64
+#ifndef ENABLE_ARM
 void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels,
                    int output_channel, int input_step) {
   for (int i = 0; i < num_pixels; i++) {
@@ -202,7 +202,7 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
 
       const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
       const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM;
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
       ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                        sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
                        conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6);
@@ -286,7 +286,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
       int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
       const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
       float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
       ConvDwFp32Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
                        conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float),
                        sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float),