!6182 [MS][LITE][Develop] add arm32 fp32 DwBorder, Row, Center op

Merge pull request !6182 from liuzhongkai/arm32_new1
pull/6182/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 6873b53043

@ -0,0 +1,63 @@
#ifdef ENABLE_ARM32

.text
.align 5
.global ConvDwFp32Border
#ifndef __APPLE__
.type ConvDwFp32Border, %function
#endif

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height,
//                       size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
//                       size_t relu6)
//
// Computes, on four fp32 lanes at once:
//     acc  = bias[0..3]
//     acc += src[...] * weight[...]   for every (row, column) of a height x width window
// then optionally clamps acc and stores the four lanes at dst.
//
// Incoming (AAPCS32): r0 = dst, r1 = src, r2 = weight, r3 = bias; the other seven
// arguments are passed on the stack and are loaded by the prologue into:
//     r4 = height, r5 = width, r6 = in_kh_step, r7 = in_kw_step,
//     r8 = kernel_w, r9 = relu, r10 = relu6
// in_kh_step, in_kw_step and kernel_w are added to the pointers as-is, i.e. they are
// byte strides. height and width are used as down-counters and are not checked for 0.
ConvDwFp32Border:
    // Callee-saved state per AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf).
    // Save area size: 10 core registers * 4 + 4 q registers * 16 = 104 bytes.
    push {r4-r12, lr}
    vpush {q4-q7}

    // sp is deliberately left pointing at the save area: memory below sp may be
    // overwritten asynchronously (e.g. by a signal frame), so the saved registers
    // must stay at or above sp. The stack arguments therefore start at sp + 104.
    ldr r4, [sp, #104]      // height
    ldr r5, [sp, #108]      // width
    ldr r6, [sp, #112]      // in_kh_step
    ldr r7, [sp, #116]      // in_kw_step
    ldr r8, [sp, #120]      // kernel_w
    ldr r9, [sp, #124]      // relu
    ldr r10, [sp, #128]     // relu6

    vld1.32 {q0}, [r3]      // q0 = accumulator, seeded with bias
    vmov.i32 q1, #6
    vcvt.f32.s32 q1, q1     // q1 = 6.0f in every lane (upper clamp for relu6)
    veor q2, q2, q2         // q2 = 0.0f in every lane (lower clamp)

LoopH:
    mov r11, r1             // r11 = src cursor for this row
    mov r12, r2             // r12 = weight cursor for this row
    mov r14, r5             // r14 = columns left in this row
LoopW:
    vld1.32 {q3}, [r11], r7 // load 4 src floats, advance by in_kw_step
    vld1.32 {q4}, [r12]!    // load 4 weights, advance by 16 bytes
    vmla.f32 q0, q3, q4
    subs r14, r14, #1
    bne LoopW

    subs r4, r4, #1         // sets the flags tested by "bne LoopH"; the adds below keep them
    add r1, r1, r6          // src    += in_kh_step
    add r2, r2, r8          // weight += kernel_w
    bne LoopH

    // relu6 takes precedence over relu; relu6 clamps to [0, 6], relu clamps to [0, +inf).
    cmp r10, #0
    bne Relu6
    cmp r9, #0
    bne Relu
    b Write
Relu6:
    vmin.f32 q0, q0, q1     // falls through into Relu for the lower clamp
Relu:
    vmax.f32 q0, q0, q2
Write:
    vst1.32 {q0}, [r0]

    vpop {q4-q7}
    pop {r4-r12, pc}
#endif

@ -11,9 +11,9 @@
// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: weight, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step
// #88: relu, #92: relu6
// r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step
// #40: relu, #44: relu6
ConvDwFp32Center:
// at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
@ -24,7 +24,7 @@ ConvDwFp32Center:
vpush {q4-q7}
add sp, sp, #112
ldr r4, [sp, #48]
ldr r4, [sp] // height
vld1.32 {q13}, [r3]
vmov.i32 q14, #6
@ -32,22 +32,25 @@ ConvDwFp32Center:
veor q15, q15, q15
LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
ldr r1, [sp, #-44] // src_w, src_h = src
ldr r5, [sp, #4] // width
ldr r0, [sp, #-48] // dst_w, dst_h = dst
cmp r5, #4
blt LoopW
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
ldr r11, [sp, #28] // in_sw_step
mov r8, r1 // src_kh, src_w
ldr r2, [sp, #-40] // weight_kh, weight
ldr r6, [sp, #8] // kernel_h
vmov q0, q13
vmov q1, q13
vmov q2, q13
vmov q3, q13
LoopKh4:
ldr r12, [sp, #80] //in_kh_step
ldr r7, [sp, #60] // kernel_w
mov lr, r8 // src_kw
ldr r7, [sp, #12] // kernel_w
mov lr, r8 // src_kw, src_kh
LoopKw4:
ldr r12, [sp, #36] //in_kw_step
mov r10, lr
vld1.32 {q12}, [r2]!
vld1.32 {q4}, [r10]
@ -65,14 +68,14 @@ ConvDwFp32Center:
subs r7, r7, #1
add lr, lr, r12
bne LoopKw4
ldr r12, [sp, #80]
ldr r12, [sp, #32] // in_kh_step
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4
ldr r12, [sp, #92]
ldr r12, [sp, #44]
cmp r12, #0
bne Relu64
ldr r12, [sp, #88]
ldr r12, [sp, #40]
cmp r12, #0
bne Relu4
b Write4
@ -87,7 +90,7 @@ ConvDwFp32Center:
vmax.f32 q2, q2, q15
vmax.f32 q3, q3, q15
Write4:
ldr r12, [sp, #68]
ldr r12, [sp, #20] // block_channel
vst1.32 {q0}, [r0]
add r0, r0, r12
vst1.32 {q1}, [r0]
@ -98,36 +101,36 @@ ConvDwFp32Center:
add r0, r0, r12
mov r12, #4
mul r11, r11, r12
add r1, r1, r11
add r1, r1, r11 // src_w += in_sw_step
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW
LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q13
mov r8, r1 // src_kh, src_w
ldr r2, [sp, #-40] // weight_kh, weight
ldr r6, [sp, #8] // kernel_h
vmov q0, q13 // bias
LoopKh:
ldr r12, [sp, #84] //in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
ldr r7, [sp, #12] // kernel_w
mov r10, r8 // src_kw, src_kh
LoopKw:
ldr r12, [sp, #36] //in_kw_step
vld1.32 {q1}, [r10]
add r10, r10, r12
vld1.32 {q12}, [r2]!
vmla.f32 q0, q1, q12
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
ldr r12, [sp, #32] // in_kh_step
add r8, r8, r12
subs r6, r6, #1
bne LoopKh
ldr r12, [sp, #92]
ldr r12, [sp, #44]
cmp r12, #0
bne Relu6
ldr r12, [sp, #88]
ldr r12, [sp, #40]
cmp r12, #0
bne Relu
b Write
@ -136,22 +139,24 @@ ConvDwFp32Center:
Relu:
vmax.f32 q0, q0, q15
Write:
ldr r12, [sp, #68]
vst1.32 {q0}, [r0]
ldr r12, [sp, #20] // block_channel
vst1.32 {q0}, [r0] // dst_kw += block_channel
add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
ldr r12, [sp, #28] // in_sw_step
add r1, r1, r12 // src_w += in_sw_step
subs r5, r5, #1
bne LoopW
ldr r3, [sp, #64]
ldr r12, [sp]
ldr r3, [sp, #16] // out_h_step
ldr r12, [sp, #-48]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
str r12, [sp, #-48]
ldr r3, [sp, #24] // in_sh_step
ldr r12, [sp, #-44] // src_h += in_sh_step
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
str r12, [sp, #-44]
subs r4, r4, #1 // height
bne LoopH
LoopWEnd:
sub sp, sp, #112

@ -0,0 +1,113 @@
#ifdef ENABLE_ARM32

.text
.align 5
.global ConvDwFp32Row
#ifndef __APPLE__
.type ConvDwFp32Row, %function
#endif

// void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *filter_ptr,
//                    size_t num_pixels, size_t input_channel, size_t input_step)
//
// For each of num_pixels pixels:
//     output_ptr[c] += input_ptr[c] * filter_ptr[c]   for c in [0, input_channel)
// then output_ptr advances by input_channel floats, input_ptr advances by input_step
// floats, and filter_ptr restarts from its beginning.
//
// Incoming (AAPCS32): r0 = output_ptr, r1 = input_ptr, r2 = filter_ptr, r3 = num_pixels;
// input_channel and input_step are passed on the stack and loaded into r4 / r5.
//
// Register roles inside the loops:
//     r0  = output read cursor      r11 = output write cursor (same addresses, trails r0)
//     r6  = input cursor            r8  = filter cursor
//     r10 = channels left for the current pixel
ConvDwFp32Row:
    // Callee-saved state per AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf).
    // Save area size: 6 core registers * 4 + 4 q registers * 16 = 88 bytes.
    push {r4-r6, r8, r10, r11}
    vpush {q4-q7}

    // sp is deliberately left pointing at the save area: memory below sp may be
    // overwritten asynchronously (e.g. by a signal frame), so the saved registers
    // must stay at or above sp. The stack arguments therefore start at sp + 88.
    ldr r4, [sp, #88]       // input_channel
    ldr r5, [sp, #92]       // input_step (in floats)
    lsl r5, r5, #2          // input_step in bytes
    mov r11, r0
    cmp r3, #0
    beq End

LoopNumPixel:
    mov r6, r1              // input_ptr for this pixel
    mov r8, r2              // filter_ptr (restarts every pixel)
    mov r10, r4             // input_channel

    // 16 channels per iteration, software-pipelined: the first 8 floats of a chunk
    // are loaded before the multiply-accumulate of that chunk is issued.
LoopDepth16In:
    cmp r10, #16
    blt L4
    sub r10, r10, #16
    vld1.32 {q0, q1}, [r6]!
    vld1.32 {q4, q5}, [r8]!
    vld1.32 {q8, q9}, [r0]!
    cmp r10, #16
    blt LoopDepth16Out
LoopDepth16:
    vmla.f32 q8, q0, q4
    vmla.f32 q9, q1, q5
    vst1.32 {q8, q9}, [r11]!
    vld1.32 {q2, q3}, [r6]!
    vld1.32 {q6, q7}, [r8]!
    vld1.32 {q10, q11}, [r0]!
    vmla.f32 q10, q2, q6
    vmla.f32 q11, q3, q7
    vst1.32 {q10, q11}, [r11]!
    vld1.32 {q0, q1}, [r6]! // preload the first half of the next 16-channel chunk
    vld1.32 {q4, q5}, [r8]!
    vld1.32 {q8, q9}, [r0]!
    sub r10, r10, #16
    cmp r10, #16
    bge LoopDepth16
LoopDepth16Out:
    // Drain the pipeline: finish the chunk whose first half is already loaded.
    vmla.f32 q8, q0, q4
    vmla.f32 q9, q1, q5
    vst1.32 {q8, q9}, [r11]!
    vld1.32 {q2, q3}, [r6]!
    vld1.32 {q6, q7}, [r8]!
    vld1.32 {q10, q11}, [r0]!
    vmla.f32 q10, q2, q6
    vmla.f32 q11, q3, q7
    vst1.32 {q10, q11}, [r11]!

L4:
    cmp r10, #4
    blt L0
LoopDepth4:
    vld1.32 {q0}, [r6]!
    vld1.32 {q4}, [r8]!
    vld1.32 {q8}, [r0]!
    vmla.f32 q8, q0, q4
    vst1.32 {q8}, [r11]!
    sub r10, r10, #4
    cmp r10, #4
    bge LoopDepth4

L0:
    cmp r10, #0
    beq Loop16LineEnd
LoopDepth0:
    // One channel at a time. VLD1/VST1 take D-register lanes, not S registers:
    // d0[0] aliases s0, d0[1] aliases s1, d1[0] aliases s2.
    vld1.32 {d0[0]}, [r6]!  // s0 = input
    vld1.32 {d0[1]}, [r8]!  // s1 = filter
    vld1.32 {d1[0]}, [r0]!  // s2 = output
    vmla.f32 s2, s0, s1
    vst1.32 {d1[0]}, [r11]!
    subs r10, r10, #1
    bne LoopDepth0

Loop16LineEnd:
    subs r3, r3, #1         // sets the flags tested by "bne"; the add below keeps them
    add r1, r1, r5          // input_ptr += input_step
    bne LoopNumPixel

End:
    vpop {q4-q7}
    pop {r4-r6, r8, r10, r11}
    bx lr
#endif

@ -40,6 +40,11 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, size_t kernel_h,
size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
size_t in_kh_step, size_t in_kw_step);
void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
size_t output_channel, size_t input_step);
void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
#endif
#ifdef ENABLE_ARM64
@ -49,12 +54,6 @@ void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
void Relu6(float *data, size_t element4);
void Relu(float *data, size_t element4);
void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
size_t output_channel, size_t input_step);
void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
@ -70,5 +69,4 @@ void ConvSwFp32Center(float *dst, const float *src, const float *weight, const f
#ifdef __cplusplus
}
#endif
#endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */

@ -21,7 +21,7 @@
#include <arm_neon.h>
#endif
#ifndef ENABLE_ARM64
#ifndef ENABLE_ARM
void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels,
int output_channel, int input_step) {
for (int i = 0; i < num_pixels; i++) {
@ -202,7 +202,7 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM;
#ifdef ENABLE_ARM64
#ifdef ENABLE_ARM
ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6);
@ -286,7 +286,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
#ifdef ENABLE_ARM64
#ifdef ENABLE_ARM
ConvDwFp32Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float),
sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float),

Loading…
Cancel
Save