parent
36a8013b0a
commit
7d97c1b903
@ -0,0 +1,248 @@
|
||||
|
||||
.text
|
||||
.align 5
|
||||
//.p2align 5,,15
|
||||
.global PostFuncBiasReluC4
|
||||
#ifndef __APPLE__
|
||||
.type PostFuncBiasReluC4, %function
|
||||
#endif
|
||||
|
||||
//void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
|
||||
// size_t plane_size, size_t plane_stride, size_t relu_type);
|
||||
// r0 dst r1 src r2 bias
|
||||
// r3 oc4div r4 oc4mod r5 plane_size
|
||||
// r6 plane_stride r7 relu_type
|
||||
|
||||
// q0 ~ q3 value
|
||||
// q12 bias data
|
||||
// r10 r11 write loop tmp buf
|
||||
// q14 relu6 #6; q15 relu #0
|
||||
// lr oc4 loop control
|
||||
// r8 hw loop control
|
||||
|
||||
// PostFuncBiasReluC4 (AArch32, NEON)
//
// Adds a per-channel bias to 4-float groups read sequentially from src,
// optionally applies an activation, and scatters the results into dst.
//
// Register arguments:   r0 = dst, r1 = src, r2 = bias, r3 = oc4div
// Stack arguments:      oc4mod, plane_size, plane_stride, relu_type
//                       (loaded into r4, r5, r6, r7 below)
//
// Register roles inside the function:
//   r4  oc4mod        number of leftover channels handled by Loop_C1
//   r5  plane_size    number of 4-float groups processed per channel block
//   r6  plane_stride  added directly to the src pointer after each full
//                     channel block, i.e. it is used as a byte count
//   r7  relu_type     3 -> clamp to [0, 6]; 1 -> max(x, 0); anything else
//                     -> no activation
//   r8  remaining plane positions in the current channel block
//   r10 byte offset of the current channel block inside a dst row
//   r11 dst write pointer (second write pointer in Loop_C1_3)
//   r12 (oc4div + oc4mod) * 4: byte distance between consecutive dst rows
//   lr  channel counter, advanced by 4 per block and compared to oc4div
//       with beq (so oc4div is presumably a multiple of 4 - confirm with caller)
//   q0-q3 values, q12 bias, q14 = 6.0f, q15 = 0.0f
//
// Every bias load and every src load reads a full q register (4 floats),
// including on the leftover-channel path.
PostFuncBiasReluC4:
    push {r4-r8, r10, r11, lr}

    // The push stored 8 registers (32 bytes), so the stack-passed arguments
    // now live at sp + 32 .. sp + 44. They are addressed with an offset and
    // sp is left alone: the saved registers must never end up below sp,
    // where an asynchronously delivered signal frame could overwrite them.
    ldr r4, [sp, #32]
    ldr r5, [sp, #36]
    ldr r6, [sp, #40]
    ldr r7, [sp, #44]

    // q14 = 6.0f in every lane (integer 6 converted to float), q15 = 0.0f
    vmov.i32 q14, #6
    vcvt.f32.s32 q14, q14
    veor q15, q15, q15

    // r12 = (oc4div + oc4mod) * 4 = dst row stride in bytes
    mov lr, #4
    add r12, r3, r4
    mul r12, r12, lr

    mov lr, #0

// One iteration per block of 4 channels; leaves for Loop_C1 once lr == oc4div.
Loop_C4:
    cmp lr, r3
    beq Loop_C1
    // r11 = dst + lr * 4: first row, column of this channel block
    mov r11, #4
    mul r10, lr, r11
    add r11, r0, r10
    add lr, lr, #4
    mov r8, r5
    vld1.32 {q12}, [r2]!

// Four plane positions per iteration while at least four remain.
Loop_4x4:
    cmp r8, #4
    blt Loop_1x4
    sub r8, r8, #4
    vld1.32 {q0-q1}, [r1]!
    vld1.32 {q2-q3}, [r1]!

    vadd.f32 q0, q0, q12
    vadd.f32 q1, q1, q12
    vadd.f32 q2, q2, q12
    vadd.f32 q3, q3, q12

    cmp r7, #3
    beq Relu6_4x4
    cmp r7, #1
    beq Relu_4x4
    b Write_4x4
Relu6_4x4:
    // Upper clamp only; the lower clamp is shared by falling into Relu_4x4.
    vmin.f32 q0, q0, q14
    vmin.f32 q1, q1, q14
    vmin.f32 q2, q2, q14
    vmin.f32 q3, q3, q14
Relu_4x4:
    vmax.f32 q0, q0, q15
    vmax.f32 q1, q1, q15
    vmax.f32 q2, q2, q15
    vmax.f32 q3, q3, q15
Write_4x4:
    // Each store writes 4 channels of one row, then steps to the next row.
    vst1.32 {q0}, [r11], r12
    vst1.32 {q1}, [r11], r12
    vst1.32 {q2}, [r11], r12
    vst1.32 {q3}, [r11], r12
    b Loop_4x4

// Remaining (< 4) plane positions, one at a time. The activation is selected
// once and each variant runs its own loop until r8 reaches zero.
Loop_1x4:
    cmp r7, #3
    beq Relu6_1x4
    cmp r7, #1
    beq Relu_1x4
    b Write_1x4
Relu6_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {q0}, [r11], r12
    b Relu6_1x4
Relu_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {q0}, [r11], r12
    b Relu_1x4
Write_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {q0}, [r11], r12
    b Write_1x4

// End of one channel block: skip plane_stride bytes of src, next block.
HW_Add:
    add r1, r1, r6
    b Loop_C4

// Leftover channels (oc4mod). Nothing to do when oc4mod == 0.
Loop_C1:
    cmp r4, #0
    beq End
    mov r8, r5
    vld1.32 {q12}, [r2]!
    // r0 = dst + lr * 4: column of the first leftover channel
    mov r11, #4
    mul r10, lr, r11
    add r0, r0, r10

    cmp r4, #1
    beq Loop_C1_1
    cmp r4, #2
    beq Loop_C1_2
    cmp r4, #3
    beq Loop_C1_3
    // Any other non-zero oc4mod falls through to the one-channel path.

// One leftover channel: store lane 0 only.
Loop_C1_1:
    cmp r7, #3
    beq Loop_C1_1_Relu6
    cmp r7, #1
    beq Loop_C1_1_Relu
    b Loop_C1_1_Write
Loop_C1_1_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Relu6
Loop_C1_1_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Relu
Loop_C1_1_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Write

// Two leftover channels: store the low half (lanes 0-1).
Loop_C1_2:
    cmp r7, #3
    beq Loop_C1_2_Relu6
    cmp r7, #1
    beq Loop_C1_2_Relu
    b Loop_C1_2_Write
Loop_C1_2_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Relu6
Loop_C1_2_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Relu
Loop_C1_2_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Write

// Three leftover channels: lanes 0-1 go through r0, lane 2 through r11
// (= r0 + 8). Both pointers must step by the dst row stride r12 so that they
// stay 8 bytes apart on every row.
Loop_C1_3:
    add r11, r0, #8
    cmp r7, #3
    beq Loop_C1_3_Relu6
    cmp r7, #1
    beq Loop_C1_3_Relu
    b Loop_C1_3_Write
Loop_C1_3_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Relu6
Loop_C1_3_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Relu
Loop_C1_3_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Write

End:
    // sp was never moved after the push, so the saved registers are popped
    // directly; loading pc returns to the caller.
    pop {r4-r8, r10, r11, pc}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue