optimization for fp32 winograd on arm32

pull/7291/head
lixian 4 years ago
parent d573a1180d
commit 7f3582d0f5

@ -381,6 +381,8 @@ LoopRow:
ldr lr, [sp, #20] ldr lr, [sp, #20]
cmp lr, #0 cmp lr, #0
beq C8DstStep beq C8DstStep
cmp lr, #2
beq WinoDstStep
mov lr, #4 mov lr, #4
ldr r7, [sp, #12] // reload rhs col ldr r7, [sp, #12] // reload rhs col
mul lr, lr, r7 mul lr, lr, r7
@ -391,6 +393,10 @@ LoopRow:
ldr lr, [sp, #-40] ldr lr, [sp, #-40]
add r2, lr, #128 add r2, lr, #128
str r2, [sp, #-40] str r2, [sp, #-40]
b NoDstStep
WinoDstStep:
add r2, r2, r10
str r2, [sp, #-40]
NoDstStep: NoDstStep:
cmp r6, #4 cmp r6, #4
ble LoopRowEnd ble LoopRowEnd

@ -1,82 +0,0 @@
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAdd
#ifndef __APPLE__
    .type BiasAdd, %function
#endif

// void BiasAdd(const float* bias, float* data, size_t oc4, size_t plan_size)
//
// In-place bias add. For each of the oc4 groups, one 4-float bias vector is
// read from bias (pointer advanced by 16 bytes per group) and added to
// plan_size consecutive 4-float vectors of data. Returns immediately when
// oc4 == 0 or plan_size == 0.
//
// Auto: x0: bias, x1: data (also the write cursor), x2: oc4, x3: plan_size
// Scratch: x5 (read cursor), x6 (vectors left in the current group),
//          v0-v6, v16, v17, flags.
// Only caller-saved SIMD registers are written: AAPCS64 requires a callee to
// preserve the low 64 bits of v8-v15, so v8 must not be used as a temporary.
BiasAdd:
    cmp x2, #0
    beq BiasAddEnd
    cmp x3, #0
    beq BiasAddEnd

LoopOc4:
    ld1 {v0.4s}, [x0], #16              // v0 = bias vector of this group
    mov x6, x3                          // x6 = vectors left in this group
    mov x5, x1                          // reads run ahead of the writes through x1

Loop16LineIn:
    cmp x6, #4
    blt L4                              // fewer than 4 vectors: one-at-a-time path
    // Prime the pipeline: load 4 vectors, compute the first result pair.
    sub x6, x6, #4
    ld1 {v1.4s, v2.4s}, [x5], #32
    fadd v5.4s, v0.4s, v1.4s
    fadd v6.4s, v0.4s, v2.4s
    ld1 {v3.4s, v4.4s}, [x5], #32
    cmp x6, #4
    blt Loop16LineOut

    // Steady state: store the results of the previous 4 vectors while
    // loading and adding the next 4.
Loop16:
    st1 {v5.4s, v6.4s}, [x1], #32
    fadd v16.4s, v0.4s, v3.4s
    fadd v17.4s, v0.4s, v4.4s
    ld1 {v1.4s, v2.4s}, [x5], #32
    st1 {v16.4s, v17.4s}, [x1], #32
    fadd v5.4s, v0.4s, v1.4s
    fadd v6.4s, v0.4s, v2.4s
    ld1 {v3.4s, v4.4s}, [x5], #32
    sub x6, x6, #4
    cmp x6, #4
    bge Loop16

    // Drain: finish and store the 4 vectors that are still in flight.
Loop16LineOut:
    st1 {v5.4s, v6.4s}, [x1], #32
    fadd v16.4s, v0.4s, v3.4s
    fadd v17.4s, v0.4s, v4.4s
    st1 {v16.4s, v17.4s}, [x1], #32

L4:
    cmp x6, #0
    beq Loop16LineEnd

    // Remainder: one 4-float vector per iteration.
Loop4:
    ld1 {v1.4s}, [x5], #16
    fadd v2.4s, v1.4s, v0.4s
    subs x6, x6, #1
    st1 {v2.4s}, [x1], #16              // st1 leaves the flags from subs intact
    bne Loop4

Loop16LineEnd:
    subs x2, x2, #1
    bne LoopOc4

BiasAddEnd:
    ret
#endif

@ -1,94 +0,0 @@
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAddRelu
#ifndef __APPLE__
    .type BiasAddRelu, %function
#endif

// void BiasAddRelu(const float* bias, float* data, size_t oc4, size_t plan_size)
//
// In-place max(data + bias, 0). Each of the oc4 groups uses one 4-float bias
// vector (bias advances 16 bytes per group) applied to plan_size consecutive
// 4-float vectors of data. Nothing is touched when oc4 or plan_size is zero.
//
// Auto: x0: bias, x1: data (write cursor), x2: oc4, x3: plan_size
// Scratch: x5 (read cursor), x6 (vectors left in group), v0-v4, v16,
//          v23, v24, v27, v28, flags.
BiasAddRelu:
    cbz     x2, BiasAddReluDone
    cbz     x3, BiasAddReluDone
    movi    v16.4s, #0                      // v16 = 0.0f x4, the ReLU floor

BiasAddReluGroup:
    ld1     {v0.4s}, [x0], #16              // v0 = bias vector of this group
    mov     x5, x1                          // read cursor starts at the write cursor
    mov     x6, x3                          // x6 = vectors left in this group
    cmp     x6, #4
    blt     BiasAddReluTail

    // Prologue: fetch 4 vectors and finish the first pair.
    ld1     {v1.4s, v2.4s}, [x5], #32
    ld1     {v3.4s, v4.4s}, [x5], #32
    sub     x6, x6, #4
    fadd    v23.4s, v0.4s, v1.4s
    fadd    v24.4s, v0.4s, v2.4s
    fmax    v23.4s, v23.4s, v16.4s
    fmax    v24.4s, v24.4s, v16.4s
    cmp     x6, #4
    blt     BiasAddReluDrain

    // Steady state: 4 vectors written and 4 new vectors read per pass.
BiasAddReluSteady:
    st1     {v23.4s, v24.4s}, [x1], #32
    fadd    v27.4s, v0.4s, v3.4s
    fadd    v28.4s, v0.4s, v4.4s
    ld1     {v1.4s, v2.4s}, [x5], #32
    fmax    v27.4s, v27.4s, v16.4s
    fmax    v28.4s, v28.4s, v16.4s
    st1     {v27.4s, v28.4s}, [x1], #32
    ld1     {v3.4s, v4.4s}, [x5], #32
    sub     x6, x6, #4
    fadd    v23.4s, v0.4s, v1.4s
    fadd    v24.4s, v0.4s, v2.4s
    fmax    v23.4s, v23.4s, v16.4s
    fmax    v24.4s, v24.4s, v16.4s
    cmp     x6, #4
    bge     BiasAddReluSteady

    // Drain: write out the 4 vectors still held in registers.
BiasAddReluDrain:
    st1     {v23.4s, v24.4s}, [x1], #32
    fadd    v27.4s, v0.4s, v3.4s
    fadd    v28.4s, v0.4s, v4.4s
    fmax    v27.4s, v27.4s, v16.4s
    fmax    v28.4s, v28.4s, v16.4s
    st1     {v27.4s, v28.4s}, [x1], #32

BiasAddReluTail:
    cbz     x6, BiasAddReluGroupDone

    // Remainder: one 4-float vector per iteration.
BiasAddReluSingle:
    ld1     {v1.4s}, [x5], #16
    fadd    v1.4s, v1.4s, v0.4s
    fmax    v1.4s, v1.4s, v16.4s
    st1     {v1.4s}, [x1], #16
    subs    x6, x6, #1
    bne     BiasAddReluSingle

BiasAddReluGroupDone:
    subs    x2, x2, #1
    bne     BiasAddReluGroup

BiasAddReluDone:
    ret
#endif

@ -1,113 +0,0 @@
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAddRelu6
#ifndef __APPLE__
    .type BiasAddRelu6, %function
#endif

// void BiasAddRelu6(const float* bias, float* data, size_t oc4, size_t plan_size)
//
// In-place min(max(data + bias, 0), 6). Each of the oc4 groups uses one
// 4-float bias vector (bias advances 16 bytes per group) applied to plan_size
// consecutive 4-float vectors of data. Nothing is touched when oc4 or
// plan_size is zero.
//
// Auto: x0: bias, x1: data (write cursor), x2: oc4, x3: plan_size
// Scratch: x5 (read cursor), x6 (vectors left in group), v0-v4, v16, v17,
//          v23, v24, v27, v28, flags.
BiasAddRelu6:
    cbz     x2, BiasAddRelu6Done
    cbz     x3, BiasAddRelu6Done
    movi    v16.4s, #0                      // lower clamp: 0.0f x4
    fmov    v17.4s, #6.0                    // upper clamp: 6.0f x4

BiasAddRelu6Group:
    ld1     {v0.4s}, [x0], #16              // v0 = bias vector of this group
    mov     x5, x1                          // read cursor starts at the write cursor
    mov     x6, x3                          // x6 = vectors left in this group
    cmp     x6, #4
    blt     BiasAddRelu6Tail

    // Prologue: fetch 4 vectors and fully clamp the first pair.
    ld1     {v1.4s, v2.4s}, [x5], #32
    ld1     {v3.4s, v4.4s}, [x5], #32
    sub     x6, x6, #4
    fadd    v23.4s, v0.4s, v1.4s
    fadd    v24.4s, v0.4s, v2.4s
    fmax    v23.4s, v23.4s, v16.4s
    fmax    v24.4s, v24.4s, v16.4s
    fmin    v23.4s, v23.4s, v17.4s
    fmin    v24.4s, v24.4s, v17.4s
    cmp     x6, #4
    blt     BiasAddRelu6Drain

    // Steady state: 4 vectors read and 4 finished vectors written per pass.
BiasAddRelu6Steady:
    fadd    v27.4s, v0.4s, v3.4s
    fadd    v28.4s, v0.4s, v4.4s
    ld1     {v1.4s, v2.4s}, [x5], #32
    st1     {v23.4s, v24.4s}, [x1], #32
    fmax    v27.4s, v27.4s, v16.4s
    fmax    v28.4s, v28.4s, v16.4s
    fmin    v27.4s, v27.4s, v17.4s
    fmin    v28.4s, v28.4s, v17.4s
    fadd    v23.4s, v0.4s, v1.4s
    fadd    v24.4s, v0.4s, v2.4s
    ld1     {v3.4s, v4.4s}, [x5], #32
    st1     {v27.4s, v28.4s}, [x1], #32
    sub     x6, x6, #4
    fmax    v23.4s, v23.4s, v16.4s
    fmax    v24.4s, v24.4s, v16.4s
    fmin    v23.4s, v23.4s, v17.4s
    fmin    v24.4s, v24.4s, v17.4s
    cmp     x6, #4
    bge     BiasAddRelu6Steady

    // Drain: write out the 4 vectors still held in registers.
BiasAddRelu6Drain:
    fadd    v27.4s, v0.4s, v3.4s
    fadd    v28.4s, v0.4s, v4.4s
    fmax    v27.4s, v27.4s, v16.4s
    fmax    v28.4s, v28.4s, v16.4s
    st1     {v23.4s, v24.4s}, [x1], #32
    fmin    v27.4s, v27.4s, v17.4s
    fmin    v28.4s, v28.4s, v17.4s
    st1     {v27.4s, v28.4s}, [x1], #32

BiasAddRelu6Tail:
    cbz     x6, BiasAddRelu6GroupDone

    // Remainder: one 4-float vector per iteration.
BiasAddRelu6Single:
    ld1     {v1.4s}, [x5], #16
    fadd    v1.4s, v1.4s, v0.4s
    fmax    v1.4s, v1.4s, v16.4s
    fmin    v1.4s, v1.4s, v17.4s
    st1     {v1.4s}, [x1], #16
    subs    x6, x6, #1
    bne     BiasAddRelu6Single

BiasAddRelu6GroupDone:
    subs    x2, x2, #1
    bne     BiasAddRelu6Group

BiasAddRelu6Done:
    ret
#endif

@ -1,73 +0,0 @@
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global Relu
#ifndef __APPLE__
    .type Relu, %function
#endif

// void Relu(float* data, size_t element4)
//
// In-place max(data, 0) over element4 consecutive 4-float vectors.
// Returns immediately when element4 == 0.
//
// Auto: x0: data (also the write cursor), x1: element4 (vectors left)
// Scratch: x5 (read cursor), v1-v6, v16 (zero), v17, v18, flags.
// Only caller-saved SIMD registers are written: AAPCS64 requires a callee to
// preserve the low 64 bits of v8-v15, so v8 must not be used as a temporary.
Relu:
    cmp x1, #0
    beq ReluEnd
    dup v16.4s, wzr                     // v16 = 0.0f x4
    mov x5, x0                          // reads run ahead of the writes through x0

Loop16LineIn:
    cmp x1, #4
    blt L4                              // fewer than 4 vectors: one-at-a-time path
    // Prime the pipeline: load 4 vectors, compute the first result pair.
    sub x1, x1, #4
    ld1 {v1.4s, v2.4s}, [x5], #32
    fmax v5.4s, v16.4s, v1.4s
    fmax v6.4s, v16.4s, v2.4s
    ld1 {v3.4s, v4.4s}, [x5], #32
    cmp x1, #4
    blt Loop16LineOut

    // Steady state: store the results of the previous 4 vectors while
    // loading and clamping the next 4.
Loop16:
    st1 {v5.4s, v6.4s}, [x0], #32
    fmax v17.4s, v16.4s, v3.4s
    fmax v18.4s, v16.4s, v4.4s
    ld1 {v1.4s, v2.4s}, [x5], #32
    st1 {v17.4s, v18.4s}, [x0], #32
    fmax v5.4s, v16.4s, v1.4s
    fmax v6.4s, v16.4s, v2.4s
    ld1 {v3.4s, v4.4s}, [x5], #32
    sub x1, x1, #4
    cmp x1, #4
    bge Loop16

    // Drain: finish and store the 4 vectors that are still in flight.
Loop16LineOut:
    st1 {v5.4s, v6.4s}, [x0], #32
    fmax v17.4s, v16.4s, v3.4s
    fmax v18.4s, v16.4s, v4.4s
    st1 {v17.4s, v18.4s}, [x0], #32

L4:
    cmp x1, #0
    beq ReluEnd

    // Remainder: one 4-float vector per iteration.
Loop4:
    ld1 {v1.4s}, [x5], #16
    fmax v2.4s, v16.4s, v1.4s           // clamp the vector just loaded (v1); v0 is never written here
    subs x1, x1, #1
    st1 {v2.4s}, [x0], #16              // st1 leaves the flags from subs intact
    bne Loop4

ReluEnd:
    ret
#endif

@ -1,89 +0,0 @@
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global Relu6
#ifndef __APPLE__
    .type Relu6, %function
#endif

// void Relu6(float* data, size_t element4)
//
// In-place min(max(data, 0), 6) over element4 consecutive 4-float vectors.
// Nothing is touched when element4 is zero.
//
// Auto: x0: data (write cursor), x1: element4 (vectors left)
// Scratch: x5 (read cursor), v1-v4, v16, v17, v23, v24, v27, v28, flags.
Relu6:
    cbz     x1, Relu6Done
    movi    v16.4s, #0                      // lower clamp: 0.0f x4
    fmov    v17.4s, #6.0                    // upper clamp: 6.0f x4
    mov     x5, x0                          // read cursor starts at the write cursor
    cmp     x1, #4
    blt     Relu6Tail

    // Prologue: fetch 4 vectors and fully clamp the first pair.
    ld1     {v1.4s, v2.4s}, [x5], #32
    ld1     {v3.4s, v4.4s}, [x5], #32
    sub     x1, x1, #4
    fmax    v23.4s, v1.4s, v16.4s
    fmax    v24.4s, v2.4s, v16.4s
    fmin    v23.4s, v23.4s, v17.4s
    fmin    v24.4s, v24.4s, v17.4s
    cmp     x1, #4
    blt     Relu6Drain

    // Steady state: 4 finished vectors written and 4 new vectors read per pass.
Relu6Steady:
    st1     {v23.4s, v24.4s}, [x0], #32
    fmax    v27.4s, v3.4s, v16.4s
    fmax    v28.4s, v4.4s, v16.4s
    ld1     {v1.4s, v2.4s}, [x5], #32
    fmin    v27.4s, v27.4s, v17.4s
    fmin    v28.4s, v28.4s, v17.4s
    st1     {v27.4s, v28.4s}, [x0], #32
    ld1     {v3.4s, v4.4s}, [x5], #32
    sub     x1, x1, #4
    fmax    v23.4s, v1.4s, v16.4s
    fmax    v24.4s, v2.4s, v16.4s
    fmin    v23.4s, v23.4s, v17.4s
    fmin    v24.4s, v24.4s, v17.4s
    cmp     x1, #4
    bge     Relu6Steady

    // Drain: write out the 4 vectors still held in registers.
Relu6Drain:
    st1     {v23.4s, v24.4s}, [x0], #32
    fmax    v27.4s, v3.4s, v16.4s
    fmax    v28.4s, v4.4s, v16.4s
    fmin    v27.4s, v27.4s, v17.4s
    fmin    v28.4s, v28.4s, v17.4s
    st1     {v27.4s, v28.4s}, [x0], #32

Relu6Tail:
    cbz     x1, Relu6Done

    // Remainder: one 4-float vector per iteration.
Relu6Single:
    ld1     {v1.4s}, [x5], #16
    fmax    v1.4s, v1.4s, v16.4s
    fmin    v1.4s, v1.4s, v17.4s
    st1     {v1.4s}, [x0], #16
    subs    x1, x1, #1
    bne     Relu6Single

Relu6Done:
    ret
#endif

@ -81,11 +81,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
int out_w_block = UP_DIV(conv_param->output_w_, out_unit); int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
int out_h_block = UP_DIV(conv_param->output_h_, out_unit); int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
int output_count = out_w_block * out_h_block; int output_count = out_w_block * out_h_block;
#ifdef ENABLE_ARM32 const int tile_num = C12NUM;
const int tile_num = 4;
#else
const int tile_num = 12;
#endif
int output_tile_count = UP_DIV(output_count, tile_num); int output_tile_count = UP_DIV(output_count, tile_num);
int out_channel = conv_param->output_channel_; int out_channel = conv_param->output_channel_;
int oc8 = UP_DIV(out_channel, C8NUM); int oc8 = UP_DIV(out_channel, C8NUM);
@ -117,7 +113,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
for (int i = 0; i < input_unit_square; ++i) { for (int i = 0; i < input_unit_square; ++i) {
#ifdef ENABLE_ARM32 #ifdef ENABLE_ARM32
RowMajor2Col4Major(src_ptr + i * C4NUM * in_channel, tmp_col_ptr, C4NUM, in_channel); RowMajor2Col4Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
#else #else
RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel); RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
#endif #endif

@ -85,11 +85,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
} // interval y loop } // interval y loop
} }
// input transform // input transform
#ifdef ENABLE_ARM32 const int tile_num = C12NUM;
const int tile_num = 4;
#else
const int tile_num = 12;
#endif
int dst_ic4_offset = dst_plane_offset + ic * C4NUM; int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
size_t dst_step = tile_num * in_channel; size_t dst_step = tile_num * in_channel;
float *trans_input_ptr = trans_input + dst_ic4_offset; float *trans_input_ptr = trans_input + dst_ic4_offset;

@ -184,11 +184,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
int ConvolutionWinogradCPUKernel::InitTmpBuffer() { int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
int channel_out = conv_param_->output_channel_; int channel_out = conv_param_->output_channel_;
int oc8 = UP_DIV(channel_out, C8NUM); int oc8 = UP_DIV(channel_out, C8NUM);
#ifdef ENABLE_ARM32 int tile_num = C12NUM;
int tile_num = 4;
#else
int tile_num = 12;
#endif
MS_ASSERT(ctx_->allocator != nullptr); MS_ASSERT(ctx_->allocator != nullptr);
size_t tile_buffer_size = size_t tile_buffer_size =

Loading…
Cancel
Save