!10216 [MSLITE] Optimize fp32 convolution 1x1

From: @zhanyuan1
Reviewed-by: @zhang_xue_tong, @hangangqiang
Signed-off-by: @zhang_xue_tong
pull/10216/MERGE
commit a1eee2a15d, committed by mindspore-ci-bot via Gitee

@@ -35,6 +35,12 @@ MatmulFloatNeon32Opt12x4:
mov lr, #4
mul r8, r8, lr // stride * sizeof(float)
+LoopRowStart:
+cmp r6, #4
+ble LoopRow4
+cmp r6, #8
+ble LoopRow8
LoopRow:
ldr r1, [sp, #-44] // reload rhs ptr
ldr r7, [sp, #12] // reload rhs col
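
The added LoopRowStart block dispatches on the number of output rows still to be processed (register r6): at most 4 rows branch to the new 4x4 micro-kernel, at most 8 to the new 8x4 one, and larger strips fall through to the original 12x4 LoopRow. A minimal C++ sketch of that control flow (function and variable names are illustrative, not from the source):

    // remaining_rows plays the role of register r6.
    void MatmulRowDispatch(int remaining_rows) {
      if (remaining_rows <= 4) {
        // LoopRow4: 4x4 output tile
      } else if (remaining_rows <= 8) {
        // LoopRow8: 8x4 output tile
      } else {
        // LoopRow: original 12x4 output tile
      }
    }
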
@@ -142,6 +148,158 @@ LoopRow:
vmax.f32 q13, q13, q3
vmax.f32 q14, q14, q3
vmax.f32 q15, q15, q3
+b Write
+LoopRow8:
+ldr r1, [sp, #-44] // reload rhs ptr
+ldr r7, [sp, #12] // reload rhs col
+ldr r3, [sp, #-36] // reload bias ptr
+LoopCol_R8:
+ldr r2, [sp, #-40] // reload dst ptr
+ldr r0, [sp, #-48] // reload lhs ptr
+ldr r5, [sp, #4] // reload depth
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmul.f32 q4, q3, d0[0]
+vmul.f32 q5, q3, d0[1]
+vmul.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmul.f32 q7, q3, d1[1]
+vmul.f32 q8, q3, d2[0]
+vmul.f32 q9, q3, d2[1]
+vmul.f32 q10, q3, d3[0]
+vmul.f32 q11, q3, d3[1]
+subs r5, r5, #1
+beq Bias_R8
+LoopDepth_R8:
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmla.f32 q4, q3, d0[0]
+vmla.f32 q5, q3, d0[1]
+vmla.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmla.f32 q7, q3, d1[1]
+vmla.f32 q8, q3, d2[0]
+vmla.f32 q9, q3, d2[1]
+vmla.f32 q10, q3, d3[0]
+vmla.f32 q11, q3, d3[1]
+subs r5, r5, #1
+bne LoopDepth_R8
+Bias_R8:
+cmp r3, #0
+beq Activation_R8
+vld1.32 {q0}, [r3]!
+vadd.f32 q4, q4, q0
+vadd.f32 q5, q5, q0
+vadd.f32 q6, q6, q0
+vadd.f32 q7, q7, q0
+vadd.f32 q8, q8, q0
+vadd.f32 q9, q9, q0
+vadd.f32 q10, q10, q0
+vadd.f32 q11, q11, q0
+Activation_R8:
+ldr lr, [sp]
+cmp lr, #3
+beq Relu6_R8
+cmp lr, #1
+beq Relu_R8
+b Write
+Relu6_R8:
+vmov.i32 q2, #6
+vcvt.f32.s32 q2, q2
+vmin.f32 q4, q4, q2
+vmin.f32 q5, q5, q2
+vmin.f32 q6, q6, q2
+vmin.f32 q7, q7, q2
+vmin.f32 q8, q8, q2
+vmin.f32 q9, q9, q2
+vmin.f32 q10, q10, q2
+vmin.f32 q11, q11, q2
+Relu_R8:
+veor q3, q3, q3
+vmax.f32 q4, q4, q3
+vmax.f32 q5, q5, q3
+vmax.f32 q6, q6, q3
+vmax.f32 q7, q7, q3
+vmax.f32 q8, q8, q3
+vmax.f32 q9, q9, q3
+vmax.f32 q10, q10, q3
+vmax.f32 q11, q11, q3
+b Write
+LoopRow4:
+ldr r1, [sp, #-44] // reload rhs ptr
+ldr r7, [sp, #12] // reload rhs col
+ldr r3, [sp, #-36] // reload bias ptr
+LoopCol_R4:
+ldr r2, [sp, #-40] // reload dst ptr
+ldr r0, [sp, #-48] // reload lhs ptr
+ldr r5, [sp, #4] // reload depth
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmul.f32 q4, q3, d0[0]
+vmul.f32 q5, q3, d0[1]
+vmul.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmul.f32 q7, q3, d1[1]
+subs r5, r5, #1
+beq Bias_R4
+LoopDepth_R4:
+vld1.32 {q3}, [r1]!
+vld1.32 {q0, q1}, [r0]!
+vmla.f32 q4, q3, d0[0]
+vmla.f32 q5, q3, d0[1]
+vmla.f32 q6, q3, d1[0]
+vld1.32 {q2}, [r0]!
+vmla.f32 q7, q3, d1[1]
+subs r5, r5, #1
+bne LoopDepth_R4
+Bias_R4:
+cmp r3, #0
+beq Activation_R4
+vld1.32 {q0}, [r3]!
+vadd.f32 q4, q4, q0
+vadd.f32 q5, q5, q0
+vadd.f32 q6, q6, q0
+vadd.f32 q7, q7, q0
+Activation_R4:
+ldr lr, [sp]
+cmp lr, #3
+beq Relu6_R4
+cmp lr, #1
+beq Relu_R4
+b Write
+Relu6_R4:
+vmov.i32 q2, #6
+vcvt.f32.s32 q2, q2
+vmin.f32 q4, q4, q2
+vmin.f32 q5, q5, q2
+vmin.f32 q6, q6, q2
+vmin.f32 q7, q7, q2
+Relu_R4:
+veor q3, q3, q3
+vmax.f32 q4, q4, q3
+vmax.f32 q5, q5, q3
+vmax.f32 q6, q6, q3
+vmax.f32 q7, q7, q3
Write:
cmp r7, #1
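
For reference, LoopRow8 computes an 8x4 output tile. Each depth step loads one rhs vector (q3) and eight lhs lanes (q0/q1, addressed as d0[0]..d3[1]), then multiply-accumulates into q4..q11 with vmla.f32; the q2 load only advances the lhs pointer past the four unused lanes, since the input stays packed with the 12-row stride of the C12 layout. Bias add and Relu/Relu6 clamping follow. A scalar C++ sketch of the same computation (a reading aid under those assumptions, not the shipped kernel; names are illustrative):

    #include <cmath>
    // dst tile = lhs * rhs for 8 rows x 4 cols; lhs is C12-packed, i.e.
    // 12 floats per depth step, of which this kernel consumes the first 8.
    void Tile8x4Ref(const float *lhs, const float *rhs, const float *bias,
                    float *dst, int depth, int act_type, int dst_stride) {
      float acc[8][4] = {};
      for (int d = 0; d < depth; ++d) {                               // LoopDepth_R8
        for (int r = 0; r < 8; ++r) {
          for (int c = 0; c < 4; ++c) {
            acc[r][c] += lhs[d * 12 + r] * rhs[d * 4 + c];            // vmla.f32
          }
        }
      }
      for (int r = 0; r < 8; ++r) {
        for (int c = 0; c < 4; ++c) {
          float v = acc[r][c] + (bias != nullptr ? bias[c] : 0.0f);   // Bias_R8
          if (act_type == 3) v = std::fmin(v, 6.0f);                  // Relu6_R8
          if (act_type == 1 || act_type == 3) v = std::fmax(v, 0.0f); // Relu_R8
          dst[r * dst_stride + c] = v;                                // simplified Write
        }
      }
    }

LoopRow4 is the same pattern truncated to four rows (accumulators q4..q7).
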
@@ -398,7 +556,7 @@ LoopRow:
cmp r6, #12
ble LoopRowEnd
sub r6, r6, #12 // lhs row - 12
-b LoopRow
+b LoopRowStart
LoopRowEnd:
sub sp, sp, #112
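
Retargeting the tail branch from LoopRow to LoopRowStart means every 12-row strip re-dispatches, so a final strip of 8 or fewer rows now takes the narrower kernels instead of the full 12x4 path. In C++ terms, reusing the hypothetical MatmulRowDispatch sketched above:

    int rows = row_count;        // register r6
    while (true) {
      MatmulRowDispatch(rows);   // LoopRowStart: pick the 4-, 8- or 12-row kernel
      if (rows <= 12) break;     // cmp r6, #12 / ble LoopRowEnd
      rows -= 12;                // sub r6, r6, #12
    }
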

@@ -62,6 +62,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
+matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->act_type_ = conv_param_->act_type_;
return;
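
The new row_align_ rounds the row count up to the ISA-specific row tile picked in InitConv1x1BiasWeight below, replacing the fixed row_4_/row_6_/row_12_ choice at the allocation site. UP_ROUND is MindSpore Lite's round-up-to-multiple macro; an equivalent C++ helper:

    // UP_ROUND(x, y): round x up to the nearest multiple of y.
    constexpr int UpRound(int x, int y) { return (x + y - 1) / y * y; }
    // e.g. row_ = 50, row_tile_ = 12 gives row_align_ = 60.
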
@@ -73,14 +74,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
-int col_tile = C16NUM;
+row_tile_ = C6NUM;
+col_tile_ = C16NUM;
+#elif defined(ENABLE_SSE)
+row_tile_ = C4NUM;
+col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
-int col_tile = C4NUM;
+row_tile_ = C12NUM;
+col_tile_ = C4NUM;
#else
-int col_tile = C8NUM;
+row_tile_ = C12NUM;
+col_tile_ = C8NUM;
#endif
if (in_tensors_.size() == 3) {
-int size = UP_ROUND(output_channel, col_tile) * sizeof(float);
+int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
int weight_size = output_channel * sizeof(float);
bias_data_ = malloc(size);
if (bias_data_ == nullptr) {
@@ -91,8 +99,8 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}
-int size = input_channel * UP_ROUND(output_channel, col_tile) * sizeof(float);
-int down_size = input_channel * DOWN_DIV(output_channel, col_tile) * col_tile * sizeof(float);
+int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
weight_ptr_ = reinterpret_cast<float *>(malloc(size));
if (weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
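
Here size is the fully padded packed-weight size, while down_size covers only whole column tiles (DOWN_DIV is floor division), so the two likely differ by exactly the partial tail tile that has to be zero-filled before packing. A sketch of the arithmetic, reusing the UpRound helper above with hypothetical values:

    // Worked example: input_channel = 16, output_channel = 10, col_tile_ = 8
    // (the trailing 4 is sizeof(float)).
    int size = 16 * UpRound(10, 8) * 4;     // UP_ROUND path: 16 * 16 * 4 = 1024
    int down_size = 16 * (10 / 8) * 8 * 4;  // DOWN_DIV path: 16 *  8 * 4 =  512
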
@@ -113,27 +121,14 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
}
int Convolution1x1CPUKernel::InitConv1x1Param() {
-int hw_tile = C12NUM;
-#ifdef ENABLE_AVX
-hw_tile = C6NUM;
-#elif defined(ENABLE_SSE)
-hw_tile = C4NUM;
-#endif
-if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
multi_thread_by_hw_ = true;
-thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile));
-thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile;
+thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
+thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
} else {
-#ifdef ENABLE_AVX
-int col_tile = C16NUM;
-#elif defined(ENABLE_ARM32)
-int col_tile = C4NUM;
-#else
-int col_tile = C8NUM;
-#endif
multi_thread_by_hw_ = false;
-thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile));
-thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile), thread_count_) * col_tile;
+thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
+thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
}
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
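
The hw/oc work split now reads the member tiles instead of recomputing local hw_tile/col_tile constants; the arithmetic itself is unchanged: round the dimension up into tiles, cap the thread count by the tile count, then give every thread a whole number of tiles. A sketch with hypothetical helper names:

    #include <algorithm>
    int UpDiv(int x, int y) { return (x + y - 1) / y; }  // UP_DIV

    void SplitWork(int dim, int tile, int max_threads, int *count, int *stride) {
      int tiles = UpDiv(dim, tile);
      *count = std::min(max_threads, tiles);  // MSMIN
      *stride = UpDiv(tiles, *count) * tile;  // per-thread span, in elements
    }
    // e.g. dim = 100 rows, tile = 12, 4 threads:
    // tiles = 9, count = 4, stride = 3 * 12 = 36 rows per thread.
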
@@ -167,6 +162,16 @@ int Convolution1x1CPUKernel::Init() {
return ReSize();
}
+void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
+#if ENABLE_AVX
+RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
+#elif defined(ENABLE_SSE)
+RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
+#else
+RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
+#endif
+}
int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
int res_stride = matmul_param_->col_ - task_id * thread_stride_;
int cur_oc = MSMIN(thread_stride_, res_stride);
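
The new PackMatmulInput wraps the per-ISA input packing that used to be repeated at every call site. On the default ARM branch, RowMajor2Col12Major stores the input in blocks of 12 rows with the rows of each block interleaved per column, which produces the 12-float depth stride the assembly kernels above consume. A scalar sketch of that layout (the shipped routine is vectorized; the zero-padding of the tail block here is for illustration):

    // Reference C12 packing: src is a row x col row-major matrix.
    void RowMajor2Col12MajorRef(const float *src, float *dst, int row, int col) {
      int row_up = (row + 11) / 12 * 12;  // UP_ROUND(row, 12)
      for (int r = 0; r < row_up; ++r) {
        for (int c = 0; c < col; ++c) {
          // 12-row blocks, column-major within each block
          dst[(r / 12) * col * 12 + c * 12 + (r % 12)] =
              r < row ? src[r * col + c] : 0.0f;
        }
      }
    }
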
@@ -198,20 +203,20 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
}
float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
-float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
-#if ENABLE_AVX
-RowMajor2Col6Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#else
-RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#endif
+float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
-MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
-matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+float *cur_input = thread_input_ptr;
+float *cur_output = thread_output_ptr;
+for (int i = 0; i < cur_hw_; i += row_tile_) {
+int cur_rows = (cur_hw_ - i >= row_tile_) ? row_tile_ : (cur_hw_ - i);
+PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul_param_->deep_);
+MatMulOpt(thread_pack_input, weight_ptr_, cur_output, reinterpret_cast<float *>(bias_data_),
+matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_,
OutType_Nhwc);
+cur_input += row_tile_ * matmul_param_->deep_;
+cur_output += row_tile_ * matmul_param_->col_;
+}
return RET_OK;
}
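
DoConv1x1Hw used to pack its whole row strip before a single MatMulOpt call; it now packs and multiplies one row tile at a time, so the per-thread pack buffer only needs row_tile_ * deep_ floats. A runnable check of the loop bounds, with hypothetical values cur_hw_ = 30 and row_tile_ = 12:

    #include <cstdio>
    int main() {
      int cur_hw = 30, row_tile = 12;
      for (int i = 0; i < cur_hw; i += row_tile) {
        int cur_rows = (cur_hw - i >= row_tile) ? row_tile : (cur_hw - i);
        std::printf("i=%d cur_rows=%d\n", i, cur_rows);  // tiles of 12, 12, 6
      }
      return 0;
    }
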
@@ -228,17 +233,9 @@ int Convolution1x1RunHw(void *cdata, int task_id) {
int Convolution1x1CPUKernel::Run() {
auto src_in = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
auto src_out = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
-#ifdef ENABLE_AVX
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_6_ * matmul_param_->deep_ * sizeof(float)));
-#elif defined(ENABLE_SSE)
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_4_ * matmul_param_->deep_ * sizeof(float)));
-#else
-pack_input_ =
-reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
-#endif
+int pack_input_size = multi_thread_by_hw_ ? (thread_count_ * row_tile_ * matmul_param_->deep_)
+: (matmul_param_->row_align_ * matmul_param_->deep_);
+pack_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_input_size * sizeof(float)));
if (pack_input_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
return RET_MEMORY_FAILED;
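
The allocation no longer hard-codes row_6_/row_4_/row_12_ per ISA: in hw-parallel mode each thread packs just one tile, otherwise the whole row-aligned input is packed once. The two cases, as a sketch:

    // Element count for pack_input_ (multiplied by sizeof(float) at the call).
    int PackInputFloats(bool by_hw, int thread_count, int row_tile,
                        int row_align, int deep) {
      return by_hw ? thread_count * row_tile * deep  // one tile per thread
                   : row_align * deep;               // full aligned input
    }
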
@@ -256,13 +253,7 @@ int Convolution1x1CPUKernel::Run() {
if (multi_thread_by_hw_) {
ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
} else {
-#ifdef ENABLE_AVX
-RowMajor2Col6Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#else
-RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#endif
+PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
}
}

@@ -51,6 +51,7 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
int InitConv1x1BiasWeight();
void InitConv1x1MatmulParam();
void FreeTmpBuffer();
+void PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col);
private:
MatMulParameter *matmul_param_ = nullptr;
@@ -62,6 +63,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
float *pack_input_ = nullptr;
float *input_ptr_ = nullptr;
float *output_ptr_ = nullptr;
+int row_tile_ = 0;
+int col_tile_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_
