From 7fd260c0ac4943d254577b662b51e1e710b56b6f Mon Sep 17 00:00:00 2001
From: zhanyuan
Date: Thu, 17 Dec 2020 16:30:05 +0800
Subject: [PATCH] [MSLITE] Optimize fp32 conv 1x1 for arm v7a

MatmulFloatNeon32Opt12x4 (arm32 assembly):
  * Add a LoopRowStart dispatch in front of LoopRow. It compares the
    remaining lhs row count (r6) with 4 and 8 and branches to the new
    LoopRow4 / LoopRow8 paths; larger counts fall through to LoopRow.
  * LoopRow4 accumulates only q4-q7 and LoopRow8 only q4-q11. Both
    still read 12 lhs values per depth step (q0, q1, then q2), then
    apply bias / Relu6 / Relu to their own accumulators and continue at
    the shared Write label.
  * The row loop now jumps back to LoopRowStart instead of LoopRow, so
    the dispatch is re-evaluated after every 12 rows.

Convolution1x1CPUKernel:
  * Keep the per-platform tile sizes in the new members row_tile_ and
    col_tile_ (set in InitConv1x1BiasWeight) instead of re-deriving
    them with #ifdef blocks at every use, and store
    UP_ROUND(row_, row_tile_) in matmul_param_->row_align_.
  * Move the per-platform RowMajor2Col{6,4,12}Major selection into
    PackMatmulInput().
  * DoConv1x1Hw packs and multiplies one row tile at a time into a
    per-thread slot at pack_input_ + task_id * row_tile_ * deep_.
    Run() therefore allocates thread_count_ * row_tile_ * deep_ floats
    when multi_thread_by_hw_ is set, and row_align_ * deep_ floats
    otherwise.
---
Reviewer notes (this section is ignored when the patch is applied):
  * This file was reconstructed from a copy whose newlines had been
    collapsed. Hunk line counts and the diffstat were re-checked, but
    leading indentation and blank context lines are reconstructed; if
    context does not match, retry with `git apply --ignore-whitespace`.
  * The template arguments of every reinterpret_cast were missing in
    that copy and have been restored from how each result is used
    (<float *> for float buffers, <char *> for the byte-offset memset).
    Confirm against the target tree.
  * The author e-mail address on the From: line was not present in that
    copy and has not been filled in.
  * Two small changes were made to added lines only: PackMatmulInput
    uses `#ifdef ENABLE_AVX` (matching the other guards in this file)
    and the local `cur_intput` is spelled `cur_input`.

 .../nnacl/assembly/arm32/MatmulFp32Opt12x4.S  | 160 +++++++++++++++++-
 .../kernel/arm/fp32/convolution_1x1_fp32.cc   | 101 +++++------
 .../kernel/arm/fp32/convolution_1x1_fp32.h    |   3 +
 3 files changed, 208 insertions(+), 56 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
index 03f0832923..bb765a7534 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
@@ -35,6 +35,12 @@ MatmulFloatNeon32Opt12x4:
     mov lr, #4
     mul r8, r8, lr // stride * sizeof(float)
 
+LoopRowStart:
+    cmp r6, #4
+    ble LoopRow4
+    cmp r6, #8
+    ble LoopRow8
+
 LoopRow:
     ldr r1, [sp, #-44] // reload rhs ptr
     ldr r7, [sp, #12] // reload rhs col
@@ -142,6 +148,158 @@ LoopRow:
             vmax.f32 q13, q13, q3
             vmax.f32 q14, q14, q3
             vmax.f32 q15, q15, q3
+            b Write
+
+LoopRow8:
+    ldr r1, [sp, #-44] // reload rhs ptr
+    ldr r7, [sp, #12] // reload rhs col
+    ldr r3, [sp, #-36] // reload bias ptr
+
+    LoopCol_R8:
+        ldr r2, [sp, #-40] // reload dst ptr
+        ldr r0, [sp, #-48] // reload lhs ptr
+        ldr r5, [sp, #4] // reload depth
+        vld1.32 {q3}, [r1]!
+        vld1.32 {q0, q1}, [r0]!
+        vmul.f32 q4, q3, d0[0]
+        vmul.f32 q5, q3, d0[1]
+        vmul.f32 q6, q3, d1[0]
+        vld1.32 {q2}, [r0]!
+        vmul.f32 q7, q3, d1[1]
+
+        vmul.f32 q8, q3, d2[0]
+        vmul.f32 q9, q3, d2[1]
+        vmul.f32 q10, q3, d3[0]
+        vmul.f32 q11, q3, d3[1]
+
+        subs r5, r5, #1
+        beq Bias_R8
+
+        LoopDepth_R8:
+            vld1.32 {q3}, [r1]!
+            vld1.32 {q0, q1}, [r0]!
+            vmla.f32 q4, q3, d0[0]
+            vmla.f32 q5, q3, d0[1]
+            vmla.f32 q6, q3, d1[0]
+            vld1.32 {q2}, [r0]!
+            vmla.f32 q7, q3, d1[1]
+
+            vmla.f32 q8, q3, d2[0]
+            vmla.f32 q9, q3, d2[1]
+            vmla.f32 q10, q3, d3[0]
+            vmla.f32 q11, q3, d3[1]
+
+            subs r5, r5, #1
+            bne LoopDepth_R8
+
+        Bias_R8:
+            cmp r3, #0
+            beq Activation_R8
+            vld1.32 {q0}, [r3]!
+            vadd.f32 q4, q4, q0
+            vadd.f32 q5, q5, q0
+            vadd.f32 q6, q6, q0
+            vadd.f32 q7, q7, q0
+            vadd.f32 q8, q8, q0
+            vadd.f32 q9, q9, q0
+            vadd.f32 q10, q10, q0
+            vadd.f32 q11, q11, q0
+
+        Activation_R8:
+            ldr lr, [sp]
+            cmp lr, #3
+            beq Relu6_R8
+            cmp lr, #1
+            beq Relu_R8
+            b Write
+
+        Relu6_R8:
+            vmov.i32 q2, #6
+            vcvt.f32.s32 q2, q2
+            vmin.f32 q4, q4, q2
+            vmin.f32 q5, q5, q2
+            vmin.f32 q6, q6, q2
+            vmin.f32 q7, q7, q2
+            vmin.f32 q8, q8, q2
+            vmin.f32 q9, q9, q2
+            vmin.f32 q10, q10, q2
+            vmin.f32 q11, q11, q2
+
+        Relu_R8:
+            veor q3, q3, q3
+            vmax.f32 q4, q4, q3
+            vmax.f32 q5, q5, q3
+            vmax.f32 q6, q6, q3
+            vmax.f32 q7, q7, q3
+            vmax.f32 q8, q8, q3
+            vmax.f32 q9, q9, q3
+            vmax.f32 q10, q10, q3
+            vmax.f32 q11, q11, q3
+            b Write
+
+LoopRow4:
+    ldr r1, [sp, #-44] // reload rhs ptr
+    ldr r7, [sp, #12] // reload rhs col
+    ldr r3, [sp, #-36] // reload bias ptr
+
+    LoopCol_R4:
+        ldr r2, [sp, #-40] // reload dst ptr
+        ldr r0, [sp, #-48] // reload lhs ptr
+        ldr r5, [sp, #4] // reload depth
+        vld1.32 {q3}, [r1]!
+        vld1.32 {q0, q1}, [r0]!
+        vmul.f32 q4, q3, d0[0]
+        vmul.f32 q5, q3, d0[1]
+        vmul.f32 q6, q3, d1[0]
+        vld1.32 {q2}, [r0]!
+        vmul.f32 q7, q3, d1[1]
+
+        subs r5, r5, #1
+        beq Bias_R4
+
+        LoopDepth_R4:
+            vld1.32 {q3}, [r1]!
+            vld1.32 {q0, q1}, [r0]!
+            vmla.f32 q4, q3, d0[0]
+            vmla.f32 q5, q3, d0[1]
+            vmla.f32 q6, q3, d1[0]
+            vld1.32 {q2}, [r0]!
+            vmla.f32 q7, q3, d1[1]
+
+            subs r5, r5, #1
+            bne LoopDepth_R4
+
+        Bias_R4:
+            cmp r3, #0
+            beq Activation_R4
+            vld1.32 {q0}, [r3]!
+            vadd.f32 q4, q4, q0
+            vadd.f32 q5, q5, q0
+            vadd.f32 q6, q6, q0
+            vadd.f32 q7, q7, q0
+
+        Activation_R4:
+            ldr lr, [sp]
+            cmp lr, #3
+            beq Relu6_R4
+            cmp lr, #1
+            beq Relu_R4
+            b Write
+
+        Relu6_R4:
+            vmov.i32 q2, #6
+            vcvt.f32.s32 q2, q2
+            vmin.f32 q4, q4, q2
+            vmin.f32 q5, q5, q2
+            vmin.f32 q6, q6, q2
+            vmin.f32 q7, q7, q2
+
+        Relu_R4:
+            veor q3, q3, q3
+            vmax.f32 q4, q4, q3
+            vmax.f32 q5, q5, q3
+            vmax.f32 q6, q6, q3
+            vmax.f32 q7, q7, q3
 
         Write:
             cmp r7, #1
@@ -398,7 +556,7 @@ LoopRow:
     cmp r6, #12
     ble LoopRowEnd
     sub r6, r6, #12 // lhs row - 12
-    b LoopRow
+    b LoopRowStart
 
 LoopRowEnd:
     sub sp, sp, #112
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
index 400c225bc3..e6a6b66db4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
@@ -62,6 +62,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
   matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
   matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
   matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
+  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
   matmul_param_->act_type_ = conv_param_->act_type_;
   return;
@@ -73,14 +74,21 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
   auto output_channel = filter_tensor->Batch();
 
 #ifdef ENABLE_AVX
-  int col_tile = C16NUM;
+  row_tile_ = C6NUM;
+  col_tile_ = C16NUM;
+#elif defined(ENABLE_SSE)
+  row_tile_ = C4NUM;
+  col_tile_ = C8NUM;
 #elif defined(ENABLE_ARM32)
-  int col_tile = C4NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C4NUM;
 #else
-  int col_tile = C8NUM;
+  row_tile_ = C12NUM;
+  col_tile_ = C8NUM;
 #endif
+
   if (in_tensors_.size() == 3) {
-    int size = UP_ROUND(output_channel, col_tile) * sizeof(float);
+    int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
     int weight_size = output_channel * sizeof(float);
     bias_data_ = malloc(size);
     if (bias_data_ == nullptr) {
@@ -91,8 +99,8 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
     memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  int size = input_channel * UP_ROUND(output_channel, col_tile) * sizeof(float);
-  int down_size = input_channel * DOWN_DIV(output_channel, col_tile) * col_tile * sizeof(float);
+  int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+  int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
@@ -113,27 +121,14 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
 }
 
 int Convolution1x1CPUKernel::InitConv1x1Param() {
-  int hw_tile = C12NUM;
-#ifdef ENABLE_AVX
-  hw_tile = C6NUM;
-#elif defined(ENABLE_SSE)
-  hw_tile = C4NUM;
-#endif
-  if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+  if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
     multi_thread_by_hw_ = true;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
   } else {
-#ifdef ENABLE_AVX
-    int col_tile = C16NUM;
-#elif defined(ENABLE_ARM32)
-    int col_tile = C4NUM;
-#else
-    int col_tile = C8NUM;
-#endif
     multi_thread_by_hw_ = false;
-    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile));
-    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile), thread_count_) * col_tile;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
   }
 
   pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
@@ -167,6 +162,16 @@ int Convolution1x1CPUKernel::Init() {
   return ReSize();
 }
 
+void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
+#ifdef ENABLE_AVX
+  RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
+#elif defined(ENABLE_SSE)
+  RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
+#else
+  RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
+#endif
+}
+
 int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
   int res_stride = matmul_param_->col_ - task_id * thread_stride_;
   int cur_oc = MSMIN(thread_stride_, res_stride);
@@ -198,20 +203,20 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
   }
 
   float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
-  float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
-
-#if ENABLE_AVX
-  RowMajor2Col6Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-  RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#else
-  RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
-#endif
-
+  float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
   float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
-  MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
-            matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
-            OutType_Nhwc);
+  float *cur_input = thread_input_ptr;
+  float *cur_output = thread_output_ptr;
+  for (int i = 0; i < cur_hw_; i += row_tile_) {
+    int cur_rows = (cur_hw_ - i >= row_tile_) ? row_tile_ : (cur_hw_ - i);
+    PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul_param_->deep_);
+    MatMulOpt(thread_pack_input, weight_ptr_, cur_output, reinterpret_cast<float *>(bias_data_),
+              matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_,
+              OutType_Nhwc);
+    cur_input += row_tile_ * matmul_param_->deep_;
+    cur_output += row_tile_ * matmul_param_->col_;
+  }
+
   return RET_OK;
 }
 
@@ -228,17 +233,9 @@ int Convolution1x1RunHw(void *cdata, int task_id) {
 int Convolution1x1CPUKernel::Run() {
   auto src_in = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
   auto src_out = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
-
-#ifdef ENABLE_AVX
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_6_ * matmul_param_->deep_ * sizeof(float)));
-#elif defined(ENABLE_SSE)
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_4_ * matmul_param_->deep_ * sizeof(float)));
-#else
-  pack_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_12_ * matmul_param_->deep_ * sizeof(float)));
-#endif
+  int pack_input_size = multi_thread_by_hw_ ? (thread_count_ * row_tile_ * matmul_param_->deep_)
+                                            : (matmul_param_->row_align_ * matmul_param_->deep_);
+  pack_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_input_size * sizeof(float)));
   if (pack_input_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
     return RET_MEMORY_FAILED;
@@ -256,13 +253,7 @@ int Convolution1x1CPUKernel::Run() {
     if (multi_thread_by_hw_) {
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
     } else {
-#ifdef ENABLE_AVX
-      RowMajor2Col6Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#elif defined(ENABLE_SSE)
-      RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#else
-      RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-#endif
+      PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
       ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
     }
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
index 342ca4c5b2..8594784fb8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
@@ -51,6 +51,7 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
   int InitConv1x1BiasWeight();
   void InitConv1x1MatmulParam();
   void FreeTmpBuffer();
+  void PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col);
 
  private:
   MatMulParameter *matmul_param_ = nullptr;
@@ -62,6 +63,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
   float *pack_input_ = nullptr;
   float *input_ptr_ = nullptr;
   float *output_ptr_ = nullptr;
+  int row_tile_ = 0;
+  int col_tile_ = 0;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_