From 1cd66fb7299d27447abd76acc71446abee558924 Mon Sep 17 00:00:00 2001
From: lzk
Date: Mon, 21 Dec 2020 02:25:18 -0800
Subject: [PATCH] matmul depthwise optimize

---
 .../lite/nnacl/fp32/conv_depthwise_fp32.c    |   2 +
 .../lite/nnacl/fp32/conv_depthwise_fp32.h    |   3 +
 .../nnacl/x86_64_sse/ConvDwFp32IndirectRow.c | 116 ++++++++++++++++++
 .../arm/fp32/convolution_depthwise_fp32.cc   |   7 +-
 .../kernel/arm/fp32/fullconnection_fp32.cc   |   2 +-
 5 files changed, 123 insertions(+), 7 deletions(-)
 create mode 100644 mindspore/lite/nnacl/x86_64_sse/ConvDwFp32IndirectRow.c

diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
index a5f7847ee3..3f0d86e6cb 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
@@ -685,6 +685,8 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
                            int output_width, int input_stride, bool relu, bool relu6, int kernel) {
   if (kernel == 9) {
     ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, input_stride * sizeof(float *), relu, relu6);
+  } else if (kernel == 25) {
+    ConvDwFp32Avx5x5(output, input, weights, bias, channels, output_width, input_stride * sizeof(float *), relu, relu6);
   }
 }
 #endif
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
index 6efe5e7b6e..933b0bf633 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
@@ -69,6 +69,9 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c
 #ifdef ENABLE_AVX
 void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, size_t channels,
                       size_t output_width, size_t input_stride, size_t relu, size_t relu6);
+
+void ConvDwFp32Avx5x5(float *output, float **input, const float *weights, const float *bias, size_t channels,
+                      size_t output_width, size_t input_stride, size_t relu, size_t relu6);
 #endif
 
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
diff --git a/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32IndirectRow.c b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32IndirectRow.c
new file mode 100644
index 0000000000..924c21b93f
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32IndirectRow.c
@@ -0,0 +1,116 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_AVX
+
+#include <x86intrin.h>
+#include "nnacl/fp32/conv_depthwise_fp32.h"
+
+void ConvDwFp32Avx5x5(float *output, float **input, const float *weights, const float *bias, size_t channels,
+                      size_t output_width, size_t input_stride, size_t relu, size_t relu6) {
+  input_stride /= sizeof(float *);
+  size_t c8 = UP_DIV(channels, C8NUM) * C8NUM;
+  size_t c8_mod = channels % C8NUM;
+  int kernel = 25;
+  for (int i = 0; i < output_width; ++i) {
+    float *in[kernel];
+    for (int k = 0; k < kernel; k++) {
+      in[k] = input[k];
+    }
+    input += input_stride;
+    size_t c = c8;
+    const float *w = weights;
+    const float *bias1 = bias;
+    for (; c >= C8NUM; c -= C8NUM) {
+      __m256 out1 = _mm256_loadu_ps(bias1);
+      bias1 += 8;
+      for (int k = 0; k < kernel; k += 5) {
+        __m256 in1 = _mm256_loadu_ps(in[k]);
+        __m256 w1 = _mm256_loadu_ps(w);
+        __m256 in2 = _mm256_loadu_ps(in[k + 1]);
+        __m256 w2 = _mm256_loadu_ps(w + 8);
+        out1 = _mm256_fmadd_ps(in1, w1, out1);
+        __m256 in3 = _mm256_loadu_ps(in[k + 2]);
+        __m256 w3 = _mm256_loadu_ps(w + 16);
+        out1 = _mm256_fmadd_ps(in2, w2, out1);
+        __m256 in4 = _mm256_loadu_ps(in[k + 3]);
+        __m256 w4 = _mm256_loadu_ps(w + 24);
+        out1 = _mm256_fmadd_ps(in3, w3, out1);
+        __m256 in5 = _mm256_loadu_ps(in[k + 4]);
+        __m256 w5 = _mm256_loadu_ps(w + 32);
+        out1 = _mm256_fmadd_ps(in4, w4, out1);
+        w += 40;
+        in[k] += C8NUM;
+        in[k + 1] += C8NUM;
+        in[k + 2] += C8NUM;
+        in[k + 3] += C8NUM;
+        in[k + 4] += C8NUM;
+        out1 = _mm256_fmadd_ps(in5, w5, out1);
+      }
+      if (relu6 != 0) {
+        __m256 relu6_data = _mm256_set1_ps(6.0);
+        out1 = _mm256_min_ps(out1, relu6_data);
+      }
+      if (relu != 0 || relu6 != 0) {
+        __m256 zero = _mm256_setzero_ps();
+        out1 = _mm256_max_ps(out1, zero);
+      }
+      if (c == C8NUM) {
+        __m128 tmp;
+        switch (c8_mod) {
+          case 1:
+            _mm_store_ss(output, _mm256_castps256_ps128(out1));
+            break;
+          case 2:
+            _mm_storel_pi((__m64 *)output, _mm256_castps256_ps128(out1));
+            break;
+          case 3:
+            tmp = _mm256_castps256_ps128(out1);
+            _mm_storel_pi((__m64 *)output, tmp);
+            tmp = _mm_unpackhi_ps(tmp, tmp);
+            _mm_store_ss(output + 2, tmp);
+            break;
+          case 4:
+            _mm_storeu_ps(output, _mm256_castps256_ps128(out1));
+            break;
+          case 5:
+            _mm_storeu_ps(output, _mm256_castps256_ps128(out1));
+            _mm_store_ss(output + 4, _mm256_extractf128_ps(out1, 1));
+            break;
+          case 6:
+            _mm_storeu_ps(output, _mm256_castps256_ps128(out1));
+            _mm_storel_pi((__m64 *)(output + 4), _mm256_extractf128_ps(out1, 1));
+            break;
+          case 7:
+            _mm_storeu_ps(output, _mm256_castps256_ps128(out1));
+            tmp = _mm256_extractf128_ps(out1, 1);
+            _mm_storel_pi((__m64 *)(output + 4), tmp);
+            tmp = _mm_unpackhi_ps(tmp, tmp);
+            _mm_store_ss(output + 6, tmp);
+            break;
+          default:
+            _mm256_storeu_ps(output, out1);
+            break;
+        }
+        output += c8_mod == 0 ? C8NUM : c8_mod;
+      } else {
+        _mm256_storeu_ps(output, out1);
+        output += C8NUM;
+      }
+    }
+  }
+}
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
index a88b43cd3d..305f2716c0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -147,12 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector
   conv_param->input_channel_ = inputs[kInputIndex]->Channel();
   conv_param->output_h_ = outputs[kOutputIndex]->Height();
   conv_param->output_w_ = outputs[kOutputIndex]->Width();
-#ifdef ENABLE_AVX
-  if (conv_param->kernel_h_ == 3 && conv_param->kernel_w_ == 3) {
-    kernel =
-      new (std::nothrow) kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  }
-#elif defined(ENABLE_ARM64)
+#if defined(ENABLE_ARM64) || defined(ENABLE_AVX)
   if (CheckConvDwUseIndirectBuffer(conv_param)) {
     kernel =
       new (std::nothrow) kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
index c1bf9321bc..8395313933 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
@@ -60,7 +60,7 @@ int FullconnectionCPUKernel::ReSize() {
 #endif
   fc_param_->row_12_ = UP_ROUND(fc_param_->row_, C12NUM);
   fc_param_->col_align_ = UP_ROUND(fc_param_->col_, col_tile);
-  fc_param_->row_6_ = UP_ROUND(fc_param_->col_, C6NUM);
+  fc_param_->row_6_ = UP_ROUND(fc_param_->row_, C6NUM);
   fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_align_, col_tile));
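
Reviewer note (not part of the patch): the c8_mod switch at the end of ConvDwFp32Avx5x5 exists so that the last, partially filled 8-channel block only writes the channels that are actually valid, instead of storing a full __m256 past the end of the output buffer. Below is a minimal, self-contained sketch of that partial-store pattern. The helper name store_low_n, the test values, and the build line (e.g. gcc -mavx sketch.c) are illustrative assumptions, not code taken from this patch.

/* Sketch: store only the low n lanes of an __m256, mirroring the c8_mod tail handling above. */
#include <stdio.h>
#include <x86intrin.h>

static void store_low_n(float *dst, __m256 v, int n) {
  __m128 lo = _mm256_castps256_ps128(v);   /* lanes 0..3 */
  __m128 hi = _mm256_extractf128_ps(v, 1); /* lanes 4..7 */
  switch (n) {
    case 1:
      _mm_store_ss(dst, lo); /* 1 float */
      break;
    case 2:
      _mm_storel_pi((__m64 *)dst, lo); /* 2 floats */
      break;
    case 3:
      _mm_storel_pi((__m64 *)dst, lo);
      _mm_store_ss(dst + 2, _mm_unpackhi_ps(lo, lo)); /* lane 2 */
      break;
    case 4:
      _mm_storeu_ps(dst, lo);
      break;
    case 5:
      _mm_storeu_ps(dst, lo);
      _mm_store_ss(dst + 4, hi);
      break;
    case 6:
      _mm_storeu_ps(dst, lo);
      _mm_storel_pi((__m64 *)(dst + 4), hi);
      break;
    case 7:
      _mm_storeu_ps(dst, lo);
      _mm_storel_pi((__m64 *)(dst + 4), hi);
      _mm_store_ss(dst + 6, _mm_unpackhi_ps(hi, hi)); /* lane 6 */
      break;
    default: /* n == 0 (treated as a full block) or n == 8 */
      _mm256_storeu_ps(dst, v);
      break;
  }
}

int main(void) {
  float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float dst[8] = {0};
  store_low_n(dst, _mm256_loadu_ps(src), 5); /* writes dst[0..4] only */
  for (int i = 0; i < 8; ++i) {
    printf("%g ", dst[i]); /* expected: 1 2 3 4 5 0 0 0 */
  }
  printf("\n");
  return 0;
}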