From f5c86d128346476a297539a077eaf012bfc0d3c0 Mon Sep 17 00:00:00 2001
From: lzk
Date: Thu, 17 Dec 2020 17:26:17 -0800
Subject: [PATCH] x86 sse optimize

---
 .../lite/nnacl/fp32/conv_depthwise_fp32.c   | 23 ++++++++--
 .../lite/nnacl/fp32/conv_depthwise_fp32.h   |  5 +++
 mindspore/lite/nnacl/pack.c                 | 41 +++++++++++++++++
 mindspore/lite/nnacl/pack.h                 |  4 ++
 .../arm/fp32/convolution_depthwise_fp32.cc  |  2 +-
 .../convolution_depthwise_indirect_fp32.cc  | 45 ++++++++++++++-----
 6 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
index 83467834bc..a5f7847ee3 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
@@ -587,10 +587,16 @@ bool CheckConvDwUseIndirectBuffer(const ConvParameter *conv_param) {
 
 void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, const ConvParameter *conv_param,
                            int step_h, int step_w) {
-  int ic_4 = UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM;
+#ifdef ENABLE_AVX
+  int div = C8NUM;
+#else
+  int div = C4NUM;
+#endif
+
+  int ic_div = UP_DIV(conv_param->input_channel_, div) * div;
   for (int b = 0; b < conv_param->output_batch_; b++) {
     float **indirect = indirect_buffer + b * conv_param->output_h_ * step_h;
-    float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_4;
+    float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_div;
     for (int oh = 0; oh < conv_param->output_h_; oh++) {
       for (int kh = 0; kh < conv_param->kernel_h_; kh++) {
         int ih = oh * conv_param->stride_h_ + kh * conv_param->dilation_h_ - conv_param->pad_u_;
@@ -600,7 +606,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
           int iw = ow * conv_param->stride_w_ + kw * conv_param->dilation_w_ - conv_param->pad_l_;
           int index = oh * step_h + ow * step_w * conv_param->kernel_h_ + kw * conv_param->kernel_h_ + kh;
           if (iw < conv_param->input_w_ && iw >= 0) {
-            indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_4;
+            indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_div;
           } else {
             indirect[index] = zero_ptr;
           }
@@ -619,7 +625,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
   }
 }
 
-#ifndef ENABLE_ARM64
+#if !defined(ENABLE_ARM64) && !defined(ENABLE_AVX)
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                            int output_width, int input_stride, bool relu, bool relu6, int kernel) {
   do {
@@ -674,6 +680,15 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
 }
 #endif
 
+#ifdef ENABLE_AVX
+void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
+                           int output_width, int input_stride, bool relu, bool relu6, int kernel) {
+  if (kernel == 9) {
+    ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, input_stride * sizeof(float *), relu, relu6);
+  }
+}
+#endif
+
 void ConvDwIndirection(float *output_data, float **indirect_buffer, const float *weight_data, const float *bias_data,
                        float *zero_ptr, const ConvParameter *conv_param, int task_id) {
   int step_w = conv_param->dilation_w_ == 1 ? conv_param->stride_w_ : conv_param->kernel_w_;
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
index 7aae3a51f2..f81077fce8 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.h
@@ -66,6 +66,11 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c
                            int output_width, size_t input_stride, size_t relu, size_t relu6);
 #endif
 
+#ifdef ENABLE_AVX
+void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, int channels,
+                      int output_width, size_t input_stride, size_t relu, size_t relu6);
+#endif
+
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                            int output_width, int input_stride, bool relu, bool relu6, int kernel);
 
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 245fbb11ca..13b5c9a13d 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -500,6 +500,30 @@ void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int c
   }
 }
 
+void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
+  int c8 = UP_DIV(channel, C8NUM);
+  int c8_channel = c8 * C8NUM;
+  int nhwc8_batch_unit_offset = c8 * C8NUM * plane;
+  int ic_remainder_ = channel % C8NUM;
+  if (ic_remainder_ != 0) {
+    int nhwc8_batch_offset = 0;
+    for (int b = 0; b < batch; b++) {
+      int batch_offset = b * channel * plane;
+      for (int i = 0; i < plane; i++) {
+        float *dst_per_plane = (float *)dst + nhwc8_batch_offset + i * c8_channel;
+        memcpy(dst_per_plane, (float *)src + batch_offset + i * channel, channel * sizeof(float));
+        for (int j = channel; j < c8_channel; ++j) {
+          dst_per_plane[j] = 0;
+        }
+      }
+      nhwc8_batch_offset += nhwc8_batch_unit_offset;
+    }
+  } else {
+    size_t ori_input_size = batch * plane * channel * sizeof(float);
+    memcpy((float *)dst, (float *)src, ori_input_size);
+  }
+}
+
 void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   int ic_remainder_ = channel % C4NUM;
@@ -600,6 +624,23 @@ void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, i
   }
 }
 
+void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel) {
+  int c8 = UP_DIV(channel, C8NUM);
+  for (int c = 0; c < c8; c++) {
+    int dst_off_c = c * C8NUM * height * width;
+    for (int i = 0; i < C8NUM; i++) {
+      int src_off_c = (c * C8NUM + i) * height * width;
+      for (int kh = 0; kh < height; kh++) {
+        int src_off_kh = src_off_c + kh * width;
+        for (int kw = 0; kw < width; kw++) {
+          int dst_off = dst_off_c + kw * height * C8NUM + kh * C8NUM + i;
+          ((float *)dst)[dst_off] = ((float *)src)[src_off_kh + kw];
+        }
+      }
+    }
+  }
+}
+
 void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   int c4_channel = c4 * C4NUM;
diff --git a/mindspore/lite/nnacl/pack.h b/mindspore/lite/nnacl/pack.h
index 33f6321e28..13288c50e7 100644
--- a/mindspore/lite/nnacl/pack.h
+++ b/mindspore/lite/nnacl/pack.h
@@ -64,6 +64,8 @@ void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
 
 void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
 
+void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
+
 void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel);
 
@@ -80,6 +82,8 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int
 
 void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, int width, int channel);
 
+void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel);
+
 void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWC4ToNHWCInt8(const void *src, void *dst, int batch, int plane, int channel);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
index 7958e09d82..305f2716c0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -147,7 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector
   conv_param->input_channel_ = inputs[kInputIndex]->Channel();
   conv_param->output_h_ = outputs[kOutputIndex]->Height();
   conv_param->output_w_ = outputs[kOutputIndex]->Width();
-#ifdef ENABLE_ARM64
+#if defined(ENABLE_ARM64) || defined(ENABLE_AVX)
   if (CheckConvDwUseIndirectBuffer(conv_param)) {
     kernel = new (std::nothrow)
       kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
index a3eabd9834..156ceed79a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
@@ -47,37 +47,47 @@ int ConvolutionDepthwiseIndirectCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
-  int C4 = UP_DIV(weight_tensor->Batch(), C4NUM);
-  int pack_weight_size = C4NUM * C4 * weight_tensor->Height() * weight_tensor->Width();
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
+  int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_AVX
+  PackDepthwiseIndirectWeightC8Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
+                                    weight_tensor->Batch());
+#else
   PackDepthwiseIndirectWeightC4Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
                                     weight_tensor->Batch());
+#endif
 
   auto bias_tensor = in_tensors_[kBiasIndex];
-  bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
+  bias_data_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, C4NUM * C4 * sizeof(float));
+  memset(bias_data_, 0, batch_flag * div_flag * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
     memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float));
   }
 
   // malloc zero ptr
-  zero_ptr_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
+  zero_ptr_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
   if (zero_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(zero_ptr_, 0, C4NUM * C4 * sizeof(float));
+  memset(zero_ptr_, 0, batch_flag * div_flag * sizeof(float));
   return RET_OK;
 }
@@ -139,8 +149,13 @@ int ConvDwIndirectRun(void *cdata, int task_id) {
 }
 
 int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
-  int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  int IC_DIV = UP_DIV(conv_param_->input_channel_, div_flag);
+  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * div_flag * IC_DIV;
   packed_input_ = reinterpret_cast<float *>(context_->allocator->Malloc(pack_input_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -152,14 +167,24 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_ptr = reinterpret_cast<float *>(input_tensor->data_c());
-  if (conv_param_->input_channel_ % C4NUM != 0) {
+#ifdef ENABLE_AVX
+  int div_flag = C8NUM;
+#else
+  int div_flag = C4NUM;
+#endif
+  if (conv_param_->input_channel_ % div_flag != 0) {
     auto ret = MallocPackedInput();
     if (ret != 0) {
      MS_LOG(ERROR) << "Convolution depthwise fp32 indirect buffer MallocPackedInput failed.";
      return RET_ERROR;
    }
+#ifdef ENABLE_AVX
+    PackNHWCToNHWC8Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+#else
     PackNHWCToNHWC4Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+#endif
   } else {
     packed_input_ = input_ptr;
   }
@@ -174,7 +199,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
     MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-  if (conv_param_->input_channel_ % C4NUM != 0) {
+  if (conv_param_->input_channel_ % div_flag != 0) {
     context_->allocator->Free(packed_input_);
   }
   return RET_OK;
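
AVX registers hold eight packed floats, so under ENABLE_AVX the depthwise-indirect path above switches its channel tiling from C4 to C8: inputs are packed with PackNHWCToNHWC8Fp32, weights with PackDepthwiseIndirectWeightC8Fp32, and ConvDwInitIndirection strides by the padded channel count ic_div. The sketch below is a minimal standalone illustration of that padded NHWC8 layout; the helper name pack_nhwc_to_nhwc8, the local UP_DIV/C8NUM definitions, and the demo shape are assumptions for the example, not taken from the patch.

/* Minimal sketch of the NHWC -> NHWC8 packing layout (illustrative only). */
#include <stdio.h>
#include <string.h>

#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Copy each spatial position's channels and zero-pad them up to a multiple
 * of eight, mirroring the layout the patch's PackNHWCToNHWC8Fp32 produces. */
static void pack_nhwc_to_nhwc8(const float *src, float *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    for (int p = 0; p < plane; p++) {
      const float *s = src + (b * plane + p) * channel;
      float *d = dst + (b * plane + p) * c8_channel;
      memcpy(d, s, channel * sizeof(float));
      for (int c = channel; c < c8_channel; c++) {
        d[c] = 0.0f; /* padded lanes stay zero so 8-wide vector loads are always valid */
      }
    }
  }
}

int main(void) {
  /* 1 batch, 2 spatial positions, 5 channels -> each position padded to 8 channels. */
  float src[1 * 2 * 5] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  float dst[1 * 2 * 8];
  pack_nhwc_to_nhwc8(src, dst, 1, 2, 5);
  for (int p = 0; p < 2; p++) {
    for (int c = 0; c < 8; c++) {
      printf("%4.0f", dst[p * 8 + c]);
    }
    printf("\n");
  }
  return 0;
}

With the channel tail zero-padded, the 8-lane AVX kernel (ConvDwFp32Avx3x3) needs no per-channel remainder handling, which is why the kernel-side packing and the indirection-buffer stride both use the same rounded-up channel count.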