!6878 [MS][LITE][CPU]optimize fp16 winograd

Merge pull request !6878 from fuzhiye/tmp
pull/6878/MERGE
mindspore-ci-bot committed by Gitee
commit 0fdb359775

File diff suppressed because it is too large.

@@ -40,17 +40,6 @@ void IndirectGemmFp16_16x8_c8(float16_t *output, float16_t *input, float16_t *we
#ifdef __cplusplus
extern "C" {
#endif
void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding);
void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int height,
int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int ic, int in_sh_step,
int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6);
// fp16 sliding window
void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data,
float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param,
SlidingWindowParam *slidingWindow_param);
// fp16 convolution common (im2col+gemm)
void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_weight, float16_t *bias_data,
@@ -69,17 +58,9 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i
// fp16 convolution winograd
void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
MatricesFp16 *matrices);
void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
int output_unit);
void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
int output_unit);
float16_t *output_data, TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
InputTransFp16Func in_func, OutputTransFp16Func out_func);
void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
int output_unit);
#ifdef __cplusplus
}
#endif
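
Note (not part of the diff): this header drops the fp16 sliding-window entry points (SWBorderFp16, SWCenterFp16, ConvSWFp16), and ConvWinogardFp16 no longer takes the raw transform matrices; it now receives per-tile-size transform callbacks. The exact typedefs are not shown in this excerpt, so the sketch below reconstructs them from the call sites later in the diff (func(tmp_data, trans_input_ptr, C8NUM, dst_step) and func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w, output_channel, r_w, r_h, r_c)); the parameter names are assumptions.

#include <arm_neon.h>  // provides float16_t on ARM builds with FP16 support

// Reconstructed from the call sites in this PR; names are guesses, not repository code.
typedef void (*InputTransFp16Func)(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
typedef void (*OutputTransFp16Func)(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
                                    int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c);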

@@ -569,8 +569,8 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data,
// fp16 common winograd
void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b,
float16_t *matrix_bt) {
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransFp16Func func) {
const int tile_num = 16;
int input_unit = conv_param->input_unit_;
int output_unit = conv_param->output_unit_;
@@ -593,36 +593,56 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
int interval_x_e = src_x_e < input_w ? input_unit : (input_w - src_x_s);
int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
int src_plane_offset = ic8 * C8NUM * (src_y_s * input_w + src_x_s);
int src_plane_offset = in_channel * (src_y_s * input_w + src_x_s);
int dst_plane_offset = c * C4NUM;
for (int ic = 0; ic < ic8; ic++) {
// clear tmp buffer
memset(tmp_data, 0, input_unit * input_unit * C8NUM * sizeof(float16_t));
// get real input block with padding
int real_c = in_channel - ic * C8NUM;
real_c = real_c > C8NUM ? C8NUM : real_c;
int src_ic8_offset = src_plane_offset + ic * C8NUM;
for (int interval = interval_y_s; interval < interval_y_e; interval++) {
int src_y_offset = src_ic8_offset + (interval * input_w + interval_x_s) * ic8 * C8NUM;
int dst_y_offset = interval * input_unit * C8NUM + interval_x_s * C8NUM;
for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
int src_x_offset = src_y_offset + j * ic8 * C8NUM;
int dst_x_offset = dst_y_offset + j * C8NUM;
const float16_t *src_addr = input_data + src_x_offset;
float16_t *dst_addr = tmp_data + dst_x_offset;
// get real input block with padding
if (real_c == C8NUM) {
for (int interval = interval_y_s; interval < interval_y_e; interval++) {
int src_y_offset = src_ic8_offset + (interval * input_w + interval_x_s) * in_channel;
int dst_y_offset = interval * input_unit * C8NUM + interval_x_s * C8NUM;
for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
int src_x_offset = src_y_offset + j * in_channel;
int dst_x_offset = dst_y_offset + j * C8NUM;
const float16_t *src_addr = input_data + src_x_offset;
float16_t *dst_addr = tmp_data + dst_x_offset;
#ifdef ENABLE_NEON
vst1q_f16(dst_addr, vld1q_f16(src_addr));
vst1q_f16(dst_addr, vld1q_f16(src_addr));
#else
for (int k = 0; k < C8NUM; k++) {
dst_addr[k] = src_addr[k];
}
for (int k = 0; k < C8NUM; k++) {
dst_addr[k] = src_addr[k];
}
#endif
}
}
} else {
for (int interval = interval_y_s; interval < interval_y_e; interval++) {
int src_y_offset = src_ic8_offset + (interval * input_w + interval_x_s) * in_channel;
int dst_y_offset = interval * input_unit * C8NUM + interval_x_s * C8NUM;
for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
int src_x_offset = src_y_offset + j * in_channel;
int dst_x_offset = dst_y_offset + j * C8NUM;
const float16_t *src_addr = input_data + src_x_offset;
float16_t *dst_addr = tmp_data + dst_x_offset;
for (int k = 0; k < real_c; k++) {
dst_addr[k] = src_addr[k];
}
}
}
}
// input transform
int dst_ic8_offset = dst_plane_offset + ic * tile_num * C8NUM;
size_t dst_step = ic8 * C8NUM * tile_num;
float16_t *trans_input_ptr = trans_input + dst_ic8_offset;
GeneralInputTransformUnitFp16(tmp_data, trans_input_ptr, matrix_b, matrix_bt, C8NUM, dst_step, input_unit);
func(tmp_data, trans_input_ptr, C8NUM, dst_step);
}
out_tile_index++;
} // cal_tile_num loop
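
Note (not part of the diff): the rewritten input transform reads straight from the NHWC input, so the per-pixel stride becomes in_channel instead of ic8 * C8NUM, and the copy into the zero-initialized tile buffer is split into a full-block fast path (NEON load/store of 8 fp16 values) and a scalar tail for the last partial channel block (real_c < C8NUM). A minimal sketch of that pattern, assuming an ARM toolchain with fp16 NEON intrinsics:

#include <arm_neon.h>

#define C8NUM 8

// Copy one channel block of a pixel into the tile buffer (illustration of the pattern above).
static inline void CopyChannelBlockFp16(float16_t *dst, const float16_t *src, int real_c) {
  if (real_c == C8NUM) {
    vst1q_f16(dst, vld1q_f16(src));      // full block: 8 fp16 lanes in one load/store
  } else {
    for (int k = 0; k < real_c; k++) {   // tail block: copy only the valid channels
      dst[k] = src[k];
    }
  }
}
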
@@ -630,12 +650,10 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
float16_t *matrix_a, float16_t *matrix_at) {
OutputTransFp16Func func) {
int output_unit = conv_param->output_unit_;
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
int output_w_unit_block = UP_DIV(output_w, output_unit);
int output_h_unit_block = UP_DIV(output_h, output_unit);
int output_channel = conv_param->output_channel_;
int oc8 = UP_DIV(output_channel, C8NUM);
int input_unit = conv_param->input_unit_;
@@ -645,18 +663,27 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
for (int i = 0; i < cal_num; i++) {
int dst_x_s = out_tile_index % output_unit_num;
int dst_y_s = out_tile_index / output_unit_num;
int r_w = output_w - dst_x_s * output_unit;
r_w = r_w > output_unit ? output_unit : r_w;
int r_h = output_h - dst_y_s * output_unit;
r_h = r_h > output_unit ? output_unit : r_h;
int tmp_ix = dst_x_s * output_unit;
dst_x_s = tmp_ix > output_w ? output_w : tmp_ix;
int tmp_iy = dst_y_s * output_unit;
dst_y_s = tmp_iy > output_h ? output_h : tmp_iy;
int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
int dst_tile_offset = C8NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);
int dst_tile_offset = output_channel * (dst_x_s + dst_y_s * output_w);
for (int j = 0; j < oc8; j++) {
int r_c = output_channel - j * C8NUM;
r_c = r_c > C8NUM ? C8NUM : r_c;
int src_oc8_offset = src_tile_offset + j * input_unit * input_unit * C8NUM;
int dst_oc8_offset =
dst_tile_offset + j * C8NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
int dst_oc8_offset = dst_tile_offset + j * C8NUM;
const float16_t *src_ptr = gemm_out + src_oc8_offset;
const float16_t *bias_ptr = bias_data + j * C8NUM;
float16_t *dst_ptr = tmp_out_data + dst_oc8_offset;
GeneralOutputTransformUnitFp16(src_ptr, dst_ptr, bias_ptr, matrix_a, matrix_at, C8NUM,
output_w_unit_block * output_unit, input_unit, output_unit);
func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w, output_channel, r_w, r_h, r_c);
}
out_tile_index++;
}
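
Note (not part of the diff): the output transform now writes each tile directly into the NHWC output (dst_tile_offset = output_channel * (dst_x_s + dst_y_s * output_w)) and clips it by r_w, r_h and r_c at the right, bottom and channel edges, which is what makes the tmp_out staging buffer and the UnPackWinogradOutputFp16/Relu/Relu6 passes removable. A worked example of the clipping arithmetic, with values chosen for illustration:

#define MSMIN(a, b) ((a) < (b) ? (a) : (b))  // same definition as in nnacl/op_base.h

// output_w = output_h = 7, output_channel = 10, output_unit = 4, tile at (dst_x_s, dst_y_s) = (4, 4), block j = 1
int r_w = MSMIN(4, 7 - 4);   // = 3: tile clipped at the right edge
int r_h = MSMIN(4, 7 - 4);   // = 3: tile clipped at the bottom edge
int r_c = MSMIN(8, 10 - 8);  // = 2: the second C8NUM block holds only 2 valid channels
// The transform writes a 3x3x2 fragment of the 4x4 tile straight into the NHWC output.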

@@ -43,12 +43,12 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data,
// fp16 common winograd
void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b,
float16_t *matrix_bt);
int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
InputTransFp16Func func);
void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
float16_t *matrix_a, float16_t *matrix_at);
OutputTransFp16Func func);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -41,8 +41,7 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
#endif
int output_tile_count = UP_DIV(output_count, cal_num);
int kernel_plane = kernel_h * kernel_w;
int unit_size = kernel_plane * in_channel;
int deep = in_channel * kernel_plane;
int deep = kernel_plane * in_channel;
for (int b = 0; b < in_batch; b++) {
int in_batch_offset = b * in_channel * in_h * in_w;
@@ -50,9 +49,9 @@
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
int start_index = thread_id * cal_num;
int real_cal_num = (output_count - start_index) < cal_num ? (output_count - start_index) : cal_num;
float *gemm_input = packed_input + task_id * unit_size * cal_num;
float *col_major_gemm_input = col_major_input + task_id * unit_size * cal_num;
size_t packed_input_size = unit_size * cal_num * sizeof(float);
float *gemm_input = packed_input + task_id * deep * cal_num;
float *col_major_gemm_input = col_major_input + task_id * deep * cal_num;
size_t packed_input_size = deep * cal_num * sizeof(float);
memset(gemm_input, 0, packed_input_size);
memset(col_major_gemm_input, 0, packed_input_size);
Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
@@ -95,8 +94,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
float *trans_input = buffer_list[0];
float *gemm_out = buffer_list[1];
float *tmp_data = buffer_list[3];
float *col_buffer = buffer_list[4];
float *tmp_data = buffer_list[2];
float *col_buffer = buffer_list[3];
int trans_input_offset = tile_num * input_unit_square * ic4 * C4NUM;
int gemm_out_offset = tile_num * input_unit_square * oc8 * C8NUM;
int tmp_data_offset = input_unit_square * C4NUM;
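
Note (not part of the diff): the fp32 path gets the matching clean-up. unit_size and deep were the same quantity (kernel_h * kernel_w * in_channel), so the duplicate variable is dropped, and with the NC4HW4 staging output gone the winograd temporary-buffer list loses one entry, shifting tmp_data and col_buffer down a slot. The mapping, for reference (the enum is illustrative only, not code from the repository):

// before: [0] trans_input  [1] gemm_out  [2] tmp_out_data  [3] tmp_data  [4] col_buffer
// after : [0] trans_input  [1] gemm_out  [2] tmp_data      [3] col_buffer
enum WinogradTmpBufferIndex { kTransInput = 0, kGemmOut = 1, kTmpData = 2, kColBuffer = 3 };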

@@ -16,7 +16,6 @@
#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include <vector>
#include "src/runtime/kernel/arm/fp16/convolution_sw_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
@@ -203,19 +202,13 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int stride_h = conv_param->stride_h_;
int stride_w = conv_param->stride_w_;
int dilation_h = conv_param->dilation_h_;
int dilation_w = conv_param->dilation_w_;
conv_param->input_h_ = inputs.front()->Height();
conv_param->input_w_ = inputs.front()->Width();
conv_param->output_h_ = outputs.front()->Height();
conv_param->output_w_ = outputs.front()->Width();
kernel::LiteKernel *kernel = nullptr;
if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else if (kernel_h == 1 && kernel_w == 1) {
if (kernel_h == 1 && kernel_w == 1) {
kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
bool use_winograd = false;
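
Note (not part of the diff): the fp16 kernel creator no longer routes 3x3 / stride 1 / dilation 1 convolutions to a dedicated 3x3 kernel; that shape now falls into the generic else branch, whose use_winograd check (truncated above) picks the winograd kernel. The removed condition is exactly the shape class that Winograd F(m, 3x3) covers, so the special case is redundant. For illustration, the predicate the deleted branch tested:

#include <stdbool.h>

// Illustrative helper, not a repository function.
static inline bool Is3x3Stride1Dilation1(int kernel_h, int kernel_w, int stride_h, int stride_w,
                                         int dilation_h, int dilation_w) {
  return kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
}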

@@ -1,236 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/convolution_sw_fp16.h"
#include <vector>
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp32/conv_depthwise.h"
#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
namespace mindspore::kernel {
int ConvolutionSWFP16CPUKernel::ProcessFilter() {
int kernel_h = conv_param_->kernel_h_;
int kernel_w = conv_param_->kernel_w_;
int in_channel = conv_param_->input_channel_;
int out_channel = conv_param_->output_channel_;
int ic4 = UP_DIV(in_channel, C4NUM);
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}
for (int oc = 0; oc < out_channel; ++oc) {
int src_oc_offset = oc * kernel_h * kernel_w * in_channel;
int dst_oc_offset = oc * kernel_h * kernel_w * ic4 * C4NUM;
for (int i = 0; i < kernel_h * kernel_w; ++i) {
const float16_t *src = execute_weight_ + src_oc_offset + i * in_channel;
float16_t *dst = packed_weight_ + dst_oc_offset + i * ic4 * C4NUM;
memcpy(dst, src, in_channel * sizeof(float16_t));
}
}
return RET_OK;
}
int ConvolutionSWFP16CPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
int kernel_h = filter_tensor->Height();
int kernel_w = filter_tensor->Width();
int in_channel = filter_tensor->Channel();
int out_channel = filter_tensor->Batch();
conv_param_->input_channel_ = in_channel;
conv_param_->output_channel_ = out_channel;
int oc4 = UP_DIV(out_channel, C4NUM);
int ic4 = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_weight_ failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
auto ret = ProcessFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Process filter failed.";
return ret;
}
bias_data_ = malloc(oc4 * C4NUM * sizeof(float16_t));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_data_ failed.";
return RET_ERROR;
}
memset(bias_data_, 0, oc4 * C4NUM * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
} else {
MS_ASSERT(in_tensor_.size() == kInputSize1);
}
return RET_OK;
}
int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
int out_channel = conv_param_->output_channel_;
int oc4 = UP_DIV(out_channel, C4NUM);
int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
size_t nhwc4_input_size =
ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
if (nhwc4_input_ == nullptr) {
MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
return RET_ERROR;
}
tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
if (tmp_output_block_ == nullptr) {
MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
return RET_ERROR;
}
return RET_OK;
}
void ConvolutionSWFP16CPUKernel::ConfigInputOutput() {
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_format = input_tensor->GetFormat();
schema::Format execute_format = schema::Format::Format_NHWC4;
convert_func_ = LayoutTransformFp16(input_format, execute_format);
if (convert_func_ == nullptr) {
MS_LOG(ERROR) << "layout convert func is nullptr.";
return;
}
}
int ConvolutionSWFP16CPUKernel::Init() {
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
ConfigInputOutput();
return ReSize();
}
int ConvolutionSWFP16CPUKernel::ReSize() {
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}
if (slidingWindow_param_ != nullptr) {
delete slidingWindow_param_;
slidingWindow_param_ = nullptr;
}
ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
return ret;
}
// init sliding window param
slidingWindow_param_ = new (std::nothrow) SlidingWindowParam;
if (slidingWindow_param_ == nullptr) {
MS_LOG(ERROR) << "new SlidingWindowParam fail!";
return RET_ERROR;
}
InitSlidingParamConv(slidingWindow_param_, conv_param_, C4NUM);
return RET_OK;
}
int ConvolutionSWFP16CPUKernel::RunImpl(int task_id) {
ConvSWFp16(reinterpret_cast<float16_t *>(nhwc4_input_), packed_weight_, reinterpret_cast<float16_t *>(bias_data_),
tmp_output_block_, execute_output_, task_id, conv_param_, slidingWindow_param_);
return RET_OK;
}
static int ConvolutionSWFp16Impl(void *cdata, int task_id) {
auto conv = reinterpret_cast<ConvolutionSWFP16CPUKernel *>(cdata);
auto error_code = conv->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ConvolutionFp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionSWFP16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
return RET_ERROR;
}
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_channel = conv_param_->input_channel_;
convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);
int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionSWFp16Impl, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
FreeTmpBuffer();
return RET_ERROR;
}
// output nhwc4
int oc4_res = conv_param_->output_channel_ % C4NUM;
if (oc4_res != 0) {
PackNHWC4ToNHWCFp16(reinterpret_cast<const void *>(tmp_output_block_), reinterpret_cast<void *>(execute_output_),
conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_,
conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel
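
Note (not part of the diff): the entire fp16 sliding-window convolution kernel above is deleted. It staged the input and output in channel-padded NHWC4 layout and needed an extra PackNHWC4ToNHWCFp16 pass whenever output_channel was not a multiple of 4; the remaining im2col and winograd paths work on plain NHWC and avoid that staging. A small worked example of the padding overhead it carried, using UP_DIV and C4NUM as in the deleted code:

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // same definition as in nnacl/op_base.h

// For a 1 x 32 x 32 x 3 fp16 input:
// ic4 = UP_DIV(3, C4NUM) = 1, so the NHWC4 buffer holds 1 * 32 * 32 * ic4 * C4NUM = 4096
// elements instead of 1 * 32 * 32 * 3 = 3072 in plain NHWC, a 4/3 memory and copy overhead.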

@@ -1,72 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_SW_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_SW_FP16_H_
#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
namespace mindspore::kernel {
class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionSWFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionSWFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
}
if (slidingWindow_param_ != nullptr) {
delete slidingWindow_param_;
slidingWindow_param_ = nullptr;
}
}
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int InitWeightBias();
int InitTmpBuffer();
void ConfigInputOutput();
int ProcessFilter();
private:
void FreeTmpBuffer() {
if (nhwc4_input_ != nullptr) {
ctx_->allocator->Free(nhwc4_input_);
nhwc4_input_ = nullptr;
}
if (tmp_output_block_ != nullptr) {
ctx_->allocator->Free(tmp_output_block_);
tmp_output_block_ = nullptr;
}
}
float16_t *packed_weight_ = nullptr;
float16_t *tmp_output_block_ = nullptr;
SlidingWindowParam *slidingWindow_param_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_SW_FP16_H_

@@ -42,7 +42,6 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
free(trans_weight_);
trans_weight_ = nullptr;
}
FreeTransformMatrices();
}
int Init() override;
@@ -50,19 +49,12 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int Run() override;
int RunImpl(int task_id);
int InitWeightBias();
int MallocTransformMatrices();
void FreeTransformMatrices();
int InitTmpBuffer();
int ConfigInputOutput();
int PostProcess();
int WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
private:
void FreeTmpBuffer() {
if (nhwc4_input_ != nullptr) {
ctx_->allocator->Free(nhwc4_input_);
nhwc4_input_ = nullptr;
}
if (trans_input_ != nullptr) {
ctx_->allocator->Free(trans_input_);
trans_input_ = nullptr;
@@ -75,10 +67,6 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
ctx_->allocator->Free(gemm_out_);
gemm_out_ = nullptr;
}
if (tmp_out_data_ != nullptr) {
ctx_->allocator->Free(tmp_out_data_);
tmp_out_data_ = nullptr;
}
}
int kernel_unit_;
int input_unit_;
@@ -86,14 +74,10 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
float16_t *tmp_data_ = nullptr;
float16_t *trans_input_ = nullptr;
float16_t *gemm_out_ = nullptr;
float16_t *tmp_out_data_ = nullptr;
float16_t *matrix_a_ = nullptr;
float16_t *matrix_at_ = nullptr;
float16_t *matrix_b_ = nullptr;
float16_t *matrix_bt_ = nullptr;
float16_t *trans_weight_ = nullptr;
TmpBufferAddressFp16 tmp_buffer_address_list_[4];
MatricesFp16 matrices_[4];
TmpBufferAddressFp16 tmp_buffer_address_list_[3];
InputTransFp16Func in_func_;
OutputTransFp16Func out_func_;
};
} // namespace mindspore::kernel
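
Note (not part of the diff): the fp16 winograd kernel now stores the unit-specialized transform callbacks in_func_ / out_func_ instead of the four transform matrices, and its temporary-buffer list shrinks from four entries to three because the staging output is gone. The callbacks are specialized per tile size; for Winograd F(m x m, r x r) the tile size is fixed by input_unit = output_unit + kernel_unit - 1, e.g. F(4x4, 3x3) works on 6x6 input tiles:

// Illustrative helper, not a repository function.
static inline int WinogradInputUnit(int output_unit, int kernel_unit) {
  return output_unit + kernel_unit - 1;  // e.g. 4 + 3 - 1 = 6
}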

@@ -117,9 +117,8 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
conv_param_->output_channel_ = out_channel;
int oc4 = UP_DIV(out_channel, C4NUM);
int oc_block, oc_block_num;
oc_block = C8NUM;
oc_block_num = UP_DIV(out_channel, C8NUM);
const int oc_block = C8NUM;
int oc_block_num = UP_DIV(out_channel, C8NUM);
// set data
auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
@@ -172,9 +171,6 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
int channel_out = conv_param_->output_channel_;
int output_h = conv_param_->output_h_;
int output_w = conv_param_->output_w_;
int oc4 = UP_DIV(channel_out, C4NUM);
int oc8 = UP_DIV(channel_out, C8NUM);
int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
#ifdef ENABLE_ARM32
@@ -198,16 +194,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
return RET_ERROR;
}
int out_w_block = UP_DIV(output_w, output_unit_);
int out_h_block = UP_DIV(output_h, output_unit_);
tmp_out_data_ =
reinterpret_cast<float *>(ctx_->allocator->Malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
output_unit_ * output_unit_ * oc4 * C4NUM * sizeof(float)));
if (tmp_out_data_ == nullptr) {
MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
return RET_MEMORY_FAILED;
}
tmp_data_ = reinterpret_cast<float *>(
ctx_->allocator->Malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
if (tmp_data_ == nullptr) {
@@ -224,16 +210,12 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
tmp_buffer_address_list_[0] = trans_input_;
tmp_buffer_address_list_[1] = gemm_out_;
tmp_buffer_address_list_[2] = tmp_out_data_;
tmp_buffer_address_list_[3] = tmp_data_;
tmp_buffer_address_list_[4] = col_buffer_;
tmp_buffer_address_list_[2] = tmp_data_;
tmp_buffer_address_list_[3] = col_buffer_;
return RET_OK;
}
int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
auto output_tensor = out_tensors_.at(kOutputIndex);
output_tensor->SetFormat(schema::Format::Format_NHWC);
in_func_ = GetInputTransFunc(input_unit_);
if (in_func_ == nullptr) {
MS_LOG(ERROR) << "in_func_ is null.";

@@ -61,10 +61,6 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
ctx_->allocator->Free(gemm_out_);
gemm_out_ = nullptr;
}
if (tmp_out_data_ != nullptr) {
ctx_->allocator->Free(tmp_out_data_);
tmp_out_data_ = nullptr;
}
if (col_buffer_ != nullptr) {
ctx_->allocator->Free(col_buffer_);
col_buffer_ = nullptr;
@@ -76,10 +72,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
float *tmp_data_ = nullptr;
float *trans_input_ = nullptr;
float *gemm_out_ = nullptr;
float *tmp_out_data_ = nullptr;
float *col_buffer_ = nullptr;
float *trans_weight_ = nullptr;
TmpBufferAddress tmp_buffer_address_list_[5];
TmpBufferAddress tmp_buffer_address_list_[4];
InputTransFunc in_func_;
OutputTransFunc out_func_;
};
