diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c
index 504ab0670f..91ac6874cf 100644
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -307,8 +307,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
       }
       // step 4 : output transform
-      WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
-                              cal_num, out_tile_index, out_w_block, conv_param, output_trans_func);
+      WinogradOutputTransform(dst_ptr, tmp_out_data + tmp_out_batch_offset, bias_data, cal_num, out_tile_index,
+                              out_w_block, conv_param, output_trans_func);
     }
   }
 }
@@ -449,8 +449,8 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
 }
 
 // fp32 conv3x3
-void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
+void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
+                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -461,6 +461,7 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
   int output_count = out_w_block * out_h_block;
   int output_tile_count = UP_DIV(output_count, C12NUM);
   const int input_unit_square = 4 * 4;
+
   float *tile_buffer = buffer_list[0];
   float *block_unit_buffer = buffer_list[1];
   float *tmp_dst_buffer = buffer_list[2];
@@ -491,8 +492,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
         MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
                   ic4 * C4NUM, real_cal_num, oc8 * C8NUM, input_unit_square, 2);
       }
-      Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset,
-                                 bias_data, start_index, real_cal_num, out_w_block, conv_param);
+      Conv3x3Fp32OutputTransform(dst_ptr, nc4hw4_out + nc4hw4_buffer_offset, bias_data, start_index, real_cal_num,
+                                 out_w_block, conv_param);
     }
   }
 }
diff --git a/mindspore/lite/nnacl/fp32/conv.h b/mindspore/lite/nnacl/fp32/conv.h
index a08f661140..3ea865d6b2 100644
--- a/mindspore/lite/nnacl/fp32/conv.h
+++ b/mindspore/lite/nnacl/fp32/conv.h
@@ -65,8 +65,8 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
                                int output_unit);
 
 // fp32 conv3x3
-void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
+void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
+                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h
index 58b46a57c8..6154dedb74 100644
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -61,6 +61,6 @@ typedef struct OpParameter {
   int thread_num_;
 } OpParameter;
 
-typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
+typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6, ActType_Prelu } ActType;
 
 #endif  // MINDSPORE_LITE_NNACL_OP_BASE_H_
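NOTE: the two transform-call changes above are argument cleanups, not behavior changes: each loop body already derives a per-thread destination pointer, presumably along the lines of

    float *dst_ptr = gemm_out + task_id * gemm_out_offset;              /* in ConvWinogardFp32 */
    float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset;  /* in Conv3x3Fp32 */

(the definitions sit outside the hunks shown, so the exact expressions are inferred from the removed arguments), and the calls now reuse it instead of recomputing the same offset. ActType_Prelu is appended at the end of the enum, so the existing ActType values keep their numeric encoding; any switch over act_type_ without a Prelu case lands in its default error branch.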
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 1a5e1c7d5d..0cf7e52ef9 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -761,6 +761,8 @@ void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane,
   }
 }
 
+void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel) {}
+
 void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
diff --git a/mindspore/lite/nnacl/pack.h b/mindspore/lite/nnacl/pack.h
index d380bbf5cb..715d9f1da7 100644
--- a/mindspore/lite/nnacl/pack.h
+++ b/mindspore/lite/nnacl/pack.h
@@ -81,6 +81,8 @@ void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane,
 
 void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel);
 
+void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel);
+
 void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel);
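NOTE: PackNC4HW4ToNHWCPreluFp32 lands here as an empty stub plus a declaration; the patch stops short of implementing it. A minimal sketch of what the body might look like, modeled on the neighboring NC4HW4-to-NHWC unpack routines (the per-channel slope layout is an assumption this patch does not specify):

    /* Sketch only: unpack NC4HW4 to NHWC, applying PReLU on the way out.
     * UP_DIV and C4NUM are the macros pack.c already uses (nnacl/op_base.h). */
    void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel) {
      int c4 = UP_DIV(channel, C4NUM);
      const float *src_p = (const float *)src;
      const float *slope_p = (const float *)slope;
      float *dst_p = (float *)dst;
      for (int b = 0; b < batch; b++) {
        const float *batch_src = src_p + b * plane * c4 * C4NUM;
        float *batch_dst = dst_p + b * plane * channel;
        for (int k = 0; k < plane; k++) {
          for (int c = 0; c < channel; c++) {
            /* NC4HW4 element (k, c): channel block c / C4NUM, spatial index k, lane c % C4NUM. */
            float v = batch_src[(c / C4NUM) * plane * C4NUM + k * C4NUM + (c % C4NUM)];
            /* PReLU: negative values are scaled by a learned per-channel slope. */
            batch_dst[k * channel + c] = v >= 0.0f ? v : v * slope_p[c];
          }
        }
      }
    }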
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index 8d11db725c..8c9343ba3f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -207,6 +207,28 @@ static int Convolution3x3Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void
   return RET_OK;
 }
 
+int Convolution3x3FP16CPUKernel::PostProcess() {
+  auto act_type = conv_param_->act_type_;
+  switch (act_type) {
+    case ActType_No:
+      UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                          conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    case ActType_Relu:
+      UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                              conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    case ActType_Relu6:
+      UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                               conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type.";
+      return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int Convolution3x3FP16CPUKernel::Run() {
   auto ret = Prepare();
   if (ret != RET_OK) {
@@ -236,20 +258,11 @@ int Convolution3x3FP16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  // get real output
-  bool relu = conv_param_->act_type_ == ActType_Relu;
-  bool relu6 = conv_param_->act_type_ == ActType_Relu6;
-  if (relu) {
-    UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                            conv_param_->output_w_, conv_param_->output_channel_);
-  } else if (relu6) {
-    UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                             conv_param_->output_w_, conv_param_->output_channel_);
-  } else {
-    UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                        conv_param_->output_w_, conv_param_->output_channel_);
+  ret = PostProcess();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Post process failed.";
+    return ret;
   }
-
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   FreeTmpBuffer();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
index e14ebc6f99..b1ff6bdc2d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
@@ -52,6 +52,7 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int InitWeightBias();
   int InitTmpBuffer();
   void ConfigInputOutput();
+  int PostProcess();
 
  private:
   void FreeTmpBuffer() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index ef2235b289..ff547040fa 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -358,6 +358,28 @@ static int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv,
   return RET_OK;
 }
 
+int ConvolutionWinogradFP16CPUKernel::PostProcess() {
+  auto act_type = conv_param_->act_type_;
+  switch (act_type) {
+    case ActType_No:
+      UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    case ActType_Relu:
+      UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                                   conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    case ActType_Relu6:
+      UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                                    conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type.";
+      return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int ConvolutionWinogradFP16CPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
@@ -390,16 +412,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  // get real output
-  if (conv_param_->act_type_ == ActType_Relu) {
-    UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                                 conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else if (conv_param_->act_type_ == ActType_Relu6) {
-    UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                                  conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else {
-    UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+  ret = PostProcess();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Post process failed.";
+    return ret;
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index d9dc03611e..e259b240a5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -56,6 +56,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();
+  int PostProcess();
 
  private:
   void FreeTmpBuffer() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index a0387e0165..fbc15eb739 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -207,9 +207,8 @@ int Convolution3x3CPUKernel::RunImpl(int task_id) {
     MS_LOG(ERROR) << "gemm_func is nullptr.";
     return RET_ERROR;
   }
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
+              tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
 
@@ -223,6 +222,29 @@ int Convolution3x3Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
   return RET_OK;
 }
 
+int Convolution3x3CPUKernel::PostProcess() {
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
+  auto act_type = conv_param_->act_type_;
+  switch (act_type) {
+    case ActType_No:
+      PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                           conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    case ActType_Relu:
+      PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                               conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    case ActType_Relu6:
+      PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                                conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type.";
+      return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int Convolution3x3CPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
@@ -247,18 +269,10 @@ int Convolution3x3CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto is_relu = conv_param_->act_type_ == ActType_Relu;
-  auto is_relu6 = conv_param_->act_type_ == ActType_Relu6;
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  if (is_relu) {
-    PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
-                             conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-  } else if (is_relu6) {
-    PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
-                              conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-  } else {
-    PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
-                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  ret = PostProcess();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Post process failed.";
+    return ret;
   }
   FreeTmpBuffer();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index acae077896..6c4349dda1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -45,6 +45,7 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   int InitWeightBias();
   int InitTmpBuffer();
   void ConfigInputOutput();
+  int PostProcess();
 
  private:
   void FreeTmpBuffer() {
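NOTE: once the pack routine sketched earlier has a real body, wiring ActType_Prelu into Convolution3x3CPUKernel::PostProcess() is one more case in the switch. The slope pointer is the missing piece: this patch does not plumb the PReLU weight through the kernel, so slope_data_ below is a hypothetical member, not an existing field:

    case ActType_Prelu:
      /* Hypothetical: slope_data_ would have to be cached from the PReLU slope tensor. */
      PackNC4HW4ToNHWCPreluFp32(nc4hw4_out_, output_addr, slope_data_, conv_param_->output_batch_,
                                conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
      break;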
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index 2262566480..ad5596d052 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -353,18 +353,43 @@ int ConvolutionWinogradImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata
   return RET_OK;
 }
 
+int ConvolutionWinogradCPUKernel::PostProcess() {
+  auto out_tensor = out_tensors_.front();
+  auto out_data = reinterpret_cast<float *>(out_tensor->Data());
+  auto act_type = conv_param_->act_type_;
+  switch (act_type) {
+    case ActType_No:
+      UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                           conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    case ActType_Relu:
+      UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    case ActType_Relu6:
+      UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                                conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type.";
+      return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int ConvolutionWinogradCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  // malloc tmp buffer
+
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
     return RET_ERROR;
   }
+
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
   PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
@@ -377,18 +402,10 @@ int ConvolutionWinogradCPUKernel::Run() {
     return RET_ERROR;
   }
 
-  // get real output
-  auto out_tensor = out_tensors_.front();
-  auto out_data = reinterpret_cast<float *>(out_tensor->Data());
-  if (conv_param_->act_type_ == ActType_Relu) {
-    UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else if (conv_param_->act_type_ == ActType_Relu6) {
-    UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                              conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else {
-    UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                         conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+  ret = PostProcess();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Post process failed.";
+    return ret;
   }
   FreeTmpBuffer();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index 584dcde4ef..5181808457 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -51,6 +51,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();
+  int PostProcess();
 
  private:
   void FreeTmpBuffer() {
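NOTE: Conv3x3Fp32 no longer takes an output pointer; the kernel now leaves its result in the nc4hw4_out_ scratch buffer and converts it to NHWC in PostProcess(). Any external caller migrates by dropping the old fourth argument (illustrative argument names):

    /* before this patch */
    Conv3x3Fp32(input, weights, bias, output, buffers, task_id, conv_param, gemm_func);
    /* after this patch */
    Conv3x3Fp32(input, weights, bias, buffers, task_id, conv_param, gemm_func);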