!11105 [MS][LITE][CPU] dispatch convolution op through delegate

From: @fuzhiye
Reviewed-by: @hangangqiang, @zhang_xue_tong
Signed-off-by: @zhang_xue_tong
pull/11105/MERGE
mindspore-ci-bot committed via Gitee, 4 years ago
commit 3f03b83257
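
This change routes Conv2D kernel creation through a delegate: rather than building a concrete convolution kernel (1x1, Winograd, or sliding-window) up front, the framework creates a ConvolutionDelegateCPUKernel (FP32) or ConvolutionDelegateFP16CPUKernel (FP16) that snapshots the constant weight/bias data once, defers concrete-kernel selection until shapes are inferred, and forwards Run() to the chosen sub-kernel. The sketch below illustrates that intended control flow for the FP32 delegate; it is an illustration only, since the delegate's .cc file is not among the hunks shown here, and member names such as context_ and primitive_ are assumptions.

// Sketch only, not the committed implementation.
int ConvolutionDelegateCPUKernel::Init() {
  auto ret = GetWeightAndBias();  // copy const weight/bias, record what must be freed
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get weight and bias failed.";
    return ret;
  }
  if (!InferShapeDone()) {
    return RET_OK;  // the concrete kernel is created later, in ReSize()
  }
  return ReSize();
}

int ConvolutionDelegateCPUKernel::ReSize() {
  if (conv_kernel_ == nullptr) {
    // Shapes are known now: select 1x1 / winograd / sliding-window and hand it
    // the copied weight and bias pointers, which the sub-kernel must not free.
    conv_kernel_ = CpuConvFp32KernelSelect(in_tensors_, out_tensors_, op_parameter_,
                                           context_, primitive_, origin_weight_, origin_bias_);
    return conv_kernel_ == nullptr ? RET_ERROR : conv_kernel_->Init();
  }
  return conv_kernel_->ReSize();
}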

@@ -21,9 +21,6 @@
#include <limits.h>
#include "nnacl/op_base.h"
#define INPUT_ASYMMETRIC 0b001
#define FILTER_ASYMMETRIC 0b010
#define OUTPUT_ASYMMETRIC 0b100
#define INPUT_PER_CHANNEL 0b001
#define FILTER_PER_CHANNEL 0b010
#define OUTPUT_PER_CHANNEL 0b100

@@ -93,13 +93,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
auto bias_tensor = in_tensors_.at(kBiasIndex);
if (bias_tensor->data_type() == kNumberTypeFloat16) {
memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
} else {
Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_),
output_channel);
}
memcpy(bias_data_, fp16_bias_, output_channel * sizeof(float16_t));
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}
@@ -111,8 +105,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
weight_tensor->data_type() == kNumberTypeFloat16);
ColMajor2Row8MajorFp16(fp16_weight_, weight_ptr_, input_channel, output_channel, true);
return RET_OK;
}
@@ -127,10 +120,7 @@ int Convolution1x1FP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}
void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
@@ -143,7 +133,6 @@ void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
int Convolution1x1FP16CPUKernel::ReSize() {
FreeTmpBuffer();
auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";

@@ -30,8 +30,11 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~Convolution1x1FP16CPUKernel() override;
int Init() override;
@@ -53,6 +56,8 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 1;
int thread_stride_ = 0;
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *weight_ptr_ = nullptr;
float16_t *input_ptr_ = nullptr;
float16_t *pack_input_ = nullptr;

@@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"
#define WEIGHT_NEED_FREE 0b01
#define BIAS_NEED_FREE 0b10
namespace mindspore::kernel {
class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
public:
ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateFP16CPUKernel() override {
FreeCopiedData();
if (fp16_conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // set op_parameter of delegate to nullptr, avoiding double free
delete fp16_conv_kernel_;
fp16_conv_kernel_ = nullptr;
}
}
int GetFp16WeightAndBias();
int GetFp16Weight();
int GetFp16Bias();
float16_t *CopyData(lite::Tensor *tensor);
void FreeCopiedData();
int Init() override;
int ReSize() override;
int Run() override { return fp16_conv_kernel_->Run(); }
private:
uint8_t need_free_ = 0b00;
kernel::LiteKernel *fp16_conv_kernel_ = nullptr;
float16_t *fp16_weight_ = nullptr;
float16_t *fp16_bias_ = nullptr;
};
kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float16_t *fp16_weight, float16_t *fp16_bias);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
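The need_free_ bitmask records which of the two copied FP16 buffers the delegate owns; an original tensor that is already FP16 may be usable in place without a copy, in which case its bit stays clear. A plausible FreeCopiedData, consistent with the WEIGHT_NEED_FREE and BIAS_NEED_FREE flags declared above but not shown in this diff, would be:

void ConvolutionDelegateFP16CPUKernel::FreeCopiedData() {
  if (fp16_weight_ != nullptr && (need_free_ & WEIGHT_NEED_FREE) != 0) {
    free(fp16_weight_);  // only release buffers this delegate allocated
    fp16_weight_ = nullptr;
  }
  if (fp16_bias_ != nullptr && (need_free_ & BIAS_NEED_FREE) != 0) {
    free(fp16_bias_);
    fp16_bias_ = nullptr;
  }
}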

@@ -27,13 +27,11 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight, float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
@@ -58,6 +56,8 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
col_major_input_ = nullptr;
}
}
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *packed_input_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *col_major_input_ = nullptr;

@@ -43,12 +43,6 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
int oc_block_num = UP_DIV(out_channel, C8NUM);
// init weight
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}
// set data
auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float16_t);
trans_weight_ = reinterpret_cast<float16_t *>(malloc(trans_matrix_data_size));
@@ -68,21 +62,17 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
if (input_unit_ == 8) {
coef = 0.5f;
}
ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
auto ret =
CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransformFp16(fp16_origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
execute_weight_ = nullptr;
}
// init bias
bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));
@@ -93,10 +83,7 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
memcpy(fp16_bias_data, fp16_bias_, out_channel * sizeof(float16_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -163,15 +150,13 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}
int ConvolutionWinogradFP16CPUKernel::ReSize() {
@@ -180,17 +165,11 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}
ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
}
kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;
ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";

@@ -31,13 +31,13 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int out_unit)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(out_unit) {}
const mindspore::lite::PrimitiveC *primitive, int out_unit, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(out_unit),
fp16_origin_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionWinogradFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (trans_weight_ != nullptr) {
free(trans_weight_);
trans_weight_ = nullptr;
@@ -75,6 +75,8 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float16_t *fp16_origin_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *tmp_data_ = nullptr;
float16_t *trans_input_ = nullptr;
float16_t *gemm_out_ = nullptr;

@@ -87,11 +87,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -101,11 +98,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_[i]->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -139,7 +133,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
// input may either be float32 or float16
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = group_convs_.at(group_id)->in_tensors().front()->data_c();
@@ -179,7 +174,8 @@ int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
void GroupConvolutionFP16CPUKernel::PostConcat(int group_id) {
// output must be float16 data type
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float16_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());

@@ -31,6 +31,33 @@ using mindspore::schema::PrimitiveType_Adder;
using mindspore::schema::Format::Format_NHWC;
namespace mindspore::kernel {
int AdderCPUKernel::Init() {
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int AdderCPUKernel::ReSize() {
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}
ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return ret;
}
return RET_OK;
}
int AdderCPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
int kernel_h = filter_tensor->Height();

@@ -29,10 +29,12 @@ class AdderCPUKernel : public ConvolutionCPUKernel {
AdderCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive, nullptr, nullptr) {}
~AdderCPUKernel() override = default;
int InitWeightBias() override;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id) override;
};

@@ -44,10 +44,13 @@ void Convolution1x1CPUKernel::FreeTmpBuffer() {
int Convolution1x1CPUKernel::ReSize() {
FreeTmpBuffer();
ConvolutionBaseCPUKernel::Init();
auto error_code = ConvolutionBaseCPUKernel::Init();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv base init failed.";
return error_code;
}
InitConv1x1MatmulParam();
int error_code = InitConv1x1Param();
error_code = InitConv1x1Param();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
return error_code;
@@ -95,7 +98,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
memcpy(bias_data_, origin_bias_, weight_size);
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}
@@ -108,14 +111,11 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
#ifdef ENABLE_AVX
RowMajor2Col16Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col16Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#elif defined(ENABLE_ARM32)
RowMajor2Col4Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col4Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#else
RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col8Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#endif
return RET_OK;
}
@@ -153,13 +153,10 @@ int Convolution1x1CPUKernel::Init() {
}
int error_code = InitConv1x1BiasWeight();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
MS_LOG(ERROR) << "Convolution1x1 init weight and bias failed.";
return error_code;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}
void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {

@@ -34,8 +34,10 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
public:
Convolution1x1CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~Convolution1x1CPUKernel();
int Init() override;
int Run() override;
@@ -58,6 +60,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 0;
int thread_stride_ = 0;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *weight_ptr_ = nullptr;
float *pack_input_ = nullptr;
float *input_ptr_ = nullptr;

@@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
#include <vector>
#include "src/ops/conv2d.h"
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"
using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class ConvolutionDelegateCPUKernel : public LiteKernel {
public:
ConvolutionDelegateCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateCPUKernel() override {
FreeCopiedData();
if (conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // op_parameter will be freed in conv_kernel
delete conv_kernel_;
conv_kernel_ = nullptr;
}
}
int Init() override;
int ReSize() override;
int Run() override { return conv_kernel_->Run(); }
int GetWeightAndBias();
int GetWeightData();
int GetBiasData();
static float *CopyData(lite::Tensor *tensor);
void FreeCopiedData();
protected:
bool need_free_weight_ = false;
bool need_free_bias_ = false;
kernel::LiteKernel *conv_kernel_ = nullptr;
float *origin_weight_ = nullptr;
float *origin_bias_ = nullptr;
};
void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
const InnerContext *ctx);
void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);
ConvParameter *CreateNewConvParameter(ConvParameter *parameter);
lite::Tensor *CreateInputTensor(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag);
lite::Tensor *CreateOutputTensor(const std::vector<int> &out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);
kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float *origin_weight, float *origin_bias);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
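CopyData is the single point where the delegate takes a private snapshot of a constant tensor, which lets every sub-kernel treat origin_weight_ and origin_bias_ as stable read-only pointers (hence the "do not free" comments on those members). A minimal sketch, assuming lite::Tensor::Size() returns the byte size and data_c() the raw data pointer:

float *ConvolutionDelegateCPUKernel::CopyData(lite::Tensor *tensor) {
  auto copied = reinterpret_cast<float *>(malloc(tensor->Size()));
  if (copied == nullptr) {
    MS_LOG(ERROR) << "Malloc copied data failed.";
    return nullptr;
  }
  memcpy(copied, tensor->data_c(), tensor->Size());
  return copied;  // the caller records that this buffer must be freed later
}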

@@ -28,8 +28,10 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionCPUKernel() override {
if (packed_weight_ != nullptr) {
free(packed_weight_);
@@ -57,20 +59,12 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
}
protected:
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *packed_weight_ = nullptr;
float *packed_input_ = nullptr;
float *col_major_input_ = nullptr;
};
void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);
ConvParameter *CreateNewConvParameter(ConvParameter *parameter);
lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag);
lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_

@@ -81,8 +81,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
auto weight_data = reinterpret_cast<float *>(filter_tensor->MutableData());
ret = WinogradFilterTransform(weight_data, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
@@ -97,8 +96,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
}
memset(bias_data_, 0, new_bias_size);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias_addr = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
memcpy(bias_data_, ori_bias_addr, out_channel * sizeof(float));
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -171,10 +169,7 @@ int ConvolutionWinogradCPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}
int ConvolutionWinogradCPUKernel::ReSize() {
@@ -183,18 +178,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}
ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
MS_LOG(ERROR) << "conv base init failed.";
return ret;
}
kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;
ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";

@@ -28,10 +28,12 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int output_unit)
const mindspore::lite::PrimitiveC *primitive, int output_unit, float *origin_weight,
float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(output_unit),
trans_weight_(nullptr) {}
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionWinogradCPUKernel() override {
if (trans_weight_ != nullptr) {
free(trans_weight_);
@@ -69,6 +71,8 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *tmp_data_ = nullptr;
float *trans_input_ = nullptr;
float *gemm_out_ = nullptr;

@@ -92,11 +92,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -106,11 +103,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_.at(i)->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -143,7 +137,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
}
void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = reinterpret_cast<float *>(group_convs_.at(group_id)->in_tensors().front()->data_c());
@@ -157,7 +152,8 @@ void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
}
void GroupConvolutionCPUKernel::PostConcat(int group_id) {
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float *>(group_convs_.at(group_id)->out_tensors().front()->data_c());

@@ -19,8 +19,7 @@
#include "nnacl/int8/conv_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"

@@ -139,146 +139,4 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
EXPECT_EQ(0, CompareOutputData(out, correct, 54));
delete conv_param;
}
int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715,
13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352,
6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505};
memcpy(in_t->MutableData(), in, sizeof(float) * 24);
inputs_->push_back(in_t);
auto *weight_t = new lite::Tensor(kNumberTypeFloat, {3, 1, 1, 4}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695,
1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; /* nhwc */
memcpy(weight_t->MutableData(), weight, sizeof(float) * 12);
inputs_->push_back(weight_t);
auto *bias_t = new lite::Tensor(kNumberTypeFloat, {3}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
float bias[] = {2, 2, 2};
memcpy(bias_t->MutableData(), bias, sizeof(float) * 3);
inputs_->push_back(bias_t);
auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 3}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);
*correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
float co[] = {2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.3731456, 1.6877825, 12.427691, 2., 2., 2.};
memcpy(*correct, co, out_t->ElementsNum() * sizeof(float));
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 1;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}
TEST_F(TestConv1x1Fp32, Conv1x1Test1) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 1;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit1(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);
conv1x1->Init();
conv1x1->Run();
ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
int Conv1x1TestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
size_t buffer_size;
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 24}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
std::string input_path = "./conv/conv1x1fp32_input1_nhwc.bin";
auto in = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &buffer_size));
memcpy(in_t->MutableData(), in, buffer_size);
inputs_->push_back(in_t);
auto *weight_t = new lite::Tensor(kNumberTypeFloat, {40, 1, 1, 24}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
std::string weight_path = "./conv/conv1x1fp32_weight1_nhwc.bin";
auto weight = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size));
memcpy(weight_t->MutableData(), weight, buffer_size);
inputs_->push_back(weight_t);
auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
std::string bias_path = "./conv/conv1x1fp32_bias1_nhwc.bin";
auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size);
memcpy(bias_t->MutableData(), bias, buffer_size);
inputs_->push_back(bias_t);
auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 40}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);
std::string out_path = "./conv/conv1x1fp32_output1_nhwc.bin";
auto out_nhwc = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size);
*correct = reinterpret_cast<float *>(malloc(buffer_size));
memcpy(*correct, out_nhwc, buffer_size);
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 0;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}
TEST_F(TestConv1x1Fp32, Conv1x1Test2) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 2;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit2(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);
conv1x1->Init();
conv1x1->Run();
ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
/* running warm up */
for (int i = 0; i < 0; i++) {
conv1x1->Run();
}
/* running time cost */
int loop_count = 1;
auto time_start = mindspore::lite::GetTimeUs();
for (int i = 0; i < loop_count; i++) {
conv1x1->Run();
}
auto time_end = mindspore::lite::GetTimeUs();
auto cost = time_end - time_start;
uint64_t time_avg = cost / loop_count;
printf("1x1 average time : %f ms\n", time_avg / 1000.0f);
delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
} // namespace mindspore
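
The tests deleted above constructed Convolution1x1CPUKernel through its old five-argument constructor, which this commit removes. A test written against the new seven-argument signature would also pass the raw weight and bias pointers, along the lines of this hypothetical snippet:

auto *weight = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->MutableData());
auto *bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->MutableData());
auto *conv1x1 = new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param),
                                                    inputs_, outputs_, ctx, nullptr, weight, bias);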
