|
|
|
@ -29,66 +29,67 @@ using mindspore::lite::RET_OK;
|
|
|
|
|
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
|
|
|
|
|
|
|
|
|
|
namespace mindspore::kernel {
|
|
|
|
|
// Destructor: delegates all cleanup to FreeTmpBuffer().
ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); }
|
|
|
|
|
|
|
|
|
|
void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
|
|
|
|
|
ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
|
|
|
|
|
if (sliding_ != nullptr) {
|
|
|
|
|
delete sliding_;
|
|
|
|
|
sliding_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (packed_weight_ != nullptr) {
|
|
|
|
|
delete packed_weight_;
|
|
|
|
|
packed_weight_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (packed_input_ != nullptr) {
|
|
|
|
|
delete packed_input_;
|
|
|
|
|
packed_input_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (packed_output_ != nullptr) {
|
|
|
|
|
delete packed_output_;
|
|
|
|
|
packed_output_ = nullptr;
|
|
|
|
|
FreeTmpBuffer();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
|
|
|
|
|
if (need_align_) {
|
|
|
|
|
if (packed_input_ != nullptr) {
|
|
|
|
|
delete packed_input_;
|
|
|
|
|
packed_input_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (packed_output_ != nullptr) {
|
|
|
|
|
delete packed_output_;
|
|
|
|
|
packed_output_ = nullptr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
|
|
|
|
|
// malloc pack input buffer
|
|
|
|
|
int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
|
|
|
|
|
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
|
|
|
|
|
packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
|
|
|
|
|
if (packed_input_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
|
|
|
|
|
if (conv_param_->input_channel_ % C4NUM != 0) {
|
|
|
|
|
need_align_ = true;
|
|
|
|
|
int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
|
|
|
|
|
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
|
|
|
|
|
packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
|
|
|
|
|
if (packed_input_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// malloc pack output buffer
|
|
|
|
|
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
|
|
|
|
|
packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
|
|
|
|
|
if (packed_output_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
|
|
|
|
|
packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
|
|
|
|
|
if (packed_output_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return RET_OK;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
|
|
|
|
|
// init weight: o, h, w, i; o == group, i == 1
|
|
|
|
|
int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
|
|
|
|
|
auto weight_tensor = in_tensors_[kWeightIndex];
|
|
|
|
|
int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
|
|
|
|
|
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
|
|
|
|
|
int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
|
|
|
|
|
int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
|
|
|
|
|
|
|
|
|
|
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
|
|
|
|
|
if (packed_weight_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
|
|
|
|
|
PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
|
|
|
|
|
conv_param_->output_channel_);
|
|
|
|
|
PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
|
|
|
|
|
weight_tensor->Batch());
|
|
|
|
|
|
|
|
|
|
// init bias
|
|
|
|
|
bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
|
|
|
|
|
if (bias_data_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Malloc buffer failed.";
|
|
|
|
@ -97,8 +98,9 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
|
|
|
|
|
memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
|
|
|
|
|
auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
|
|
|
|
|
if (in_tensors_.size() == kInputSize2) {
|
|
|
|
|
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
|
|
|
|
|
for (int i = 0; i < conv_param_->output_channel_; i++) {
|
|
|
|
|
auto bias_tensor = in_tensors_.at(kBiasIndex);
|
|
|
|
|
auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
|
|
|
|
|
for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
|
|
|
|
|
bias_fp16[i] = (float16_t)ori_bias[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -108,6 +110,18 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ConvolutionDepthwiseFp16CPUKernel::Init() {
|
|
|
|
|
sliding_ = new (std::nothrow) SlidingWindowParam;
|
|
|
|
|
if (sliding_ == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "new sliding window param failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto ret = InitWeightBias();
|
|
|
|
|
if (ret != 0) {
|
|
|
|
|
MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!InferShapeDone()) {
|
|
|
|
|
return RET_OK;
|
|
|
|
|
}
|
|
|
|
@ -116,21 +130,12 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
|
|
|
|
|
|
|
|
|
|
int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
|
|
|
|
|
FreeTmpBuffer();
|
|
|
|
|
// conv base init
|
|
|
|
|
auto ret = ConvolutionBaseCPUKernel::Init();
|
|
|
|
|
if (ret != RET_OK) {
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
// init sliding_ window param
|
|
|
|
|
sliding_ = new SlidingWindowParam;
|
|
|
|
|
InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
|
|
|
|
|
|
|
|
|
|
ret = InitWeightBias();
|
|
|
|
|
if (ret != 0) {
|
|
|
|
|
MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = InitBuffer();
|
|
|
|
|
if (ret != 0) {
|
|
|
|
|
MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
|
|
|
|
@ -171,19 +176,25 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
|
|
|
|
|
MS_LOG(ERROR) << "Get Execute tensor failed.";
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
// pack input: to nhwc8
|
|
|
|
|
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
|
|
|
|
|
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
|
|
|
|
|
if (need_align_) {
|
|
|
|
|
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
|
|
|
|
|
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
|
|
|
|
|
} else {
|
|
|
|
|
packed_input_ = execute_input_;
|
|
|
|
|
}
|
|
|
|
|
if (!need_align_) {
|
|
|
|
|
packed_output_ = execute_output_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
|
|
|
|
|
if (ret != RET_OK) {
|
|
|
|
|
MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
|
|
|
|
|
return RET_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
|
|
|
|
|
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
|
|
|
|
|
|
|
|
|
|
if (need_align_) {
|
|
|
|
|
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
|
|
|
|
|
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
|
|
|
|
|
}
|
|
|
|
|
ConvolutionBaseFP16CPUKernel::IfCastOutput();
|
|
|
|
|
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
|
|
|
|
|
return RET_OK;
|
|
|
|
|