!10148 [ms][lite][cpu] x86 depthwise modify

From: @lzkcode
Reviewed-by: @zhang_xue_tong,@zhanghaibo5
Signed-off-by: @zhang_xue_tong
pull/10148/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit b859dbdc3e

@ -587,10 +587,16 @@ bool CheckConvDwUseIndirectBuffer(const ConvParameter *conv_param) {
void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, const ConvParameter *conv_param,
int step_h, int step_w) {
int ic_4 = UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM;
#ifdef ENABLE_AVX
int div = C8NUM;
#else
int div = C4NUM;
#endif
int ic_div = UP_DIV(conv_param->input_channel_, div) * div;
for (int b = 0; b < conv_param->output_batch_; b++) {
float **indirect = indirect_buffer + b * conv_param->output_h_ * step_h;
float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_4;
float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_div;
for (int oh = 0; oh < conv_param->output_h_; oh++) {
for (int kh = 0; kh < conv_param->kernel_h_; kh++) {
int ih = oh * conv_param->stride_h_ + kh * conv_param->dilation_h_ - conv_param->pad_u_;
@ -600,7 +606,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
int iw = ow * conv_param->stride_w_ + kw * conv_param->dilation_w_ - conv_param->pad_l_;
int index = oh * step_h + ow * step_w * conv_param->kernel_h_ + kw * conv_param->kernel_h_ + kh;
if (iw < conv_param->input_w_ && iw >= 0) {
indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_4;
indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_div;
} else {
indirect[index] = zero_ptr;
}
@ -619,7 +625,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr,
}
}
#ifndef ENABLE_ARM64
#if !defined(ENABLE_ARM64) && !defined(ENABLE_AVX)
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, int input_stride, bool relu, bool relu6, int kernel) {
do {
@ -674,6 +680,15 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
}
#endif
#ifdef ENABLE_AVX
// AVX dispatch for one output row of the indirect-buffer depthwise conv.
// Only the 9-tap (3x3) case has an AVX implementation (ConvDwFp32Avx3x3);
// any other kernel size is a no-op here — presumably guaranteed not to reach
// this path by the indirect-buffer eligibility check (TODO confirm).
// input_stride counts float* entries; the asm kernel expects bytes.
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                           int output_width, int input_stride, bool relu, bool relu6, int kernel) {
  if (kernel != 9) {
    return;
  }
  size_t stride_bytes = input_stride * sizeof(float *);
  ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, stride_bytes, relu, relu6);
}
#endif
void ConvDwIndirection(float *output_data, float **indirect_buffer, const float *weight_data, const float *bias_data,
float *zero_ptr, const ConvParameter *conv_param, int task_id) {
int step_w = conv_param->dilation_w_ == 1 ? conv_param->stride_w_ : conv_param->kernel_w_;

@ -66,6 +66,11 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c
int output_width, size_t input_stride, size_t relu, size_t relu6);
#endif
#ifdef ENABLE_AVX
void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, size_t input_stride, size_t relu, size_t relu6);
#endif
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
int output_width, int input_stride, bool relu, bool relu6, int kernel);

@ -500,6 +500,30 @@ void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int c
}
}
/* Packs NHWC float data into NHWC8 layout: the channel dimension is padded
 * up to a multiple of C8NUM and the padding lanes are zero-filled.
 * When channel is already a multiple of C8NUM the two layouts coincide,
 * so the whole tensor is copied with a single memcpy. */
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  const float *src_f = (const float *)src;
  float *dst_f = (float *)dst;
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  if (channel % C8NUM == 0) {
    /* Fast path: no padding lanes, layouts are identical. */
    memcpy(dst_f, src_f, (size_t)batch * plane * channel * sizeof(float));
    return;
  }
  for (int b = 0; b < batch; b++) {
    const float *src_batch = src_f + b * plane * channel;
    float *dst_batch = dst_f + b * plane * c8_channel;
    for (int p = 0; p < plane; p++) {
      float *dst_plane = dst_batch + p * c8_channel;
      memcpy(dst_plane, src_batch + p * channel, channel * sizeof(float));
      /* Zero the channel tail so padded lanes contribute nothing downstream. */
      for (int c = channel; c < c8_channel; ++c) {
        dst_plane[c] = 0;
      }
    }
  }
}
void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) {
int c4 = UP_DIV(channel, C4NUM);
int ic_remainder_ = channel % C4NUM;
@ -600,6 +624,23 @@ void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, i
}
}
/* Packs depthwise weights from channel-major HxW planes into the C8-blocked,
 * kw/kh-interleaved layout consumed by the AVX indirect depthwise kernel:
 *   dst index = block * C8NUM*H*W + kw * H*C8NUM + kh * C8NUM + lane
 * so for a fixed (kw, kh) the 8 channel lanes of a block are contiguous.
 *
 * Fix vs. the original: when channel %% C8NUM != 0 the original read `src`
 * for channel indices >= channel — an out-of-bounds read (undefined
 * behavior) that also copied garbage into the padding lanes of `dst`.
 * Tail lanes are now zero-filled instead; valid lanes are packed exactly as
 * before, and zero weights in padding lanes leave kernel results unchanged
 * (the corresponding packed-input lanes are zero-padded as well). */
void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel) {
  const float *src_f = (const float *)src;
  float *dst_f = (float *)dst;
  int c8 = UP_DIV(channel, C8NUM);
  for (int c = 0; c < c8; c++) {
    int dst_off_c = c * C8NUM * height * width;
    for (int i = 0; i < C8NUM; i++) {
      int src_channel = c * C8NUM + i;
      int src_off_c = src_channel * height * width;
      for (int kh = 0; kh < height; kh++) {
        int src_off_kh = src_off_c + kh * width;
        for (int kw = 0; kw < width; kw++) {
          int dst_off = dst_off_c + kw * height * C8NUM + kh * C8NUM + i;
          /* Guard the channel tail: never read past the end of src. */
          dst_f[dst_off] = (src_channel < channel) ? src_f[src_off_kh + kw] : 0.0f;
        }
      }
    }
  }
}
void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) {
int c4 = UP_DIV(channel, C4NUM);
int c4_channel = c4 * C4NUM;

@ -64,6 +64,8 @@ void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel);
@ -80,6 +82,8 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int
void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, int width, int channel);
void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel);
void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWC4ToNHWCInt8(const void *src, void *dst, int batch, int plane, int channel);

@ -147,7 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *>
conv_param->input_channel_ = inputs[kInputIndex]->Channel();
conv_param->output_h_ = outputs[kOutputIndex]->Height();
conv_param->output_w_ = outputs[kOutputIndex]->Width();
#ifdef ENABLE_ARM64
#if defined(ENABLE_ARM64) || defined(ENABLE_AVX)
if (CheckConvDwUseIndirectBuffer(conv_param)) {
kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive);

@ -47,37 +47,47 @@ int ConvolutionDepthwiseIndirectCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
int C4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * C4 * weight_tensor->Height() * weight_tensor->Width();
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
#ifdef ENABLE_AVX
PackDepthwiseIndirectWeightC8Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
weight_tensor->Batch());
#else
PackDepthwiseIndirectWeightC4Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(),
weight_tensor->Batch());
#endif
auto bias_tensor = in_tensors_[kBiasIndex];
bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
bias_data_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * C4 * sizeof(float));
memset(bias_data_, 0, batch_flag * div_flag * sizeof(float));
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float));
}
// malloc zero ptr
zero_ptr_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float)));
zero_ptr_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float)));
if (zero_ptr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(zero_ptr_, 0, C4NUM * C4 * sizeof(float));
memset(zero_ptr_, 0, batch_flag * div_flag * sizeof(float));
return RET_OK;
}
@ -139,8 +149,13 @@ int ConvDwIndirectRun(void *cdata, int task_id) {
}
int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
int IC_DIV = UP_DIV(conv_param_->input_channel_, div_flag);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * div_flag * IC_DIV;
packed_input_ = reinterpret_cast<float *>(context_->allocator->Malloc(pack_input_size * sizeof(float)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
@ -152,14 +167,24 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() {
int ConvolutionDepthwiseIndirectCPUKernel::Run() {
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_ptr = reinterpret_cast<float *>(input_tensor->data_c());
if (conv_param_->input_channel_ % C4NUM != 0) {
#ifdef ENABLE_AVX
int div_flag = C8NUM;
#else
int div_flag = C4NUM;
#endif
if (conv_param_->input_channel_ % div_flag != 0) {
auto ret = MallocPackedInput();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp32 indirect buffer MallocPackedInput failed.";
return RET_ERROR;
}
#ifdef ENABLE_AVX
PackNHWCToNHWC8Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
#else
PackNHWCToNHWC4Fp32(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
#endif
} else {
packed_input_ = input_ptr;
}
@ -174,7 +199,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
if (conv_param_->input_channel_ % div_flag != 0) {
context_->allocator->Free(packed_input_);
}
return RET_OK;

Loading…
Cancel
Save