!8452 [MS][LITE][CPU] optimize int8 SpaceToBatch

From: @fuzhiye
Reviewed-by: @zhang_xue_tong,@hangangqiang
Signed-off-by: @zhang_xue_tong
pull/8452/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 696e6d959e

@ -20,6 +20,7 @@
typedef struct SpaceToBatchParameter {
OpParameter op_parameter_;
bool need_paddings_;
int m_;
int block_sizes_[4];
int paddings_[4];
int input_shape_[4];

@ -46,47 +46,40 @@ void DoSpaceToBatchNHWCInt8(const int8_t *input, int8_t *output, const int *bloc
}
}
// Fused SpaceToBatch + padding for NHWC int8 data, writing directly into the
// output tensor (no intermediate padded buffer).
//
// For each output batch index i:
//   - the source input batch is i % in_b;
//   - i / in_b encodes the (offset_h, offset_w) position inside the block grid
//     (block_shape_h x block_shape_w).
// Each output pixel (j, k) maps back to padded-input coordinates
// (j * block_shape_h + offset_h, k * block_shape_w + offset_w); coordinates
// that fall inside the top/left padding (pad_t / pad_l) or past the real input
// extent are filled with the quantization zero point `zp`, otherwise the
// channel vector is copied from the input.
//
// When param->m_ < 2 only the H dimension is blocked: block_shape_w and pad_l
// degenerate to 1 and 0 respectively.
//
// NOTE(review): the diff residue interleaved the old stride-walk implementation
// with this one (duplicate in_c/out_c/ped_* declarations and stray pad loops),
// which did not compile; this is the clean post-commit body.
void DoSpaceToBatchPaddingNHWCInt8(const int8_t *input, int8_t *output, SpaceToBatchParameter *param, int32_t zp) {
  int *in_shape = param->input_shape_;
  int *out_shape = param->output_shape_;
  int *paddings = param->paddings_;
  int block_shape_h = param->block_sizes_[0];
  int block_shape_w = param->m_ == 2 ? param->block_sizes_[1] : 1;
  int in_b = in_shape[0];
  int in_h = in_shape[1];
  int in_w = in_shape[2];
  int channel = in_shape[3];
  int out_h = out_shape[1];
  int out_w = out_shape[2];
  int pad_t = paddings[0];
  int pad_l = param->m_ == 2 ? paddings[2] : 0;
  for (int i = 0; i < out_shape[0]; ++i) {
    int in_batch = i % in_b;
    int offset_w = (i / in_b) % block_shape_w;
    int offset_h = (i / in_b) / block_shape_w;
    int in_b_offset = in_batch * in_h * in_w * channel;
    int out_b_offset = i * out_h * out_w * channel;
    for (int j = 0; j < out_h; ++j) {
      int out_h_offset = out_b_offset + j * out_w * channel;
      for (int k = 0; k < out_w; ++k) {
        int8_t *out_ptr = output + out_h_offset + k * channel;
        int index_h = j * block_shape_h + offset_h;
        int index_w = k * block_shape_w + offset_w;
        if (index_h < pad_t || index_h >= (pad_t + in_h) || index_w < pad_l || index_w >= (pad_l + in_w)) {
          // Padded region: fill the whole channel vector with the zero point.
          memset(out_ptr, zp, channel * sizeof(int8_t));
        } else {
          int in_plane_offset = in_b_offset + ((index_h - pad_t) * in_w + (index_w - pad_l)) * channel;
          const int8_t *in_ptr = input + in_plane_offset;
          memcpy(out_ptr, in_ptr, channel * sizeof(int8_t));
        }
      }
    }
  }
}

@ -17,14 +17,14 @@
#define MINDSPORE_LITE_NNACL_INT8_SPACE_TO_BATCH_INT8_H_
#include "nnacl/op_base.h"
#include "nnacl/fp32/space_to_batch.h"
#ifdef __cplusplus
extern "C" {
#endif
void DoSpaceToBatchNHWCInt8(const int8_t *input, int8_t *output, const int *block_sizes, const int *in_shape,
const int *out_shape);
void DoSpaceToBatchPaddingNHWCInt8(const int8_t *input, int8_t *output, const int *in_shape, const int *padding,
const int *out_shape, int32_t zp);
void DoSpaceToBatchPaddingNHWCInt8(const int8_t *input, int8_t *output, SpaceToBatchParameter *param, int32_t zp);
#ifdef __cplusplus
}
#endif

@ -31,6 +31,7 @@ OpParameter *PopulateSpaceToBatchNDParameter(const mindspore::lite::PrimitiveC *
space_batch_param_nd->op_parameter_.type_ = primitive->Type();
auto block_sizes = ((mindspore::lite::SpaceToBatchND *)primitive)->GetBlockShape();
space_batch_param_nd->m_ = block_sizes.size();
memcpy(space_batch_param_nd->block_sizes_, (block_sizes.data()), block_sizes.size() * sizeof(int));
auto paddings = ((mindspore::lite::SpaceToBatchND *)primitive)->GetPaddings();
memcpy(space_batch_param_nd->paddings_, (paddings.data()), paddings.size() * sizeof(int));

@ -33,6 +33,7 @@ OpParameter *PopulateSpaceToBatchParameter(const mindspore::lite::PrimitiveC *pr
memset(space_batch_param, 0, sizeof(SpaceToBatchParameter));
space_batch_param->op_parameter_.type_ = primitive->Type();
auto block_sizes = ((mindspore::lite::SpaceToBatch *)primitive)->BlockSizes();
space_batch_param->m_ = block_sizes.size();
memcpy(space_batch_param->block_sizes_, (block_sizes.data()), block_sizes.size() * sizeof(int));
auto paddings = ((mindspore::lite::SpaceToBatch *)primitive)->Paddings();
memcpy(space_batch_param->paddings_, (paddings.data()), paddings.size() * sizeof(int));

@ -80,8 +80,6 @@ Registry SpaceToBatchRegistry(schema::PrimitiveType_SpaceToBatch, SpaceToBatchCr
namespace {
constexpr int kSpaceToBatchNDOutputNum = 1;
constexpr int kSpaceToBatchNDInputNum = 1;
constexpr int kBlockSizesSize = 2;
constexpr int kPaddingsSize = 4;
} // namespace
int SpaceToBatch::InferShape(std::vector<lite::Tensor *> inputs, std::vector<lite::Tensor *> outputs) {
@ -103,20 +101,13 @@ int SpaceToBatch::InferShape(std::vector<lite::Tensor *> inputs, std::vector<lit
}
auto input_shape = input->shape();
if (input_shape.size() != kDimension_4d) {
MS_LOG(ERROR) << "input shape dimension size should == " << kDimension_4d;
return 1;
}
if (GetBlockShape().size() != kBlockSizesSize) {
MS_LOG(ERROR) << "Block shape size should be " << kBlockSizesSize;
return 1;
}
if (GetPaddings().size() != kPaddingsSize) {
MS_LOG(ERROR) << "Crops size should be " << kPaddingsSize;
return 1;
MS_LOG(ERROR) << "Space_to_batch op only support 4D input currently. But got %d dimensionality input."
<< kDimension_4d;
return RET_ERROR;
}
for (int &iter : GetBlockShape()) {
auto block_shape_vector = GetBlockShape();
for (int &iter : block_shape_vector) {
block_sizes_.emplace_back(iter);
}
@ -125,7 +116,8 @@ int SpaceToBatch::InferShape(std::vector<lite::Tensor *> inputs, std::vector<lit
paddings_.clear();
in_shape_.emplace_back(input_shape.at(NHWC_N));
padded_in_shape_.emplace_back(input_shape.at(NHWC_N));
for (int i = 0; i < kBlockSizesSize; i++) {
auto block_shape_size = block_shape_vector.size();
for (size_t i = 0; i < block_shape_size; i++) {
in_shape_.emplace_back(input_shape.at(i + 1));
padded_in_shape_.emplace_back(input_shape.at(i + 1) + (paddings_.at(2 * i) + paddings_.at(2 * i + 1)));
paddings_.emplace_back(paddings_.at(2 * i));
@ -137,11 +129,19 @@ int SpaceToBatch::InferShape(std::vector<lite::Tensor *> inputs, std::vector<lit
}
in_shape_.emplace_back(input_shape.at(NHWC_C));
padded_in_shape_.emplace_back(input_shape.at(NHWC_C));
int padding_left = 0;
int padding_right = 0;
int block_w = 1;
if (block_shape_size == 2) {
padding_left = paddings_[2];
padding_right = paddings_[3];
block_w = block_sizes_[1];
}
std::vector<int32_t> output_shape(input_shape.size());
output_shape[NHWC_N] = input_shape[NHWC_N] * (block_sizes_[NHWC_N] * block_sizes_[NHWC_H]);
output_shape[NHWC_H] = (input_shape[NHWC_H] + paddings_[0] + paddings_[1]) / block_sizes_[NHWC_N];
output_shape[NHWC_W] = (input_shape[NHWC_W] + paddings_[2] + paddings_[3]) / block_sizes_[NHWC_H];
output_shape[NHWC_N] = input_shape[NHWC_N] * (block_sizes_[0] * block_w);
output_shape[NHWC_H] = (input_shape[NHWC_H] + paddings_[0] + paddings_[1]) / block_sizes_[0];
output_shape[NHWC_W] = (input_shape[NHWC_W] + padding_left + padding_right) / block_w;
output_shape[NHWC_C] = input_shape[NHWC_C];
outputs[0]->set_shape(output_shape);
return RET_OK;

@ -26,8 +26,6 @@ namespace lite {
namespace {
constexpr int kSpaceToBatchNDOutputNum = 1;
constexpr int kSpaceToBatchNDInputNum = 1;
constexpr int kBlockSizesSize = 2;
constexpr int kPaddingsSize = 4;
} // namespace
#ifdef PRIMITIVE_WRITEABLE
@ -109,20 +107,19 @@ int SpaceToBatchND::InferShape(std::vector<lite::Tensor *> inputs, std::vector<l
return RET_ERROR;
}
auto block_shape = GetBlockShape();
if (block_shape.size() != kBlockSizesSize) {
MS_LOG(ERROR) << "blockShape size != " << kBlockSizesSize;
return RET_ERROR;
}
auto pedding = GetPaddings();
if (pedding.size() != kPaddingsSize) {
MS_LOG(ERROR) << "pedding size should be " << kPaddingsSize;
return RET_ERROR;
auto padding = GetPaddings();
int padding_left = 0;
int padding_right = 0;
int block_w = 1;
if (block_shape.size() == 2) {
padding_left = padding[2];
padding_right = padding[3];
block_w = block_shape[1];
}
std::vector<int32_t> output_shape(input_shape.size());
output_shape[NHWC_N] = input_shape[NHWC_N] * block_shape[0] * block_shape[1];
output_shape[NHWC_H] = (input_shape[NHWC_H] + pedding[0] + pedding[1]) / block_shape[0];
output_shape[NHWC_W] = (input_shape[NHWC_W] + pedding[2] + pedding[3]) / block_shape[1];
output_shape[NHWC_N] = input_shape[NHWC_N] * block_shape[0] * block_w;
output_shape[NHWC_H] = (input_shape[NHWC_H] + padding[0] + padding[1]) / block_shape[0];
output_shape[NHWC_W] = (input_shape[NHWC_W] + padding_left + padding_right) / block_w;
output_shape[NHWC_C] = input_shape[NHWC_C];
outputs[0]->set_shape(output_shape);
return RET_OK;

@ -54,9 +54,15 @@ int SpaceToBatchCPUKernel::ReSize() {
}
}
if (param->need_paddings_) {
int padding_left = 0;
int padding_right = 0;
if (param->m_ == 2) {
padding_left = param->paddings_[2];
padding_right = param->paddings_[3];
}
param->padded_in_shape_[kNHWC_N] = input_tensor->shape().at(kNHWC_N);
param->padded_in_shape_[kNHWC_H] = input_tensor->shape().at(kNHWC_H) + param->paddings_[0] + param->paddings_[1];
param->padded_in_shape_[kNHWC_W] = input_tensor->shape().at(kNHWC_W) + param->paddings_[2] + param->paddings_[3];
param->padded_in_shape_[kNHWC_W] = input_tensor->shape().at(kNHWC_W) + padding_left + padding_right;
param->padded_in_shape_[kNHWC_C] = input_tensor->shape().at(kNHWC_C);
param->padded_input_element_num = param->padded_in_shape_[kNHWC_N] * param->padded_in_shape_[kNHWC_H] *
param->padded_in_shape_[kNHWC_W] * param->padded_in_shape_[kNHWC_C];

@ -38,17 +38,7 @@ int SpaceToBatchInt8CPUKernel::Run() {
auto quant_arg = output_tensor->GetQuantParams().front();
if (param->need_paddings_) {
padded_input_ = context_->allocator->Malloc(param->padded_input_element_num * sizeof(int8_t));
if (padded_input_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
return RET_ERROR;
}
auto padded_input = reinterpret_cast<int8_t *>(padded_input_);
DoSpaceToBatchPaddingNHWCInt8(input_ptr, padded_input, param->input_shape_, param->paddings_,
param->padded_in_shape_, quant_arg.zeroPoint);
DoSpaceToBatchNHWCInt8(padded_input, output_ptr, param->block_sizes_, param->padded_in_shape_,
param->output_shape_);
FreeTmpBuffer();
DoSpaceToBatchPaddingNHWCInt8(input_ptr, output_ptr, param, quant_arg.zeroPoint);
} else {
DoSpaceToBatchNHWCInt8(input_ptr, output_ptr, param->block_sizes_, param->input_shape_, param->output_shape_);
}

@ -34,7 +34,7 @@ TEST_F(SpaceToBatchTestInt8, test1) {
std::vector<lite::Tensor *> inputs = {&in_tensor};
std::vector<lite::Tensor *> outputs = {&out_tensor};
SpaceToBatchParameter parameter = {{}, false, {2, 2}, {1, 1, 1, 1}};
SpaceToBatchParameter parameter = {{}, false, 2, {2, 2}, {1, 1, 1, 1}};
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_SpaceToBatchND};
auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);

Loading…
Cancel
Save