[MSLITE][Develop]conv1x1 arm32 filter peroc

pull/6114/head
ling 4 years ago
parent a3cc26ffcc
commit 236c8de5da

@ -735,6 +735,16 @@ void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int
return;
}
void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
int32_t *multiplier, ConvParameter *conv_param) {
MatMulInt8_4x2_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
conv_param->conv_quant_arg_.filter_arg_num_ != 1);
return;
}
void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
int32_t *multiplier, ConvParameter *conv_param) {

@ -64,6 +64,9 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t
void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep4, int32_t *left_shift, int32_t *right_shift,
int32_t *multiplier, ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func);
void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
int32_t *multiplier, ConvParameter *conv_param);
// int8 convolution 3x3
void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,

@ -46,12 +46,12 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
int col16 = UP_ROUND(col, C16NUM);
for (int r = 0; r < row; r++) {
int rd4 = r / C2NUM;
int rm4 = r % C2NUM;
int rd2 = r / C2NUM;
int rm2 = r % C2NUM;
for (int c = 0; c < col; c++) {
int cd16 = c / C16NUM;
int cm16 = c % C16NUM;
int dst_index = rd4 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm4 * C16NUM + cm16;
int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16;
int src_index = r * col + c;
dst_ptr[dst_index] = src_ptr[src_index];
}
@ -232,6 +232,40 @@ void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
return;
}
void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool peroc) {
/* support per-layer && weight per-channel */
/* row4x16-major * row16x2-major => (int8)row-major*/
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r4div = r / C4NUM, r4mod = r % C4NUM;
int c2div = c / C2NUM, c2mod = c % C2NUM;
size_t ci = r * stride + c;
int32_t value = 0;
for (int d = 0; d < deep_16; d++) {
int d16div = d / C16NUM, d16mod = d % C16NUM;
size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
size_t bi = c2div * deep_16 * C2NUM + d16div * C2NUM * C16NUM + c2mod * C16NUM + d16mod;
value = value + a[ai] * b[bi];
}
int32_t cur_input_sum =
peroc ? input_sum[c2div * UP_ROUND(row, C4NUM) * C2NUM + r * C2NUM + c2mod] : input_sum[r];
value -= cur_input_sum;
value += bias[c];
int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0];
int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0];
int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0];
value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
value = MSMIN(maxi, value);
value = MSMAX(mini, value);
dst[ci] = (int8_t)value;
}
}
return;
}
void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,

@ -52,6 +52,10 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums
int stride);
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool peroc);
#ifdef ENABLE_ARM64
void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,

@ -404,14 +404,42 @@ void PackInputSum16x4PerChannel(const int8_t *input_value, int32_t *input_sum, i
return;
}
void PackInputSum16x4PerChannelArm32(const int8_t *input_value, int32_t *input_sum, int32_t *filter_zp_ptr,
size_t plane_size, size_t input_channel, size_t output_channel) {
size_t hw4 = UP_ROUND(plane_size, C4NUM);
size_t ic16 = UP_ROUND(input_channel, C16NUM);
for (int ri = 0; ri < plane_size; ri++) {
int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
for (int ci = 0; ci < output_channel; ci++) {
int32_t tmp_sum_value = 0;
int ci2div = ci / C2NUM, ci2mod = ci % C2NUM;
int32_t filter_zp = filter_zp_ptr[ci];
for (int di = 0; di < input_channel; di++) {
size_t di16div = di / C16NUM, di16mod = di % C16NUM;
int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod;
tmp_sum_value += input_value[src_index];
}
int dst_index = ci2div * C2NUM * hw4 + ri * C2NUM + ci2mod;
input_sum[dst_index] = tmp_sum_value * filter_zp;
}
}
return;
}
void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, int32_t *filter_zp, ConvParameter *conv_param) {
size_t hw4 = UP_ROUND(conv_param->input_h_ * conv_param->input_w_, C4NUM);
size_t ic16 = UP_ROUND(conv_param->input_channel_, C16NUM);
if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) {
PackInputSum16x4PerLayer(input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16);
} else {
#ifdef ENABLE_ARM32
PackInputSum16x4PerChannelArm32(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
conv_param->input_channel_, conv_param->output_channel_);
#else
PackInputSum16x4PerChannel(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_,
conv_param->input_channel_, conv_param->output_channel_);
#endif
}
return;
}

@ -175,7 +175,7 @@ int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memset(bias_data_, 0, size);
memset(bias_data_, 0, col2 * sizeof(int32_t));
if (in_tensors_.size() == 3) {
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
}
@ -249,16 +249,16 @@ int Convolution1x1Int8CPUKernel::InitParam() {
/* init input sum size */
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
input_sum_size = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
input_sum_size_ = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count);
} else {
input_sum_size = UP_ROUND(matmul_param_->row_, row_pack_count);
input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
}
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, row_pack_count));
thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, row_pack_count), thread_count_);
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_pack_count));
thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_pack_count), thread_count_);
thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, col_pack_count));
thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, col_pack_count), thread_count_hw_);
thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_pack_count));
thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, row_pack_count), thread_count_hw_);
if (pre_trans_input_) {
input_ptr_ = reinterpret_cast<int8_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
@ -269,7 +269,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t));
}
return RET_OK;
} // namespace mindspore::kernel
}
int Convolution1x1Int8CPUKernel::ReSize() {
FreeResizeBuf();
@ -314,10 +314,10 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
if (cur_oc <= 0) {
return RET_OK;
}
Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
Conv1x1Int8Arm32(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_,
cur_oc, matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
#else
if (support_optimize_) {
int cur_stride = thread_stride_ * C8NUM;
@ -392,7 +392,7 @@ int Convolution1x1Int8Impl(void *cdata, int task_id) {
}
int Convolution1x1Int8CPUKernel::InitRunBuf() {
input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size * sizeof(int32_t)));
input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size_ * sizeof(int32_t)));
if (input_sum_ == nullptr) {
MS_LOG(ERROR) << "malloc input_sum_ failed.";
return RET_ERROR;

@ -69,7 +69,7 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
size_t thread_count_hw_ = 1;
size_t thread_stride_hw_ = 0;
bool pre_trans_input_ = false;
size_t input_sum_size = 0;
size_t input_sum_size_ = 0;
MatMulParameter *matmul_param_ = nullptr;
MATMUL_OPT_R_FUNC matmul_func_ = nullptr;
bool support_optimize_ = false;

Loading…
Cancel
Save