optimize int8 reduce

pull/8277/head
fuzhiye 4 years ago
parent afd58ab5d8
commit 35acaabc02

@@ -20,6 +20,160 @@
#include "nnacl/quantization/fixed_point.h"
#include "nnacl/common_func.h"
int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
int32_t bias) {
int stride = plane * UP_ROUND(c, C4NUM);
for (int batch = 0; batch < n; ++batch) {
int8_t *in_ptr = in_data + batch * stride;
int8_t *out_ptr = out_data + batch * c;
for (int i = 0; i < count; ++i) {
int32_t sum_array = 0;
int j = 0;
#ifdef ENABLE_ARM64
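// Widening horizontal sums: consume 16, then 8, then 4 int8 values per step; the scalar loop after #endif picks up any remainder.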
for (; j + 16 <= plane; j += 16) {
int8x16_t in_data_vec = vld1q_s8(in_ptr);
sum_array += vaddlvq_s8(in_data_vec);
in_ptr += 16;
}
for (; j + 8 <= plane; j += 8) {
int8x8_t in_data_vec = vld1_s8(in_ptr);
sum_array += vaddlv_s8(in_data_vec);
in_ptr += 8;
}
for (; j + 4 <= plane; j += 4) {
int32x4_t in_data_vec;
in_data_vec[0] = in_ptr[0];
in_data_vec[1] = in_ptr[1];
in_data_vec[2] = in_ptr[2];
in_data_vec[3] = in_ptr[3];
sum_array += vaddvq_s32(in_data_vec);
in_ptr += 4;
}
#elif ENABLE_ARM32
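// ARMv7 NEON has no across-vector widening add, so each int8 is widened into a 32-bit lane of accum; the four lanes are summed after the loops.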
int32x4_t accum = vmovq_n_s32(0);
for (; j + 16 <= plane; j += 16) {
int32x4_t in_data_vec1;
int32x4_t in_data_vec2;
int32x4_t in_data_vec3;
int32x4_t in_data_vec4;
in_data_vec1[0] = in_ptr[0];
in_data_vec1[1] = in_ptr[1];
in_data_vec1[2] = in_ptr[2];
in_data_vec1[3] = in_ptr[3];
in_data_vec2[0] = in_ptr[4];
in_data_vec2[1] = in_ptr[5];
in_data_vec2[2] = in_ptr[6];
in_data_vec2[3] = in_ptr[7];
in_data_vec3[0] = in_ptr[8];
in_data_vec3[1] = in_ptr[9];
in_data_vec3[2] = in_ptr[10];
in_data_vec3[3] = in_ptr[11];
in_data_vec4[0] = in_ptr[12];
in_data_vec4[1] = in_ptr[13];
in_data_vec4[2] = in_ptr[14];
in_data_vec4[3] = in_ptr[15];
accum = vaddq_s32(accum, in_data_vec1);
accum = vaddq_s32(accum, in_data_vec2);
accum = vaddq_s32(accum, in_data_vec3);
accum = vaddq_s32(accum, in_data_vec4);
in_ptr += 16;
}
for (; j + 8 <= plane; j += 8) {
int32x4_t in_data_vec1;
int32x4_t in_data_vec2;
in_data_vec1[0] = in_ptr[0];
in_data_vec1[1] = in_ptr[1];
in_data_vec1[2] = in_ptr[2];
in_data_vec1[3] = in_ptr[3];
in_data_vec2[0] = in_ptr[4];
in_data_vec2[1] = in_ptr[5];
in_data_vec2[2] = in_ptr[6];
in_data_vec2[3] = in_ptr[7];
accum = vaddq_s32(accum, in_data_vec1);
accum = vaddq_s32(accum, in_data_vec2);
in_ptr += 8;
}
for (; j + 4 <= plane; j += 4) {
int32x4_t in_data_vec;
in_data_vec[0] = in_ptr[0];
in_data_vec[1] = in_ptr[1];
in_data_vec[2] = in_ptr[2];
in_data_vec[3] = in_ptr[3];
accum = vaddq_s32(accum, in_data_vec);
in_ptr += 4;
}
sum_array += accum[0];
sum_array += accum[1];
sum_array += accum[2];
sum_array += accum[3];
#endif
for (; j < plane; j++) {
sum_array += in_ptr[0];
in_ptr++;
}
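// Rescale the int32 sum by the precomputed fixed-point factor (the mean divisor folded into multiplier/shifts), add the bias term, and saturate to the int8 range.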
int32_t mean =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_array * (1 << (unsigned int)quant_arg.left_shift_),
quant_arg.multiplier_),
quant_arg.right_shift_);
mean += bias;
mean = MSMIN(mean, INT8_MAX);
mean = MSMAX(mean, INT8_MIN);
out_ptr[0] = mean;
out_ptr++;
}
}
return NNACL_OK;
}
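/*
 * Illustrative standalone sketch (not part of the kernel): the rescale in ReduceMeanHW multiplies the
 * int32 sum by a precomputed fixed-point factor. This shows one way a (multiplier_, left_shift_,
 * right_shift_) triple can be derived for a real factor such as 1/(h*w) and how it is applied. The
 * helpers are local gemmlowp-style reimplementations; the actual NNACL quantization routines may
 * differ, and the bias term is omitted.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
/* Rounding doubling high multiply: round(a * b / 2^31); the INT32_MIN * INT32_MIN corner is ignored. */
static int32_t SatRoundDoublingHighMul(int32_t a, int32_t b) {
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
  return (int32_t)((ab + nudge) >> 31);
}
/* Rounding (half away from zero) arithmetic right shift by `exponent`. */
static int32_t RoundDivByPOT(int32_t x, int exponent) {
  int32_t mask = (int32_t)((1LL << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}
/* Split a real factor in (0, 1] into a Q31 multiplier and left/right shifts. */
static void QuantizeMeanFactor(double real, int32_t *multiplier, int *left_shift, int *right_shift) {
  int shift = 0;
  double q = frexp(real, &shift); /* real = q * 2^shift with q in [0.5, 1) */
  int64_t m = (int64_t)round(q * (double)(1LL << 31));
  if (m == (1LL << 31)) { /* q rounded up to 1.0: renormalize */
    m /= 2;
    shift += 1;
  }
  *multiplier = (int32_t)m;
  *left_shift = shift > 0 ? shift : 0;
  *right_shift = shift > 0 ? 0 : -shift;
}
int main(void) {
  const int h = 7, w = 7;
  const int32_t sum = h * w * 25; /* as if every int8 input of the H*W plane were 25 */
  int32_t multiplier = 0;
  int left_shift = 0, right_shift = 0;
  QuantizeMeanFactor(1.0 / (h * w), &multiplier, &left_shift, &right_shift);
  int32_t mean = RoundDivByPOT(SatRoundDoublingHighMul(sum * (1 << (unsigned int)left_shift), multiplier),
                               right_shift);
  printf("mean = %d (expected 25)\n", mean);
  return 0;
}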
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
// Get x such that (x-zp_in) * scale_in = mean
// Assuming n axes are reduced, this handles the first n-1 reductions; one call performs one reduction.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -21,6 +21,23 @@
extern "C" {
#endif
int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
int32_t bias);
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -929,13 +929,144 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
return;
}
void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel) {
for (int n = 0; n < batch; n++) {
for (int c = 0; c < channel; c++) {
for (int hw = 0; hw < plane; hw++) {
int nhwc_index = n * channel * plane + hw * channel + c;
int nchw_index = n * channel * plane + c * plane + hw;
((int8_t *)dst)[nchw_index] = ((int8_t *)src)[nhwc_index];
void PackNHWCToNCHWInt8(const void *src, void *dst, int batches, int plane, int channel) {
int hw8 = plane / C8NUM * C8NUM;
int c8 = channel / C8NUM * C8NUM;
int batch = plane * channel;
for (int n = 0; n < batches; n++) {
const int8_t *src_batch = (const int8_t *)src + n * batch;
int8_t *dst_batch = (int8_t *)dst + n * batch;
int hw = 0;
for (; hw < hw8; hw += C8NUM) {
int c = 0;
for (; c < c8; c += C8NUM) {
const int8_t *src_ptr = src_batch + hw * channel + c;
int8_t *dst_ptr = dst_batch + c * plane + hw;
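// Transpose one 8x8 int8 tile: rows are read with stride `channel` (NHWC) and written with stride `plane` (NCHW).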
#ifdef ENABLE_ARM64
size_t srcStride = channel * sizeof(int8_t);
size_t dstStride = plane * sizeof(int8_t);
asm volatile(
"mov x10, %[src_ptr]\n"
"mov x11, %[dst_ptr]\n"
"ld1 {v0.8b}, [x10], %[srcStride]\n"
"ld1 {v1.8b}, [x10], %[srcStride]\n"
"ld1 {v2.8b}, [x10], %[srcStride]\n"
"ld1 {v3.8b}, [x10], %[srcStride]\n"
"trn1 v4.8b, v0.8b, v1.8b\n"
"trn2 v5.8b, v0.8b, v1.8b\n"
"trn1 v6.8b, v2.8b, v3.8b\n"
"trn2 v7.8b, v2.8b, v3.8b\n"
"ld1 {v0.8b}, [x10], %[srcStride]\n"
"ld1 {v1.8b}, [x10], %[srcStride]\n"
"ld1 {v2.8b}, [x10], %[srcStride]\n"
"ld1 {v3.8b}, [x10], %[srcStride]\n"
"trn1 v8.4h, v4.4h, v6.4h\n"
"trn2 v9.4h, v4.4h, v6.4h\n"
"trn1 v10.4h, v5.4h, v7.4h\n"
"trn2 v11.4h, v5.4h, v7.4h\n"
"trn1 v4.8b, v0.8b, v1.8b\n"
"trn2 v5.8b, v0.8b, v1.8b\n"
"trn1 v6.8b, v2.8b, v3.8b\n"
"trn2 v7.8b, v2.8b, v3.8b\n"
"trn1 v12.4h, v4.4h, v6.4h\n"
"trn2 v13.4h, v4.4h, v6.4h\n"
"trn1 v14.4h, v5.4h, v7.4h\n"
"trn2 v15.4h, v5.4h, v7.4h\n"
"trn1 v0.2s, v8.2s, v12.2s\n"
"trn2 v4.2s, v8.2s, v12.2s\n"
"trn1 v1.2s, v10.2s, v14.2s\n"
"trn2 v5.2s, v10.2s, v14.2s\n"
"trn1 v2.2s, v9.2s, v13.2s\n"
"trn2 v6.2s, v9.2s, v13.2s\n"
"trn1 v3.2s, v11.2s, v15.2s\n"
"trn2 v7.2s, v11.2s, v15.2s\n"
"st1 {v0.8b}, [x11], %[dstStride]\n"
"st1 {v1.8b}, [x11], %[dstStride]\n"
"st1 {v2.8b}, [x11], %[dstStride]\n"
"st1 {v3.8b}, [x11], %[dstStride]\n"
"st1 {v4.8b}, [x11], %[dstStride]\n"
"st1 {v5.8b}, [x11], %[dstStride]\n"
"st1 {v6.8b}, [x11], %[dstStride]\n"
"st1 {v7.8b}, [x11], %[dstStride]\n"
:
:
[ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31");
#elif ENABLE_ARM32
size_t srcStride = channel * sizeof(int8_t);
size_t dstStride = plane * sizeof(int8_t);
asm volatile(
"mov r10, %[src_ptr]\n"
"mov r12, %[dst_ptr]\n"
"vld1.8 {d0}, [r10], %[srcStride]\n"
"vld1.8 {d1}, [r10], %[srcStride]\n"
"vld1.8 {d2}, [r10], %[srcStride]\n"
"vld1.8 {d3}, [r10], %[srcStride]\n"
"vld1.8 {d4}, [r10], %[srcStride]\n"
"vld1.8 {d5}, [r10], %[srcStride]\n"
"vld1.8 {d6}, [r10], %[srcStride]\n"
"vld1.8 {d7}, [r10], %[srcStride]\n"
"vtrn.8 d0, d1\n"
"vtrn.8 d2, d3\n"
"vtrn.8 d4, d5\n"
"vtrn.8 d6, d7\n"
"vtrn.16 d0, d2\n"
"vtrn.16 d1, d3\n"
"vtrn.16 d4, d6\n"
"vtrn.16 d5, d7\n"
"vtrn.32 d0, d4\n"
"vtrn.32 d1, d5\n"
"vtrn.32 d2, d6\n"
"vtrn.32 d3, d7\n"
"vst1.8 {d0}, [r12], %[dstStride]\n"
"vst1.8 {d1}, [r12], %[dstStride]\n"
"vst1.8 {d2}, [r12], %[dstStride]\n"
"vst1.8 {d3}, [r12], %[dstStride]\n"
"vst1.8 {d4}, [r12], %[dstStride]\n"
"vst1.8 {d5}, [r12], %[dstStride]\n"
"vst1.8 {d6}, [r12], %[dstStride]\n"
"vst1.8 {d7}, [r12], %[dstStride]\n"
:
:
[ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
: "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
"q15");
#else
for (int tr = 0; tr < C8NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
}
}
#endif
}
for (; c < channel; c++) {
const int8_t *src_ptr = src_batch + hw * channel + c;
int8_t *dst_ptr = dst_batch + c * plane + hw;
for (size_t i = 0; i < C8NUM; i++) {
dst_ptr[i] = src_ptr[i * channel];
}
}
}
for (; hw < plane; hw++) {
const int8_t *src_ptr = src_batch + hw * channel;
int8_t *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
dst_ptr[i * plane] = src_ptr[i];
}
}
}
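/*
 * Illustrative standalone sanity check (not part of the library): compares PackNHWCToNCHWInt8 above
 * against a naive triple loop on a small tensor whose plane/channel sizes are deliberately not
 * multiples of 8, so the tiled path, the channel tail, and the plane tail are all exercised. The
 * include path below is an assumption; adjust it to wherever PackNHWCToNCHWInt8 is declared.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "nnacl/pack.h" /* assumed header for PackNHWCToNCHWInt8 */
int main(void) {
  enum { kBatches = 2, kPlane = 9, kChannel = 10 };
  int8_t src[kBatches * kPlane * kChannel];
  int8_t got[kBatches * kPlane * kChannel];
  int8_t want[kBatches * kPlane * kChannel];
  for (int i = 0; i < kBatches * kPlane * kChannel; ++i) {
    src[i] = (int8_t)(i * 7); /* arbitrary pattern that wraps around int8 */
  }
  /* Naive NHWC -> NCHW reference: dst[n][c][hw] = src[n][hw][c]. */
  for (int n = 0; n < kBatches; ++n) {
    for (int c = 0; c < kChannel; ++c) {
      for (int hw = 0; hw < kPlane; ++hw) {
        want[n * kChannel * kPlane + c * kPlane + hw] = src[n * kChannel * kPlane + hw * kChannel + c];
      }
    }
  }
  PackNHWCToNCHWInt8(src, got, kBatches, kPlane, kChannel);
  printf("%s\n", memcmp(got, want, sizeof(got)) == 0 ? "match" : "MISMATCH");
  return 0;
}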

@@ -108,7 +108,6 @@ int ReduceBaseCPUKernel::Init() {
}
mode_ = reduce_param->mode_;
memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));
reduce_to_end_ = reduce_param->reduce_to_end_;
auto ret = CheckInputsOutputs();

(File diff suppressed because it is too large.)

@@ -28,6 +28,7 @@
using mindspore::schema::ReduceMode;
namespace mindspore::kernel {
enum Four_DIMENSION_REDUCE_TEMPLATE { N, H, W, C, NH, NW, NC, HW, HC, WC, NHW, NHC, NWC, HWC, NHWC };
class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
@@ -38,7 +39,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive), ctx_(ctx) {}
~ReduceInt8CPUKernel() {
for (auto qm : mean_multipliers_) {
delete qm;
@@ -59,6 +60,8 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
int Init() override;
int ReSize() override;
int Run() override;
int Fast4DReduceMeanHWImpl();
int Reduce4DExecute(int task_id);
int CallReduceUnit(int task_id);
int ReduceLastAxis(int task_id);
@@ -68,22 +71,31 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
private:
int MallocTmpBuffer();
void FreeTmpBuffer();
void Match4DReducePattern();
void OneAxis();
void TwoAxes();
void ThreeAxes();
void ReduceMean4DCalQuantParam();
int CalculateQuantArgs();
void GetQuantArgs(size_t i);
private:
ReduceParameter *param_ = nullptr;
ReduceQuantArg quant_arg_;
int8_t *nchw_in_data_ = nullptr;
int32_t bias_;
private:
const lite::InnerContext *ctx_;
int32_t *begin_src_data_ = nullptr;
int8_t *last_dst_data_ = nullptr;
std::vector<int32_t *> data_buffers_;
const int32_t *src_data_ = nullptr;
int32_t *dst_data_ = nullptr;
bool valid_shape_ = false;
bool pattern_impl_ = false;
Four_DIMENSION_REDUCE_TEMPLATE pattern_;
QuantMulArg reduce_mean_quant_param_; // used in reduce mean 4D situation
Reducer reducer_ = nullptr;
LastReducer last_reducer_ = nullptr;
std::vector<QuantMulArg *> mean_multipliers_;
