diff --git a/mindspore/lite/nnacl/int8/reduce_int8.c b/mindspore/lite/nnacl/int8/reduce_int8.c
index 80553f8fe8..aa7e129594 100644
--- a/mindspore/lite/nnacl/int8/reduce_int8.c
+++ b/mindspore/lite/nnacl/int8/reduce_int8.c
@@ -20,6 +20,160 @@
 #include "nnacl/quantization/fixed_point.h"
 #include "nnacl/common_func.h"
 
+int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+// Mean over H*W for `count` channel planes of an NCHW-packed int8 buffer.
+// `in_data` already points at the first channel plane owned by the calling task.
+int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
+                 int32_t bias) {
+  int stride = plane * c;  // PackNHWCToNCHWInt8 packs densely, so one batch is plane * c bytes
+  for (int batch = 0; batch < n; ++batch) {
+    int8_t *in_ptr = in_data + batch * stride;
+    int8_t *out_ptr = out_data + batch * c;
+    for (int i = 0; i < count; ++i) {
+      int32_t sum_array = 0;
+      int j = 0;
+#ifdef ENABLE_ARM64
+      for (; j + 16 <= plane; j += 16) {
+        int8x16_t in_data_vec = vld1q_s8(in_ptr);
+        sum_array += vaddlvq_s8(in_data_vec);
+        in_ptr += 16;
+      }
+      for (; j + 8 <= plane; j += 8) {
+        int8x8_t in_data_vec = vld1_s8(in_ptr);
+        sum_array += vaddlv_s8(in_data_vec);
+        in_ptr += 8;
+      }
+      for (; j + 4 <= plane; j += 4) {
+        int32x4_t in_data_vec;
+        in_data_vec[0] = in_ptr[0];
+        in_data_vec[1] = in_ptr[1];
+        in_data_vec[2] = in_ptr[2];
+        in_data_vec[3] = in_ptr[3];
+        sum_array += vaddvq_s32(in_data_vec);
+        in_ptr += 4;
+      }
+#elif ENABLE_ARM32
+      int32x4_t accum = vmovq_n_s32(0);
+      for (; j + 16 <= plane; j += 16) {
+        int32x4_t in_data_vec1;
+        int32x4_t in_data_vec2;
+        int32x4_t in_data_vec3;
+        int32x4_t in_data_vec4;
+        in_data_vec1[0] = in_ptr[0];
+        in_data_vec1[1] = in_ptr[1];
+        in_data_vec1[2] = in_ptr[2];
+        in_data_vec1[3] = in_ptr[3];
+        in_data_vec2[0] = in_ptr[4];
+        in_data_vec2[1] = in_ptr[5];
+        in_data_vec2[2] = in_ptr[6];
+        in_data_vec2[3] = in_ptr[7];
+        in_data_vec3[0] = in_ptr[8];
+        in_data_vec3[1] = in_ptr[9];
+        in_data_vec3[2] = in_ptr[10];
+        in_data_vec3[3] = in_ptr[11];
+        in_data_vec4[0] = in_ptr[12];
+        in_data_vec4[1] = in_ptr[13];
+        in_data_vec4[2] = in_ptr[14];
+        in_data_vec4[3] = in_ptr[15];
+        accum = vaddq_s32(accum, in_data_vec1);
+        accum = vaddq_s32(accum, in_data_vec2);
+        accum = vaddq_s32(accum, in_data_vec3);
+        accum = vaddq_s32(accum, in_data_vec4);
+        in_ptr += 16;
+      }
+      for (; j + 8 <= plane; j += 8) {
+        int32x4_t in_data_vec1;
+        int32x4_t in_data_vec2;
+        in_data_vec1[0] = in_ptr[0];
+        in_data_vec1[1] = in_ptr[1];
+        in_data_vec1[2] = in_ptr[2];
+        in_data_vec1[3] = in_ptr[3];
+        in_data_vec2[0] = in_ptr[4];
+        in_data_vec2[1] = in_ptr[5];
+        in_data_vec2[2] = in_ptr[6];
+        in_data_vec2[3] = in_ptr[7];
+        accum = vaddq_s32(accum, in_data_vec1);
+        accum = vaddq_s32(accum, in_data_vec2);
+        in_ptr += 8;
+      }
+      for (; j + 4 <= plane; j += 4) {
+        int32x4_t in_data_vec;
+        in_data_vec[0] = in_ptr[0];
+        in_data_vec[1] = in_ptr[1];
+        in_data_vec[2] = in_ptr[2];
+        in_data_vec[3] = in_ptr[3];
+        accum = vaddq_s32(accum, in_data_vec);
+        in_ptr += 4;
+      }
+      sum_array += accum[0];
+      sum_array += accum[1];
+      sum_array += accum[2];
+      sum_array += accum[3];
+#endif
+      for (; j < plane; j++) {
+        sum_array += in_ptr[0];
+        in_ptr++;
+      }
+      // requantize the plane sum, fold in the zero-point bias, then clamp to int8
+      int32_t mean =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_array * (1 << (unsigned int)quant_arg.left_shift_),
+                                                              quant_arg.multiplier_),
+                            quant_arg.right_shift_);
+      mean += bias;
+      mean = MSMIN(mean, INT8_MAX);
+      mean = MSMAX(mean, INT8_MIN);
+      out_ptr[0] = mean;
+      out_ptr++;
+    }
+  }
+  return NNACL_OK;
+}
+
+int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
+int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
+  return NNACL_OK;
+}
+
 // Get x such that (x-zp_in) * scale_in = mean
 // Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
 int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
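Note on the requantization in ReduceMeanHW above: the kernel reduces a whole H*W plane to one int32 sum, then rescales it back to int8 entirely in fixed point. The following is a small self-contained model of that pipeline for reference; it assumes the nnacl helpers in quantization/fixed_point.h follow the usual gemmlowp semantics, and the struct and function names here are illustrative, not the library's.

#include <cstdint>

struct QuantMulArgSketch {  // illustrative stand-in for QuantMulArg
  int32_t multiplier_;
  int left_shift_;
  int right_shift_;
};

// (a * b * 2) >> 32 with rounding and saturation, gemmlowp-style.
static int32_t SRDHM(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the only overflow case
  int64_t ab = static_cast<int64_t>(a) * b;
  int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
  return static_cast<int32_t>((ab + nudge) / (1LL << 31));
}

// Rounding arithmetic right shift by a power of two (gemmlowp RoundingDivideByPOT).
static int32_t RDivPOT(int32_t x, int exponent) {
  int32_t mask = (1 << exponent) - 1;
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Mean of one channel plane: sum -> fixed-point multiply -> shift -> zero-point bias -> clamp.
static int8_t MeanOfPlane(const int8_t *plane_data, int plane, QuantMulArgSketch q, int32_t bias) {
  int32_t sum = 0;
  for (int i = 0; i < plane; ++i) sum += plane_data[i];
  int32_t mean = RDivPOT(SRDHM(sum * (1 << q.left_shift_), q.multiplier_), q.right_shift_);
  mean += bias;
  if (mean > INT8_MAX) mean = INT8_MAX;
  if (mean < INT8_MIN) mean = INT8_MIN;
  return static_cast<int8_t>(mean);
}

Because the multiplier already contains the 1/(H*W) factor, no integer division appears anywhere on the hot path.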
diff --git a/mindspore/lite/nnacl/int8/reduce_int8.h b/mindspore/lite/nnacl/int8/reduce_int8.h
index b27634d630..c573514fc9 100644
--- a/mindspore/lite/nnacl/int8/reduce_int8.h
+++ b/mindspore/lite/nnacl/int8/reduce_int8.h
@@ -21,6 +21,23 @@
 extern "C" {
 #endif
 
+int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
+                 int32_t bias);
+int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
+
 int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                    int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
 int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 6c8c65f21d..f80fc5cb84 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -929,13 +929,144 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
   return;
 }
 
-void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel) {
-  for (int n = 0; n < batch; n++) {
-    for (int c = 0; c < channel; c++) {
-      for (int hw = 0; hw < plane; hw++) {
-        int nhwc_index = n * channel * plane + hw * channel + c;
-        int nchw_index = n * channel * plane + c * plane + hw;
-        ((int8_t *)dst)[nchw_index] = ((int8_t *)src)[nhwc_index];
+void PackNHWCToNCHWInt8(const void *src, void *dst, int batches, int plane, int channel) {
+  int hw8 = plane / C8NUM * C8NUM;
+  int c8 = channel / C8NUM * C8NUM;
+  int batch = plane * channel;
+  for (int n = 0; n < batches; n++) {
+    const int8_t *src_batch = (const int8_t *)src + n * batch;
+    int8_t *dst_batch = (int8_t *)dst + n * batch;
+    int hw = 0;
+    for (; hw < hw8; hw += C8NUM) {
+      int c = 0;
+      for (; c < c8; c += C8NUM) {
+        const int8_t *src_ptr = src_batch + hw * channel + c;
+        int8_t *dst_ptr = dst_batch + c * plane + hw;
+#ifdef ENABLE_ARM64
+        size_t srcStride = channel * sizeof(int8_t);
+        size_t dstStride = plane * sizeof(int8_t);
+        asm volatile(
+          "mov x10, %[src_ptr]\n"
+          "mov x11, %[dst_ptr]\n"
+
+          "ld1 {v0.8b}, [x10], %[srcStride]\n"
+          "ld1 {v1.8b}, [x10], %[srcStride]\n"
+          "ld1 {v2.8b}, [x10], %[srcStride]\n"
+          "ld1 {v3.8b}, [x10], %[srcStride]\n"
+
+          "trn1 v4.8b, v0.8b, v1.8b\n"
+          "trn2 v5.8b, v0.8b, v1.8b\n"
+          "trn1 v6.8b, v2.8b, v3.8b\n"
+          "trn2 v7.8b, v2.8b, v3.8b\n"
+
+          "ld1 {v0.8b}, [x10], %[srcStride]\n"
+          "ld1 {v1.8b}, [x10], %[srcStride]\n"
+          "ld1 {v2.8b}, [x10], %[srcStride]\n"
+          "ld1 {v3.8b}, [x10], %[srcStride]\n"
+
+          "trn1 v8.4h, v4.4h, v6.4h\n"
+          "trn2 v9.4h, v4.4h, v6.4h\n"
+          "trn1 v10.4h, v5.4h, v7.4h\n"
+          "trn2 v11.4h, v5.4h, v7.4h\n"
+
+          "trn1 v4.8b, v0.8b, v1.8b\n"
+          "trn2 v5.8b, v0.8b, v1.8b\n"
+          "trn1 v6.8b, v2.8b, v3.8b\n"
+          "trn2 v7.8b, v2.8b, v3.8b\n"
+
+          "trn1 v12.4h, v4.4h, v6.4h\n"
+          "trn2 v13.4h, v4.4h, v6.4h\n"
+          "trn1 v14.4h, v5.4h, v7.4h\n"
+          "trn2 v15.4h, v5.4h, v7.4h\n"
+
+          "trn1 v0.2s, v8.2s, v12.2s\n"
+          "trn2 v4.2s, v8.2s, v12.2s\n"
+          "trn1 v1.2s, v10.2s, v14.2s\n"
+          "trn2 v5.2s, v10.2s, v14.2s\n"
+          "trn1 v2.2s, v9.2s, v13.2s\n"
+          "trn2 v6.2s, v9.2s, v13.2s\n"
+          "trn1 v3.2s, v11.2s, v15.2s\n"
+          "trn2 v7.2s, v11.2s, v15.2s\n"
+
+          "st1 {v0.8b}, [x11], %[dstStride]\n"
+          "st1 {v1.8b}, [x11], %[dstStride]\n"
+          "st1 {v2.8b}, [x11], %[dstStride]\n"
+          "st1 {v3.8b}, [x11], %[dstStride]\n"
+          "st1 {v4.8b}, [x11], %[dstStride]\n"
+          "st1 {v5.8b}, [x11], %[dstStride]\n"
+          "st1 {v6.8b}, [x11], %[dstStride]\n"
+          "st1 {v7.8b}, [x11], %[dstStride]\n"
+          :
+          :
+          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
+          : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+            "v30", "v31");
+#elif ENABLE_ARM32
+        size_t srcStride = channel * sizeof(int8_t);
+        size_t dstStride = plane * sizeof(int8_t);
+        asm volatile(
+          "mov r10, %[src_ptr]\n"
+          "mov r12, %[dst_ptr]\n"
+
+          "vld1.8 {d0}, [r10], %[srcStride]\n"
+          "vld1.8 {d1}, [r10], %[srcStride]\n"
+          "vld1.8 {d2}, [r10], %[srcStride]\n"
+          "vld1.8 {d3}, [r10], %[srcStride]\n"
+          "vld1.8 {d4}, [r10], %[srcStride]\n"
+          "vld1.8 {d5}, [r10], %[srcStride]\n"
+          "vld1.8 {d6}, [r10], %[srcStride]\n"
+          "vld1.8 {d7}, [r10], %[srcStride]\n"
+
+          "vtrn.8 d0, d1\n"
+          "vtrn.8 d2, d3\n"
+          "vtrn.8 d4, d5\n"
+          "vtrn.8 d6, d7\n"
+
+          "vtrn.16 d0, d2\n"
+          "vtrn.16 d1, d3\n"
+          "vtrn.16 d4, d6\n"
+          "vtrn.16 d5, d7\n"
+
+          "vtrn.32 d0, d4\n"
+          "vtrn.32 d1, d5\n"
+          "vtrn.32 d2, d6\n"
+          "vtrn.32 d3, d7\n"
+
+          "vst1.8 {d0}, [r12], %[dstStride]\n"
+          "vst1.8 {d1}, [r12], %[dstStride]\n"
+          "vst1.8 {d2}, [r12], %[dstStride]\n"
+          "vst1.8 {d3}, [r12], %[dstStride]\n"
+          "vst1.8 {d4}, [r12], %[dstStride]\n"
+          "vst1.8 {d5}, [r12], %[dstStride]\n"
+          "vst1.8 {d6}, [r12], %[dstStride]\n"
+          "vst1.8 {d7}, [r12], %[dstStride]\n"
+          :
+          :
+          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
+          : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+            "q15");
+#else
+        for (int tr = 0; tr < C8NUM; tr++) {
+          for (int tc = 0; tc < C8NUM; tc++) {
+            dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
+          }
+        }
+#endif
+      }
+      for (; c < channel; c++) {
+        const int8_t *src_ptr = src_batch + hw * channel + c;
+        int8_t *dst_ptr = dst_batch + c * plane + hw;
+        for (size_t i = 0; i < C8NUM; i++) {
+          dst_ptr[i] = src_ptr[i * channel];
+        }
+      }
+    }
+    for (; hw < plane; hw++) {
+      const int8_t *src_ptr = src_batch + hw * channel;
+      int8_t *dst_ptr = dst_batch + hw;
+      for (size_t i = 0; i < channel; i++) {
+        dst_ptr[i * plane] = src_ptr[i];
       }
     }
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
index 11917adeb3..fd6d0ba09c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
@@ -108,7 +108,6 @@ int ReduceBaseCPUKernel::Init() {
   }
 
   mode_ = reduce_param->mode_;
-  memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));
   reduce_to_end_ = reduce_param->reduce_to_end_;
 
   auto ret = CheckInputsOutputs();
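Note on the pack.c change: the NEON paths are an 8x8 in-register byte transpose (the trn1/trn2 and vtrn ladders) applied per tile, with scalar tails for leftover rows and columns. A plain scalar equivalent, useful for cross-checking the assembly on any platform, is only the index swap below (function name is ours, not the library's).

#include <cstdint>

// Reference NHWC -> NCHW permutation for int8: dst[n][c][hw] = src[n][hw][c].
void PackNHWCToNCHWInt8Ref(const int8_t *src, int8_t *dst, int batches, int plane, int channel) {
  for (int n = 0; n < batches; ++n) {
    const int8_t *src_batch = src + n * plane * channel;
    int8_t *dst_batch = dst + n * plane * channel;
    for (int hw = 0; hw < plane; ++hw) {
      for (int c = 0; c < channel; ++c) {
        dst_batch[c * plane + hw] = src_batch[hw * channel + c];
      }
    }
  }
}

Comparing this reference against PackNHWCToNCHWInt8 over random shapes, especially with plane and channel not divisible by 8, is a cheap way to exercise the tail paths.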
"in tensor malloc failed."; + return nullptr; + } + } + return in_tensor; +} + +lite::Tensor *CreateFilterTensor(TypeId data_type, std::vector filter_shape, + const std::vector &inputs, int copy_length, int index) { + auto filter_tensor = + new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + if (filter_tensor == nullptr) { + MS_LOG(ERROR) << "new filter_tensor failed."; + return nullptr; + } + auto ret = filter_tensor->MallocData(); + if (ret != RET_OK) { + delete filter_tensor; + MS_LOG(ERROR) << "filter_tensor malloc failed."; + return nullptr; + } + if (data_type == kNumberTypeFloat16) { + auto *origin_weight = reinterpret_cast(inputs.at(kWeightIndex)->data_c()); + memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t)); + } else { + MS_ASSERT(data_type == kNumberTypeFloat32); + auto *origin_weight = reinterpret_cast(inputs.at(kWeightIndex)->data_c()); + memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float)); + } + return filter_tensor; +} + +lite::Tensor *CreateBiasTensor(TypeId data_type, std::vector bias_shape, const std::vector &inputs, + int new_out_channel, int index) { + auto *origin_bias = inputs.at(kBiasIndex)->data_c(); + auto bias_tensor = + new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + if (bias_tensor == nullptr) { + MS_LOG(ERROR) << "new bias_tensor failed."; + return nullptr; + } + auto ret = bias_tensor->MallocData(); + if (ret != RET_OK) { + delete bias_tensor; + MS_LOG(ERROR) << "bias_tensor malloc failed."; + return nullptr; + } + if (data_type == kNumberTypeFloat16) { + auto bias_data = reinterpret_cast(origin_bias); + memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t)); + } else { + MS_ASSERT(data_type == kNumberTypeFloat32); + auto bias_data = reinterpret_cast(origin_bias); + memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float)); + } + return bias_tensor; +} + +lite::Tensor *CreateOutputTensor(std::vector out_shape, const std::vector &outputs, + bool infered_flag, int index) { + auto out_tensor = new (std::nothrow) lite::Tensor(); + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "new tmp_out_tensor failed."; + return nullptr; + } + out_tensor->set_data_type(outputs.at(index)->data_type()); + out_tensor->SetFormat(outputs.at(index)->GetFormat()); + if (infered_flag) { + out_tensor->set_shape(out_shape); + auto ret = out_tensor->MallocData(); + if (ret != RET_OK) { + delete out_tensor; + MS_LOG(ERROR) << "out_tensor malloc data failed."; + return nullptr; + } + } + return out_tensor; +} + kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *op_parameter, const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive, int group) { - std::vector group_convs; - std::vector in_shape; - std::vector filter_shape; - std::vector bias_shape; - std::vector out_shape; - + int out_unit; + bool has_bias = inputs.size() == 3; + bool use_winograd = false; + bool infered_flag = (primitive != nullptr && primitive->GetInferFlag()); auto conv_param = reinterpret_cast(op_parameter); - int out_channel = inputs.at(kWeightIndex)->Batch(); + + // update new shape info for each sub kernel int new_in_channel = inputs.at(kWeightIndex)->Channel(); int new_out_channel = 0; if (group == 0) { MS_LOG(ERROR) << "Divisor 
'group' cannot be 0."; return nullptr; } else { - new_out_channel = out_channel / group; + new_out_channel = inputs.at(kWeightIndex)->Batch() / group; } - int kernel_h = conv_param->kernel_h_; - int kernel_w = conv_param->kernel_w_; - int input_num = inputs.size(); - int output_num = outputs.size(); - bool has_bias = input_num == 3; - bool use_winograd = false; - int out_unit; - bool infered_flag = (primitive != nullptr && primitive->GetInferFlag()); + std::vector in_shape; + std::vector out_shape; if (infered_flag) { int batch = inputs.front()->Batch(); - int in_h = inputs.front()->Height(); - int in_w = inputs.front()->Width(); conv_param->input_channel_ = new_in_channel; conv_param->output_channel_ = new_out_channel; CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param); - in_shape = {batch, in_h, in_w, new_in_channel}; + in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel}; out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel}; } + std::vector filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel}; + std::vector bias_shape = {new_out_channel}; - filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel}; - bias_shape = {new_out_channel}; - + // new group conv op + std::vector group_convs; + // create tensors for every sub conv kernel for (int i = 0; i < group; ++i) { std::vector new_inputs; std::vector new_outputs; @@ -272,116 +354,56 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vectordata_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR); + + // create new input for each group + auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag); if (in_tensor == nullptr) { delete new_conv_parameter; FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "new in_tensor failed."; + MS_LOG(ERROR) << "create input tensor failed."; return nullptr; } - if (infered_flag) { - auto ret = in_tensor->MallocData(); - if (ret != RET_OK) { - delete new_conv_parameter; - delete in_tensor; - FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "in tensor malloc failed."; - return nullptr; - } - } new_inputs.emplace_back(in_tensor); - // new weight - auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape, - Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + // create new weight + int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel; + auto filter_tensor = CreateFilterTensor(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i); if (filter_tensor == nullptr) { delete new_conv_parameter; FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "new filter_tensor failed."; - return nullptr; - } - auto ret = filter_tensor->MallocData(); - if (ret != RET_OK) { - delete new_conv_parameter; - delete filter_tensor; - FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "filter_tensor malloc failed."; + MS_LOG(ERROR) << "create filter tensor failed."; return nullptr; } - int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel; - auto filter_data_type = inputs.at(kWeightIndex)->data_type(); - if (filter_data_type == kNumberTypeFloat16) { - auto *origin_weight = reinterpret_cast(inputs.at(kWeightIndex)->data_c()); - memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float16_t)); - } else { - 
MS_ASSERT(filter_data_type == kNumberTypeFloat32); - auto *origin_weight = reinterpret_cast(inputs.at(kWeightIndex)->data_c()); - memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float)); - } new_inputs.emplace_back(filter_tensor); - // if has bias, set new bias + // if has bias, create new bias if (has_bias) { - auto *origin_bias = inputs.at(kBiasIndex)->data_c(); - auto bias_data_type = inputs.at(kBiasIndex)->data_type(); - auto bias_tensor = new (std::nothrow) - lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + auto bias_tensor = CreateBiasTensor(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i); if (bias_tensor == nullptr) { delete new_conv_parameter; FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "new bias_tensor failed."; - return nullptr; - } - ret = bias_tensor->MallocData(); - if (ret != RET_OK) { - delete new_conv_parameter; - delete bias_tensor; - FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "bias_tensor malloc failed."; + MS_LOG(ERROR) << "create bias_tensor failed."; return nullptr; } - if (bias_data_type == kNumberTypeFloat16) { - auto bias_data = reinterpret_cast(origin_bias); - memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float16_t)); - } else { - MS_ASSERT(bias_data_type == kNumberTypeFloat32); - auto bias_data = reinterpret_cast(origin_bias); - memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float)); - } new_inputs.emplace_back(bias_tensor); } - // set new output tensor - for (int j = 0; j < output_num; ++j) { - auto tmp_out_tensor = new (std::nothrow) lite::Tensor(); - if (tmp_out_tensor == nullptr) { + // create new output tensors + for (size_t j = 0; j < outputs.size(); ++j) { + auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j); + if (out_tensor == nullptr) { delete new_conv_parameter; FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "new tmp_out_tensor failed."; + MS_LOG(ERROR) << "new out_tensor failed."; return nullptr; } - tmp_out_tensor->set_data_type(outputs.at(j)->data_type()); - tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat()); - if (infered_flag) { - tmp_out_tensor->set_shape(out_shape); - ret = tmp_out_tensor->MallocData(); - if (ret != RET_OK) { - delete new_conv_parameter; - delete tmp_out_tensor; - FreeMemoryFp16(group_convs, new_inputs, new_outputs); - MS_LOG(ERROR) << "tmp_out_tensor malloc data failed."; - return nullptr; - } - } - new_outputs.emplace_back(tmp_out_tensor); + new_outputs.emplace_back(out_tensor); } - group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs, reinterpret_cast(new_conv_parameter), ctx, primitive, use_winograd, out_unit)); } + return new (std::nothrow) GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group); } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc index 6c3c9b199b..1dfa2e01d2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc @@ -169,8 +169,8 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) { return conv_parameter; } -void FreeMemoryFp32(std::vector group_convs, std::vector new_inputs, - std::vector new_outputs) { +void FreeMemoryFp32(const 
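Note on the fp16 refactor above: the extracted helpers all implement one slice-and-copy idea. Sub-convolution i owns output channels [i * new_out_channel, (i + 1) * new_out_channel), and because the weight layout is [out_channel, kh, kw, in_channel], each group's weights are a single contiguous block of copy_length elements. A minimal sketch of just that split (function name hypothetical, fp32 case only):

#include <cstring>
#include <vector>

// Split a grouped convolution's weights into per-group blobs.
// copy_length per group = kernel_h * kernel_w * new_in_channel * new_out_channel.
std::vector<std::vector<float>> SplitGroupWeights(const float *origin_weight, int group, int kernel_h, int kernel_w,
                                                  int new_in_channel, int new_out_channel) {
  const int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
  std::vector<std::vector<float>> result(group, std::vector<float>(copy_length));
  for (int i = 0; i < group; ++i) {
    // group i's out-channel slice is contiguous, so one memcpy suffices
    std::memcpy(result[i].data(), origin_weight + i * copy_length, copy_length * sizeof(float));
  }
  return result;
}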
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
index 6c3c9b199b..1dfa2e01d2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
@@ -169,8 +169,8 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
   return conv_parameter;
 }
 
-void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
-                    std::vector<lite::Tensor *> new_outputs) {
+void FreeMemoryFp32(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
+                    const std::vector<lite::Tensor *> &new_outputs) {
   for (auto sub_conv : group_convs) {
     if (sub_conv != nullptr) {
       delete sub_conv;
@@ -188,6 +188,87 @@ void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
     }
   }
 }
 
+lite::Tensor *CreateInputTensorFp32(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
+  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
+  if (in_tensor == nullptr) {
+    MS_LOG(ERROR) << "new in_tensor failed.";
+    return nullptr;
+  }
+  if (infered_flag) {
+    auto ret = in_tensor->MallocData();
+    if (ret != RET_OK) {
+      delete in_tensor;
+      MS_LOG(ERROR) << "in tensor malloc failed.";
+      return nullptr;
+    }
+  }
+  return in_tensor;
+}
+
+lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
+                                     const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
+  auto filter_tensor =
+    new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
+  if (filter_tensor == nullptr) {
+    MS_LOG(ERROR) << "new filter_tensor failed.";
+    return nullptr;
+  }
+  auto ret = filter_tensor->MallocData();
+  if (ret != RET_OK) {
+    delete filter_tensor;
+    MS_LOG(ERROR) << "filter_tensor malloc failed.";
+    return nullptr;
+  }
+
+  MS_ASSERT(data_type == kNumberTypeFloat32);
+  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
+  memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
+  return filter_tensor;
+}
+
+lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
+                                   const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
+  auto *origin_bias = inputs.at(kBiasIndex)->data_c();
+  auto bias_tensor =
+    new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
+  if (bias_tensor == nullptr) {
+    MS_LOG(ERROR) << "new bias_tensor failed.";
+    return nullptr;
+  }
+  auto ret = bias_tensor->MallocData();
+  if (ret != RET_OK) {
+    delete bias_tensor;
+    MS_LOG(ERROR) << "bias_tensor malloc failed.";
+    return nullptr;
+  }
+  MS_ASSERT(data_type == kNumberTypeFloat32);
+  auto bias_data = reinterpret_cast<float *>(origin_bias);
+  memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
+
+  return bias_tensor;
+}
+
+lite::Tensor *CreateOutputTensorFp32(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
+                                     bool infered_flag, int index) {
+  auto out_tensor = new (std::nothrow) lite::Tensor();
+  if (out_tensor == nullptr) {
+    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
+    return nullptr;
+  }
+  out_tensor->set_data_type(outputs.at(index)->data_type());
+  out_tensor->SetFormat(outputs.at(index)->GetFormat());
+  if (infered_flag) {
+    out_tensor->set_shape(out_shape);
+    auto ret = out_tensor->MallocData();
+    if (ret != RET_OK) {
+      delete out_tensor;
+      MS_LOG(ERROR) << "out_tensor malloc data failed.";
+      return nullptr;
+    }
+  }
+  return out_tensor;
+}
+
 kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                             const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
@@ -208,31 +289,22 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
                                                   const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                   const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
                                                   int group) {
-  std::vector<kernel::LiteKernel *> group_convs;
+  int out_unit;
+  bool has_bias = inputs.size() == 3;
+  bool use_winograd = false;
+  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
+  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
+
   std::vector<int> in_shape;
-  std::vector<int> filter_shape;
-  std::vector<int> bias_shape;
   std::vector<int> out_shape;
-
-  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
-  int out_channel = inputs.at(kWeightIndex)->Batch();
   int new_in_channel = inputs.at(kWeightIndex)->Channel();
   int new_out_channel = 0;
   if (group == 0) {
     MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
     return nullptr;
   } else {
-    new_out_channel = out_channel / group;
+    new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
   }
-  int kernel_h = conv_param->kernel_h_;
-  int kernel_w = conv_param->kernel_w_;
-  int input_num = inputs.size();
-  int output_num = outputs.size();
-  bool has_bias = input_num == 3;
-  bool use_winograd = false;
-  int out_unit;
-  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
-
   if (infered_flag) {
     int batch = inputs.front()->Batch();
     int in_h = inputs.front()->Height();
@@ -243,11 +315,11 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
     out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
   }
+  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
+  std::vector<int> bias_shape = {new_out_channel};
 
-  filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
-  bias_shape = {new_out_channel};
-  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
-
+  // create sub kernels
+  std::vector<kernel::LiteKernel *> group_convs;
   for (int i = 0; i < group; ++i) {
     std::vector<lite::Tensor *> new_inputs;
     std::vector<lite::Tensor *> new_outputs;
@@ -257,100 +329,58 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
-    // new input
-    auto in_tensor =
-      new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);
+
+    // create new input for each group
+    auto in_tensor = CreateInputTensorFp32(inputs.front()->data_type(), in_shape, infered_flag);
     if (in_tensor == nullptr) {
       delete new_conv_parameter;
       FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "new in_tensor failed.";
+      MS_LOG(ERROR) << "create input tensor failed.";
       return nullptr;
     }
-    if (infered_flag) {
-      auto ret = in_tensor->MallocData();
-      if (ret != RET_OK) {
-        delete new_conv_parameter;
-        delete in_tensor;
-        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "in tensor malloc failed.";
-        return nullptr;
-      }
-    }
     new_inputs.emplace_back(in_tensor);
 
-    // new weight
-    auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
-                                                         Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
+    // create new weight
+    int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
+    auto filter_tensor =
+      CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
     if (filter_tensor == nullptr) {
       delete new_conv_parameter;
       FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "new filter_tensor failed.";
+      MS_LOG(ERROR) << "create filter tensor failed.";
       return nullptr;
     }
-    auto ret = filter_tensor->MallocData();
-    if (ret != RET_OK) {
-      delete new_conv_parameter;
-      delete filter_tensor;
-      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "filter_tensor malloc failed.";
-      return nullptr;
-    }
-    int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
-    memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
     new_inputs.emplace_back(filter_tensor);
 
-    // if has bias, set new bias
+    // if has bias, create new bias
     if (has_bias) {
-      auto *origin_bias = reinterpret_cast<float *>(inputs.at(kBiasIndex)->data_c());
-      auto bias_tensor = new (std::nothrow)
-        lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
+      auto bias_tensor =
+        CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
      if (bias_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "new bias_tensor failed.";
-        return nullptr;
-      }
-      ret = bias_tensor->MallocData();
-      if (ret != RET_OK) {
-        delete new_conv_parameter;
-        delete bias_tensor;
-        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "bias_tensor malloc failed.";
+        MS_LOG(ERROR) << "create bias_tensor failed.";
         return nullptr;
       }
-      memcpy(bias_tensor->data_c(), origin_bias + i * new_out_channel, new_out_channel * sizeof(float));
       new_inputs.emplace_back(bias_tensor);
     }
 
-    // set new output tensor
-    for (int j = 0; j < output_num; ++j) {
-      auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
-      if (tmp_out_tensor == nullptr) {
+    // create new output tensor
+    for (size_t j = 0; j < outputs.size(); ++j) {
+      auto out_tensor = CreateOutputTensorFp32(out_shape, outputs, infered_flag, j);
+      if (out_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "new tmp_out_tensor failed.";
+        MS_LOG(ERROR) << "new out_tensor failed.";
        return nullptr;
      }
-      tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
-      tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
-      if (infered_flag) {
-        tmp_out_tensor->set_shape(out_shape);
-        ret = tmp_out_tensor->MallocData();
-        if (ret != RET_OK) {
-          delete new_conv_parameter;
-          delete tmp_out_tensor;
-          FreeMemoryFp32(group_convs, new_inputs, new_outputs);
-          MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
-          return nullptr;
-        }
-      }
-      new_outputs.emplace_back(tmp_out_tensor);
+      new_outputs.emplace_back(out_tensor);
     }
-
     group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
                                                      reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
                                                      primitive, use_winograd, out_unit));
   }
+
   return new (std::nothrow)
     GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
 }
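The fp32 creator above mirrors the fp16 one, so the per-group shape bookkeeping is identical. A worked example of that math, with hypothetical values (this assumes the weight tensor's Channel() already reports the per-group input depth, since grouped weights are stored as [out_channel, kh, kw, in_channel/group]):

#include <cassert>
#include <vector>

int main() {
  int group = 2, out_channel = 64, in_channel_per_group = 16, kh = 3, kw = 3;
  int new_out_channel = out_channel / group;          // 32 output channels per sub-conv
  int new_in_channel = in_channel_per_group;          // unchanged: already per-group
  std::vector<int> filter_shape = {new_out_channel, kh, kw, new_in_channel};
  int copy_length = kh * kw * new_in_channel * new_out_channel;  // 3*3*16*32
  assert(copy_length == 4608);                        // elements copied per group
  (void)filter_shape;
  return 0;
}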
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
index 92a8fe5db4..bc16817914 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
@@ -19,6 +19,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
 #include "nnacl/quantization/quantize.h"
+#include "nnacl/pack.h"
 #include "include/errorcode.h"
 
 using mindspore::lite::KernelRegistrar;
@@ -38,11 +39,77 @@
 using mindspore::schema::PrimitiveType_Mean;
 using mindspore::schema::PrimitiveType_Reduce;
 
 namespace mindspore::kernel {
+void ReduceInt8CPUKernel::OneAxis() {
+  auto axis_info = axes_[0];
+  if (axis_info == 0) {
+    pattern_ = kernel::N;
+  } else if (axis_info == 1) {
+    pattern_ = kernel::H;
+  } else if (axis_info == 2) {
+    pattern_ = kernel::W;
+  } else {
+    pattern_ = kernel::C;
+  }
+}
+
+void ReduceInt8CPUKernel::TwoAxes() {
+  auto axis_info1 = axes_[0];
+  auto axis_info2 = axes_[1];
+  auto axis_sum = axis_info1 + axis_info2;
+  if (axis_sum == 1) {
+    pattern_ = kernel::NH;
+  } else if (axis_sum == 2) {
+    pattern_ = kernel::NW;
+  } else if (axis_sum == 3) {
+    // sum 3 is ambiguous: axes {0, 3} mean NC while {1, 2} mean HW
+    if (axis_info1 == 0) {
+      pattern_ = kernel::NC;
+    } else {
+      pattern_ = kernel::HW;
+    }
+  } else if (axis_sum == 4) {
+    pattern_ = kernel::HC;
+  } else {
+    MS_ASSERT(axis_sum == 5);
+    pattern_ = kernel::WC;
+  }
+}
+
+void ReduceInt8CPUKernel::ThreeAxes() {
+  auto axis_info1 = axes_[0];
+  auto axis_info2 = axes_[1];
+  auto axis_info3 = axes_[2];
+  auto axis_sum = axis_info1 + axis_info2 + axis_info3;
+  if (axis_sum == 3) {
+    pattern_ = kernel::NHW;
+  } else if (axis_sum == 4) {
+    pattern_ = kernel::NHC;
+  } else if (axis_sum == 5) {
+    pattern_ = kernel::NWC;
+  } else {
+    MS_ASSERT(axis_sum == 6);
+    pattern_ = kernel::HWC;
+  }
+}
+
+void ReduceInt8CPUKernel::Match4DReducePattern() {
+  if (num_axes_ == 1) {
+    OneAxis();
+  } else if (num_axes_ == 2) {
+    TwoAxes();
+  } else if (num_axes_ == 3) {
+    ThreeAxes();
+  } else {
+    MS_ASSERT(num_axes_ == 4);
+    pattern_ = kernel::NHWC;
+  }
+}
+
 int ReduceInt8CPUKernel::Init() {
   auto ret = ReduceBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
+  Match4DReducePattern();
   if (!this->in_tensors_[0]->shape().empty()) {
     this->valid_shape_ = true;
     ret = CalculateQuantArgs();
@@ -96,6 +163,64 @@ int ReduceInt8CPUKernel::Init() {
   return ReSize();
 }
 
+void ReduceInt8CPUKernel::ReduceMean4DCalQuantParam() {
+  int reduce_num = 1;
+  auto in_shape = in_tensors_.front()->shape();
+  switch (pattern_) {
+    case N:
+      reduce_num = in_shape[0];
+      break;
+    case H:
+      reduce_num = in_shape[1];
+      break;
+    case W:
+      reduce_num = in_shape[2];
+      break;
+    case C:
+      reduce_num = in_shape[3];
+      break;
+    case NH:
+      reduce_num = in_shape[0] * in_shape[1];
+      break;
+    case NW:
+      reduce_num = in_shape[0] * in_shape[2];
+      break;
+    case NC:
+      reduce_num = in_shape[0] * in_shape[3];
+      break;
+    case HW:
+      reduce_num = in_shape[1] * in_shape[2];
+      break;
+    case HC:
+      reduce_num = in_shape[1] * in_shape[3];
+      break;
+    case WC:
+      reduce_num = in_shape[2] * in_shape[3];
+      break;
+    case NHW:
+      reduce_num = in_shape[0] * in_shape[1] * in_shape[2];
+      break;
+    case NHC:
+      reduce_num = in_shape[0] * in_shape[1] * in_shape[3];
+      break;
+    case NWC:
+      reduce_num = in_shape[0] * in_shape[2] * in_shape[3];
+      break;
+    case HWC:
+      reduce_num = in_shape[1] * in_shape[2] * in_shape[3];
+      break;
+    case NHWC:
+      reduce_num = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
+      break;
+  }
+  // both zero points fold into a single additive bias applied after rescaling
+  bias_ = quant_arg_.out_zp_ - quant_arg_.in_zp_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
+  int shift;
+  double reciprocal = quant_arg_.in_scale_ / (quant_arg_.out_scale_ * reduce_num);
+  QuantizeMultiplierSmallerThanOne(reciprocal, &reduce_mean_quant_param_.multiplier_, &shift);
+  reduce_mean_quant_param_.left_shift_ = shift < 0 ? -shift : 0;
+  reduce_mean_quant_param_.right_shift_ = shift > 0 ? shift : 0;
+}
+
 int ReduceInt8CPUKernel::CalculateQuantArgs() {
   lite::Tensor *input = in_tensors_.at(0);
   lite::Tensor *output = out_tensors_.at(0);
@@ -117,18 +242,24 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() {
 
   // (quant_out - zp_out)*scale_out = sum((quant_in -zp)*scale_in) * (1/num) for each axis in axes
   // quant_out = sum(quant_in-zp) * (scale_in/scale_out) * (1/num)
   if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
-    for (auto i = 0; i < num_axes_; i++) {
-      auto axis = axes_[i];
-      double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
-      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
-      if (qm == nullptr) {
-        MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
-        return RET_NULL_PTR;
+    if (input->shape().size() == 4 && pattern_ == kernel::HW) {
+      // special case: the 4D HW pattern has a dedicated fast implementation
+      ReduceMean4DCalQuantParam();
+      pattern_impl_ = true;
+    } else {
+      for (auto i = 0; i < num_axes_; i++) {
+        auto axis = axes_[i];
+        double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
+        QuantMulArg *qm = new (std::nothrow) QuantMulArg;
+        if (qm == nullptr) {
+          MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
+          return RET_NULL_PTR;
+        }
+        QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
+        qm->left_shift_ = shift < 0 ? -shift : 0;
+        qm->right_shift_ = shift > 0 ? shift : 0;
+        mean_multipliers_.push_back(qm);
       }
-      QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
-      qm->left_shift_ = shift < 0 ? -shift : 0;
-      qm->right_shift_ = shift > 0 ? shift : 0;
-      mean_multipliers_.push_back(qm);
     }
   }
 
@@ -230,6 +361,16 @@ int ReduceInt8Impl(void *cdata, int task_id) {
   return RET_OK;
 }
 
+int ReduceMeanPatternInt8Impl(void *cdata, int task_id) {
+  auto reduce = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
+  auto error_code = reduce->Reduce4DExecute(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
   MS_ASSERT(i < static_cast<size_t>(num_axis_));
   if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
@@ -250,6 +391,78 @@ void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
   }
 }
 
+int ReduceInt8CPUKernel::Reduce4DExecute(int task_id) {
+  auto input = in_tensors_.at(0);
+  auto in_data = reinterpret_cast<int8_t *>(input->data_c());
+  auto in_shape = input->shape();
+  MS_ASSERT(in_shape.size() == 4);
+  int n = in_shape.at(0);
+  int h = in_shape.at(1);
+  int w = in_shape.at(2);
+  int c = in_shape.at(3);
+  auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
+  switch (pattern_) {
+    case N:
+      return ReduceMeanN(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case H:
+      return ReduceMeanH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case W:
+      return ReduceMeanW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case C:
+      return ReduceMeanC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NH:
+      return ReduceMeanNH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NW:
+      return ReduceMeanNW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NC:
+      return ReduceMeanNC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case HW: {
+      // data has been converted to NCHW layout for efficiency, so each channel's H*W plane is contiguous
+      int num = UP_DIV(c, ctx_->thread_num_);  // channels per task
+      int count = c - task_id * num;
+      count = count > num ? num : count;
+      int plane = h * w;
+      return ReduceMeanHW(n, plane, count, c, nchw_in_data_ + task_id * num * plane, output_data + task_id * num,
+                          reduce_mean_quant_param_, bias_);
+    }
+    case HC:
+      return ReduceMeanHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case WC:
+      return ReduceMeanWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NHW:
+      return ReduceMeanNHW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NHC:
+      return ReduceMeanNHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NWC:
+      return ReduceMeanNWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case HWC:
+      return ReduceMeanHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+    case NHWC:
+      return ReduceMeanNHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
+  }
+  return RET_OK;
+}
+
+int ReduceInt8CPUKernel::Fast4DReduceMeanHWImpl() {
+  auto input = in_tensors_.at(0);
+  auto input_data = reinterpret_cast<int8_t *>(input->data_c());
+  nchw_in_data_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(input->ElementsNum()));
+  if (nchw_in_data_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nchw_in_data_ failed.";
+    return RET_ERROR;
+  }
+  PackNHWCToNCHWInt8(reinterpret_cast<void *>(input_data), reinterpret_cast<void *>(nchw_in_data_), input->Batch(),
+                     input->Height() * input->Width(), input->Channel());
+  auto ret = ParallelLaunch(this->context_->thread_pool_, ReduceMeanPatternInt8Impl, this, context_->thread_num_);
+  if (ret != RET_OK) {
+    ctx_->allocator->Free(nchw_in_data_);
+    MS_LOG(ERROR) << "Reduce run error, error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  ctx_->allocator->Free(nchw_in_data_);
+  return RET_OK;
+}
+
 int ReduceInt8CPUKernel::Run() {
   if (!this->valid_shape_) {
     auto ret = CalculateQuantArgs();
@@ -257,6 +470,11 @@ int ReduceInt8CPUKernel::Run() {
       return ret;
     }
  }
+  // so far only the 4D ReduceMean HW pattern has a fast path; every other case falls back to the reference impl below
+  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean) && pattern_impl_ && pattern_ == kernel::HW) {
+    return Fast4DReduceMeanHWImpl();
+  }
+
   auto ret = MallocTmpBuffer();
   if (ret != RET_OK) {
     FreeTmpBuffer();
@@ -313,6 +531,7 @@ int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
   }
   return ret;
 }
+
 kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                const lite::InnerContext *ctx, const kernel::KernelKey &desc,
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
index 92a7542150..79b6405aff 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
@@ -28,6 +28,7 @@
 using mindspore::schema::ReduceMode;
 
 namespace mindspore::kernel {
+enum Four_DIMENSION_REDUCE_TEMPLATE { N, H, W, C, NH, NW, NC, HW, HC, WC, NHW, NHC, NWC, HWC, NHWC };
 class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
   typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                          int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
@@ -38,7 +39,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
   ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                       const mindspore::lite::PrimitiveC *primitive)
-      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
+      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive), ctx_(ctx) {}
   ~ReduceInt8CPUKernel() {
     for (auto qm : mean_multipliers_) {
       delete qm;
@@ -59,6 +60,8 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
+  int Fast4DReduceMeanHWImpl();
+  int Reduce4DExecute(int task_id);
   int CallReduceUnit(int task_id);
   int ReduceLastAxis(int task_id);
 
@@ -68,22 +71,31 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  private:
   int MallocTmpBuffer();
   void FreeTmpBuffer();
-
+  void Match4DReducePattern();
+  void OneAxis();
+  void TwoAxes();
+  void ThreeAxes();
+  void ReduceMean4DCalQuantParam();
   int CalculateQuantArgs();
   void GetQuantArgs(size_t i);
 
  private:
   ReduceParameter *param_ = nullptr;
   ReduceQuantArg quant_arg_;
+  int8_t *nchw_in_data_ = nullptr;
+  int32_t bias_;
 
 private:
+  const lite::InnerContext *ctx_;
   int32_t *begin_src_data_ = nullptr;
   int8_t *last_dst_data_ = nullptr;
   std::vector<int32_t *> data_buffers_;
   const int32_t *src_data_ = nullptr;
   int32_t *dst_data_ = nullptr;
   bool valid_shape_ = false;
-
+  bool pattern_impl_ = false;
+  Four_DIMENSION_REDUCE_TEMPLATE pattern_;
+  QuantMulArg reduce_mean_quant_param_;  // used only by the 4D reduce-mean fast path
   Reducer reducer_ = nullptr;
   LastReducer last_reducer_ = nullptr;
   std::vector<QuantMulArg *> mean_multipliers_;
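A note on Match4DReducePattern: axes_ is assumed to arrive from ReduceBaseCPUKernel sorted ascending with duplicates removed, so the sum of the axis indices identifies the combination, except for two axes summing to 3 ({0,3} vs {1,2}), which TwoAxes disambiguates by the first axis. The two-axis mapping as a compact, checkable model (names are ours):

enum PatternSketch { kNH, kNW, kNC, kHW, kHC, kWC };  // two-axis subset of the real enum

PatternSketch MatchTwoAxes(int a0, int a1) {  // requires a0 < a1, both in [0, 3]
  switch (a0 + a1) {
    case 1: return kNH;                   // {0, 1}
    case 2: return kNW;                   // {0, 2}
    case 3: return a0 == 0 ? kNC : kHW;   // {0, 3} vs {1, 2}: the only ambiguous sum
    case 4: return kHC;                   // {1, 3}
    default: return kWC;                  // {2, 3}
  }
}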
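How ReduceMean4DCalQuantParam's numbers behave end to end, assuming QuantizeMultiplierSmallerThanOne is the usual frexp-based decomposition (that internal detail is an assumption here, and the function below is a sketch, not the library routine): the real rescale factor in_scale / (out_scale * reduce_num) is split into a Q31 multiplier and a right shift, and the zero points collapse into the additive bias_.

#include <cmath>
#include <cstdint>

// Decompose real_multiplier (< 1) into multiplier * 2^-right_shift, multiplier in Q31.
void QuantizeMultiplierSketch(double real_multiplier, int32_t *multiplier, int *right_shift) {
  int shift = 0;
  double q = std::frexp(real_multiplier, &shift);  // real = q * 2^shift, q in [0.5, 1)
  auto q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding overflowed the Q31 mantissa
    q_fixed /= 2;
    ++shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
  *right_shift = -shift;  // real < 1 implies shift <= 0, i.e. a right shift
}

// Worked example: in_scale = 0.5, out_scale = 0.25, H*W = 16.
// real = 0.5 / (0.25 * 16) = 0.125 -> multiplier = 2^30 (q = 0.5), right_shift = 2;
// SRDHM(sum, 2^30) gives sum / 2, then the rounding shift by 2 gives sum / 8 = sum * 0.125.
// bias example: in_zp = 10, out_zp = 5 -> bias_ = 5 - 10 * 0.5 / 0.25 = -15.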
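Finally, the HW fast path's work split in Reduce4DExecute: each ParallelLaunch task takes one contiguous block of channels from the NCHW-packed buffer, which is why the pack step pays off. A standalone model of the slicing, with UP_DIV defined as nnacl's ceiling-division macro (the struct is ours):

#include <algorithm>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

struct ChannelSlice {
  int first;  // first channel owned by this task
  int count;  // may be 0 for trailing tasks when c <= task_id * num
};

ChannelSlice SliceChannels(int c, int thread_num, int task_id) {
  int num = UP_DIV(c, thread_num);  // channels per task, rounded up
  int count = std::max(0, std::min(num, c - task_id * num));
  // the task reads nchw_data + first * plane and writes out_data + first
  return {task_id * num, count};
}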