!5878 reduce prod support int

Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
pull/5878/MERGE
mindspore-ci-bot authored 4 years ago, committed by Gitee
commit 8200410f20

@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
int offset4d(const int *shape, const int *dims);
inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}
inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}
#ifdef ENABLE_ARM64
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);

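The two helpers added here to nnacl/common_func.h are post-hoc overflow checks for 32-bit signed arithmetic: isAddOverflow flags a sum whose sign flipped, and isMulOverflow flags a product that no longer divides back to its factor. A minimal standalone sketch (not part of the patch, names chosen for illustration) of how such a guard protects an accumulating product:

#include <cstdint>
#include <cstdio>

// Same post-hoc check as the patch: if x != 0 and (x * y) / x != y,
// the 32-bit product wrapped around.
static bool isMulOverflow(int32_t x, int32_t y) {
  int32_t p = x * y;
  return (x != 0) && (p / x != y);
}

int main() {
  const int32_t data[] = {100000, 50000, 3};  // full product exceeds INT32_MAX
  int32_t acc = 1;
  for (int32_t v : data) {
    if (isMulOverflow(acc, v)) {
      printf("overflow detected, abort the reduction\n");
      return 1;
    }
    acc *= v;
  }
  printf("product = %d\n", acc);
  return 0;
}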
@@ -17,6 +17,7 @@
#include <float.h>
#include "nnacl/fp32/reduce.h"
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"
int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
}
return NNACL_OK;
}
int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {
return NNACL_NULL_PTR;
}
int i, j, k;
for (j = tid; j < outer_size; j += thread_num) {
const int *outer_src = src_data + j * axis_size * inner_size;
int *outer_dst = dst_data + j * inner_size;
for (k = 0; k < inner_size; k++) {
const int *inner_src = outer_src + k;
int *inner_dst = outer_dst + k;
int tmp = 1;
for (i = 0; i < axis_size; i++) {
if (isMulOverflow(tmp, inner_src[i * inner_size])) {
return NNACL_ERRCODE_MUL_OVERFLOW;
}
tmp *= inner_src[i * inner_size];
}
*inner_dst = tmp;
}
}
return NNACL_OK;
}
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {

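IntReduceProd mirrors the existing float ReduceProd: the input is viewed as [outer_size, axis_size, inner_size], the product is taken over the middle axis, and threads split the outer dimension by stepping from tid in strides of thread_num, with each partial product checked by isMulOverflow before it is applied. A minimal single-threaded sketch (not part of the patch, assuming the nnacl headers are on the include path) that reduces a 2x3x2 int tensor over its middle axis:

#include <cstdio>
#include "nnacl/fp32/reduce.h"
#include "nnacl/errorcode.h"

int main() {
  const int outer_size = 2, inner_size = 2, axis_size = 3;
  // Layout is [outer, axis, inner]: outer block 0 holds {1,2},{3,4},{5,6}.
  const int src[12] = {1, 2, 3, 4, 5, 6,
                       2, 2, 2, 2, 2, 2};
  int dst[4] = {0};
  // tid 0 of 1 thread handles every outer block.
  int ret = IntReduceProd(outer_size, inner_size, axis_size, src, dst, 0, 1);
  if (ret != NNACL_OK) {
    printf("reduce failed with error %d\n", ret);
    return ret;
  }
  // Expect {1*3*5, 2*4*6, 2*2*2, 2*2*2} = {15, 48, 8, 8}.
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
  return 0;
}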
@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
const int tid, const int thread_num);
int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num);
int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num);
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
#ifdef __cplusplus

@@ -18,16 +18,7 @@
#include "nnacl/int8/reduce_int8.h"
#include "nnacl/errorcode.h"
#include "nnacl/quantization/fixed_point.h"
inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}
inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}
#include "nnacl/common_func.h"
// Get x such that (x-zp_in) * scale_in = mean
// Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
(tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
quant->in_out_multiplier_),
quant->in_out_right_shift_ + base_offset);
quant->in_out_right_shift_ + base_offset);
if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
return NNACL_ERRCODE_ADD_OVERFLOW;
}

@@ -53,6 +53,7 @@
typedef enum LiteDataType {
kDataTypeFloat,
kDataTypeInt,
kDataTypeInt8,
} LiteDataType;

@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)

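The registration block now uses kNumberTypeInt32 in place of kNumberTypeInt and keeps routing integer Reduce through CpuReduceFp32KernelCreator, so the same ReduceCPUKernel serves both float32 and int32 inputs and picks its reducer internally. A purely illustrative sketch of that registry idea (not MindSpore's actual REG_KERNEL machinery; the map and the creator return values are made up):

#include <cstdio>
#include <map>
#include <utility>

enum TypeId { kNumberTypeFloat32, kNumberTypeInt32, kNumberTypeInt8 };
enum PrimitiveType { PrimitiveType_Reduce, PrimitiveType_Mean };
typedef const char *(*KernelCreator)();

static const char *CpuReduceFp32KernelCreator() { return "ReduceCPUKernel"; }
static const char *CpuReduceInt8KernelCreator() { return "ReduceInt8CPUKernel"; }

int main() {
  // Creators keyed by (tensor data type, primitive type).
  std::map<std::pair<TypeId, PrimitiveType>, KernelCreator> registry;
  registry[{kNumberTypeFloat32, PrimitiveType_Reduce}] = CpuReduceFp32KernelCreator;
  registry[{kNumberTypeInt32, PrimitiveType_Reduce}] = CpuReduceFp32KernelCreator;  // int32 reuses the fp32 kernel
  registry[{kNumberTypeInt8, PrimitiveType_Reduce}] = CpuReduceInt8KernelCreator;
  printf("%s\n", registry[{kNumberTypeInt32, PrimitiveType_Reduce}]());
  return 0;
}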
@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
}
case static_cast<int>(ReduceMode_ReduceProd): {
reducer_ = ReduceProd;
int_reducer_ = IntReduceProd;
break;
}
case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
return ReSize();
}
int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
int ReduceCPUKernel::ReSize() {
if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
data_type_ = kDataTypeFloat;
} else {
data_type_ = kDataTypeInt;
}
return ReduceBaseCPUKernel::ReSize();
}
int ReduceCPUKernel::CallReduceUnit(int task_id) {
auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
int ret;
if (data_type_ == kDataTypeFloat) {
ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
static_cast<float *>(dst_data_), task_id, context_->thread_num_);
} else {
ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
static_cast<int *>(dst_data_), task_id, context_->thread_num_);
}
return ret;
}
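CallReduceUnit now keeps src_data_ and dst_data_ as untyped pointers and casts them to whichever signature matches data_type_, so one code path drives both the float and the int reducer. A minimal standalone sketch of that dispatch pattern (not the actual kernel; the reducer signatures are simplified for illustration):

#include <cstdio>

enum LiteDataType { kDataTypeFloat, kDataTypeInt };

typedef int (*Reducer)(const float *src, float *dst, int n);
typedef int (*IntReducer)(const int *src, int *dst, int n);

static int FloatProd(const float *src, float *dst, int n) {
  float p = 1.0f;
  for (int i = 0; i < n; ++i) p *= src[i];
  *dst = p;
  return 0;
}

static int IntProd(const int *src, int *dst, int n) {
  int p = 1;
  for (int i = 0; i < n; ++i) p *= src[i];
  *dst = p;
  return 0;
}

// Mirrors the idea of CallReduceUnit: cast the untyped buffers to the
// element type the kernel decided on at ReSize time.
static int CallReduceUnit(LiteDataType t, Reducer f, IntReducer g,
                          const void *src, void *dst, int n) {
  if (t == kDataTypeFloat) {
    return f(static_cast<const float *>(src), static_cast<float *>(dst), n);
  }
  return g(static_cast<const int *>(src), static_cast<int *>(dst), n);
}

int main() {
  const int src[4] = {1, 2, 3, 4};
  int dst = 0;
  CallReduceUnit(kDataTypeInt, FloatProd, IntProd, src, &dst, 4);
  printf("%d\n", dst);  // 24
  return 0;
}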
@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
return ret;
}
src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
src_data_ = in_tensors_.at(0)->MutableData();
for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
if (i != static_cast<size_t>(num_axes_ - 1)) {
dst_data_ = data_buffers_[i];
} else {
dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
dst_data_ = out_tensors_.at(0)->MutableData();
}
outer_size_ = outer_sizes_[i];
inner_size_ = inner_sizes_[i];
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
int ReduceCPUKernel::MallocTmpBuffer() {
data_buffers_.clear();
for (auto size : buffer_sizes_) {
float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
void *buffer;
if (data_type_ == kDataTypeFloat) {
buffer = context_->allocator->Malloc(size * sizeof(float));
} else {
buffer = context_->allocator->Malloc(size * sizeof(int));
}
if (buffer == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return RET_ERROR;
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
}
void ReduceCPUKernel::FreeTmpBuffer() {
for (size_t i = 0; i < data_buffers_.size(); i++) {
float *buffer = data_buffers_[i];
for (auto buffer : data_buffers_) {
if (buffer != nullptr) {
context_->allocator->Free(buffer);
buffer = nullptr;

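MallocTmpBuffer and FreeTmpBuffer now manage untyped (void *) intermediate buffers sized by the active element type, so float and int reductions share the same buffer plumbing. A minimal standalone sketch of that sizing logic (not part of the patch; malloc/free stand in for context_->allocator and the buffer sizes are made up):

#include <cstdlib>
#include <vector>

enum LiteDataType { kDataTypeFloat, kDataTypeInt, kDataTypeInt8 };

int main() {
  LiteDataType data_type = kDataTypeInt;
  std::vector<size_t> buffer_sizes = {24, 6};  // elements needed per intermediate reduce step
  std::vector<void *> data_buffers;
  for (auto size : buffer_sizes) {
    // Element size follows the data type chosen at ReSize time.
    size_t elem = (data_type == kDataTypeFloat) ? sizeof(float) : sizeof(int);
    void *buffer = std::malloc(size * elem);
    if (buffer == nullptr) {
      return 1;  // "Malloc data failed." in the kernel
    }
    data_buffers.push_back(buffer);
  }
  // ... run the per-axis reduce steps through the buffers ...
  for (auto buffer : data_buffers) {
    std::free(buffer);
  }
  data_buffers.clear();
  return 0;
}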
@@ -29,6 +29,8 @@ namespace mindspore::kernel {
class ReduceCPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
int *dst_data, const int tid, const int thread_num);
public:
ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
const mindspore::lite::PrimitiveC *primitive)
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
~ReduceCPUKernel() {
FreeTmpBuffer();
src_data_ = nullptr;
dst_data_ = nullptr;
reducer_ = nullptr;
int_reducer_ = nullptr;
}
int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
private:
Reducer reducer_ = nullptr;
std::vector<float *> data_buffers_;
const float *src_data_ = nullptr;
float *dst_data_ = nullptr;
IntReducer int_reducer_ = nullptr;
std::vector<void *> data_buffers_;
LiteDataType data_type_;
const void *src_data_ = nullptr;
void *dst_data_ = nullptr;
private:
int MallocTmpBuffer();
