!5878 reduce prod support int

Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
pull/5878/MERGE
mindspore-ci-bot authored 4 years ago, committed by Gitee
commit 8200410f20

@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
int offset4d(const int *shape, const int *dims);
inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}
inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}
#ifdef ENABLE_ARM64
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);

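The two helpers added here to nnacl/common_func.h are post-hoc overflow checks for 32-bit signed arithmetic: isAddOverflow flags a sum whose sign flipped, and isMulOverflow flags a product that no longer divides back to its factor. A minimal standalone sketch (not part of the patch, names chosen for illustration) of how such a guard protects an accumulating product:

#include <cstdint>
#include <cstdio>

// Same post-hoc check as the patch: if x != 0 and (x * y) / x != y,
// the 32-bit product wrapped around.
static bool isMulOverflow(int32_t x, int32_t y) {
  int32_t p = x * y;
  return (x != 0) && (p / x != y);
}

int main() {
  const int32_t data[] = {100000, 50000, 3};  // full product exceeds INT32_MAX
  int32_t acc = 1;
  for (int32_t v : data) {
    if (isMulOverflow(acc, v)) {
      printf("overflow detected, abort the reduction\n");
      return 1;
    }
    acc *= v;
  }
  printf("product = %d\n", acc);
  return 0;
}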
@@ -17,6 +17,7 @@
#include <float.h>
#include "nnacl/fp32/reduce.h"
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"
int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
}
return NNACL_OK;
}
int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {
return NNACL_NULL_PTR;
}
int i, j, k;
for (j = tid; j < outer_size; j += thread_num) {
const int *outer_src = src_data + j * axis_size * inner_size;
int *outer_dst = dst_data + j * inner_size;
for (k = 0; k < inner_size; k++) {
const int *inner_src = outer_src + k;
int *inner_dst = outer_dst + k;
int tmp = 1;
for (i = 0; i < axis_size; i++) {
if (isMulOverflow(tmp, inner_src[i * inner_size])) {
return NNACL_ERRCODE_MUL_OVERFLOW;
}
tmp *= inner_src[i * inner_size];
}
*inner_dst = tmp;
}
}
return NNACL_OK;
}
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {

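IntReduceProd mirrors the existing float ReduceProd: the input is viewed as [outer_size, axis_size, inner_size], the product is taken over the middle axis, and threads split the outer dimension by stepping from tid in strides of thread_num, with each partial product checked by isMulOverflow before it is applied. A minimal single-threaded sketch (not part of the patch, assuming the nnacl headers are on the include path) that reduces a 2x3x2 int tensor over its middle axis:

#include <cstdio>
#include "nnacl/fp32/reduce.h"
#include "nnacl/errorcode.h"

int main() {
  const int outer_size = 2, inner_size = 2, axis_size = 3;
  // Layout is [outer, axis, inner]: outer block 0 holds {1,2},{3,4},{5,6}.
  const int src[12] = {1, 2, 3, 4, 5, 6,
                       2, 2, 2, 2, 2, 2};
  int dst[4] = {0};
  // tid 0 of 1 thread handles every outer block.
  int ret = IntReduceProd(outer_size, inner_size, axis_size, src, dst, 0, 1);
  if (ret != NNACL_OK) {
    printf("reduce failed with error %d\n", ret);
    return ret;
  }
  // Expect {1*3*5, 2*4*6, 2*2*2, 2*2*2} = {15, 48, 8, 8}.
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
  return 0;
}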
@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
const int tid, const int thread_num);
int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num);
int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num);
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
#ifdef __cplusplus

@@ -18,16 +18,7 @@
#include "nnacl/int8/reduce_int8.h"
#include "nnacl/errorcode.h"
#include "nnacl/quantization/fixed_point.h"
inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}
inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}
#include "nnacl/common_func.h"
// Get x such that (x-zp_in) * scale_in = mean
// Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
(tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
quant->in_out_multiplier_),
quant->in_out_right_shift_ + base_offset);
quant->in_out_right_shift_ + base_offset);
if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
return NNACL_ERRCODE_ADD_OVERFLOW;
}

@@ -53,6 +53,7 @@
typedef enum LiteDataType {
kDataTypeFloat,
kDataTypeInt,
kDataTypeInt8,
} LiteDataType;

@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)

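The registration block now uses kNumberTypeInt32 in place of kNumberTypeInt and keeps routing integer Reduce through CpuReduceFp32KernelCreator, so the same ReduceCPUKernel serves both float32 and int32 inputs and picks its reducer internally. A purely illustrative sketch of that registry idea (not MindSpore's actual REG_KERNEL machinery; the map and the creator return values are made up):

#include <cstdio>
#include <map>
#include <utility>

enum TypeId { kNumberTypeFloat32, kNumberTypeInt32, kNumberTypeInt8 };
enum PrimitiveType { PrimitiveType_Reduce, PrimitiveType_Mean };
typedef const char *(*KernelCreator)();

static const char *CpuReduceFp32KernelCreator() { return "ReduceCPUKernel"; }
static const char *CpuReduceInt8KernelCreator() { return "ReduceInt8CPUKernel"; }

int main() {
  // Creators keyed by (tensor data type, primitive type).
  std::map<std::pair<TypeId, PrimitiveType>, KernelCreator> registry;
  registry[{kNumberTypeFloat32, PrimitiveType_Reduce}] = CpuReduceFp32KernelCreator;
  registry[{kNumberTypeInt32, PrimitiveType_Reduce}] = CpuReduceFp32KernelCreator;  // int32 reuses the fp32 kernel
  registry[{kNumberTypeInt8, PrimitiveType_Reduce}] = CpuReduceInt8KernelCreator;
  printf("%s\n", registry[{kNumberTypeInt32, PrimitiveType_Reduce}]());
  return 0;
}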
@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
}
case static_cast<int>(ReduceMode_ReduceProd): {
reducer_ = ReduceProd;
int_reducer_ = IntReduceProd;
break;
}
case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
return ReSize();
}
int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
int ReduceCPUKernel::ReSize() {
if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
data_type_ = kDataTypeFloat;
} else {
data_type_ = kDataTypeInt;
}
return ReduceBaseCPUKernel::ReSize();
}
int ReduceCPUKernel::CallReduceUnit(int task_id) {
auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
int ret;
if (data_type_ == kDataTypeFloat) {
ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
static_cast<float *>(dst_data_), task_id, context_->thread_num_);
} else {
ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
static_cast<int *>(dst_data_), task_id, context_->thread_num_);
}
return ret;
}
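CallReduceUnit now keeps src_data_ and dst_data_ as untyped pointers and casts them to whichever signature matches data_type_, so one code path drives both the float and the int reducer. A minimal standalone sketch of that dispatch pattern (not the actual kernel; the reducer signatures are simplified for illustration):

#include <cstdio>

enum LiteDataType { kDataTypeFloat, kDataTypeInt };

typedef int (*Reducer)(const float *src, float *dst, int n);
typedef int (*IntReducer)(const int *src, int *dst, int n);

static int FloatProd(const float *src, float *dst, int n) {
  float p = 1.0f;
  for (int i = 0; i < n; ++i) p *= src[i];
  *dst = p;
  return 0;
}

static int IntProd(const int *src, int *dst, int n) {
  int p = 1;
  for (int i = 0; i < n; ++i) p *= src[i];
  *dst = p;
  return 0;
}

// Mirrors the idea of CallReduceUnit: cast the untyped buffers to the
// element type the kernel decided on at ReSize time.
static int CallReduceUnit(LiteDataType t, Reducer f, IntReducer g,
                          const void *src, void *dst, int n) {
  if (t == kDataTypeFloat) {
    return f(static_cast<const float *>(src), static_cast<float *>(dst), n);
  }
  return g(static_cast<const int *>(src), static_cast<int *>(dst), n);
}

int main() {
  const int src[4] = {1, 2, 3, 4};
  int dst = 0;
  CallReduceUnit(kDataTypeInt, FloatProd, IntProd, src, &dst, 4);
  printf("%d\n", dst);  // 24
  return 0;
}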
@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
return ret;
}
src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
src_data_ = in_tensors_.at(0)->MutableData();
for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
if (i != static_cast<size_t>(num_axes_ - 1)) {
dst_data_ = data_buffers_[i];
} else {
dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
dst_data_ = out_tensors_.at(0)->MutableData();
}
outer_size_ = outer_sizes_[i];
inner_size_ = inner_sizes_[i];
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
int ReduceCPUKernel::MallocTmpBuffer() {
data_buffers_.clear();
for (auto size : buffer_sizes_) {
float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
void *buffer;
if (data_type_ == kDataTypeFloat) {
buffer = context_->allocator->Malloc(size * sizeof(float));
} else {
buffer = context_->allocator->Malloc(size * sizeof(int));
}
if (buffer == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return RET_ERROR;
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
}
void ReduceCPUKernel::FreeTmpBuffer() {
for (size_t i = 0; i < data_buffers_.size(); i++) {
float *buffer = data_buffers_[i];
for (auto buffer : data_buffers_) {
if (buffer != nullptr) {
context_->allocator->Free(buffer);
buffer = nullptr;

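MallocTmpBuffer and FreeTmpBuffer now manage untyped (void *) intermediate buffers sized by the active element type, so float and int reductions share the same buffer plumbing. A minimal standalone sketch of that sizing logic (not part of the patch; malloc/free stand in for context_->allocator and the buffer sizes are made up):

#include <cstdlib>
#include <vector>

enum LiteDataType { kDataTypeFloat, kDataTypeInt, kDataTypeInt8 };

int main() {
  LiteDataType data_type = kDataTypeInt;
  std::vector<size_t> buffer_sizes = {24, 6};  // elements needed per intermediate reduce step
  std::vector<void *> data_buffers;
  for (auto size : buffer_sizes) {
    // Element size follows the data type chosen at ReSize time.
    size_t elem = (data_type == kDataTypeFloat) ? sizeof(float) : sizeof(int);
    void *buffer = std::malloc(size * elem);
    if (buffer == nullptr) {
      return 1;  // "Malloc data failed." in the kernel
    }
    data_buffers.push_back(buffer);
  }
  // ... run the per-axis reduce steps through the buffers ...
  for (auto buffer : data_buffers) {
    std::free(buffer);
  }
  data_buffers.clear();
  return 0;
}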
@@ -29,6 +29,8 @@ namespace mindspore::kernel {
class ReduceCPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
int *dst_data, const int tid, const int thread_num);
public:
ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
const mindspore::lite::PrimitiveC *primitive)
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
~ReduceCPUKernel() {
FreeTmpBuffer();
src_data_ = nullptr;
dst_data_ = nullptr;
reducer_ = nullptr;
int_reducer_ = nullptr;
}
int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
private:
Reducer reducer_ = nullptr;
std::vector<float *> data_buffers_;
const float *src_data_ = nullptr;
float *dst_data_ = nullptr;
IntReducer int_reducer_ = nullptr;
std::vector<void *> data_buffers_;
LiteDataType data_type_;
const void *src_data_ = nullptr;
void *dst_data_ = nullptr;
private:
int MallocTmpBuffer();
