Add crop fp16 ops

5 years ago · 900dfe5cba
parent 8f55187492
commit 900dfe5cba
7 changed files with 431 additions and 61 deletions
--- a/mindspore/lite/nnacl/fp16/crop_fp16.c
+++ b/mindspore/lite/nnacl/fp16/crop_fp16.c
@ -0,0 +1,153 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp16/crop_fp16.h"
+
+#include <string.h>
+
+#include "nnacl/crop_parameter.h"
+
+void Crop(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+  int input_dim = para->input_dim_;
+  switch (input_dim) {
+    case 1:
+      Crop1D(input, output, task_id, para);
+      break;
+    case 2:
+      Crop2D(input, output, task_id, para);
+      break;
+    case 3:
+      Crop3D(input, output, task_id, para);
+      break;
+    case 4:
+      Crop4D(input, output, task_id, para);
+      break;
+  }
+}
+
+void Crop1D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+  const int out_batch = para->out_shape_[0];
+  const int thread_count = para->thread_count_;
+  int64_t task_id_stride = thread_count > 1 ? UP_DIV(out_batch, thread_count) : out_batch;
+  if (task_id_stride <= 0) {
+    return;
+  }
+  int n = task_id * task_id_stride;
+  if (n >= out_batch) {
+    return;
+  }
+  const float16_t *in_ptr = input + n + para->in_offset_[0];
+  float16_t *out_ptr = output + n;
+  int64_t out_dist_stride = MSMIN(out_batch - task_id * task_id_stride, task_id_stride);
+  memcpy(out_ptr, in_ptr, sizeof(float16_t) * out_dist_stride);
+}
+
+void Crop2D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+  const int in_height = para->in_shape_[1];
+  const int out_batch = para->out_shape_[0];
+  const int out_height = para->out_shape_[1];
+  const int thread_count = para->thread_count_;
+  int64_t task_id_stride = thread_count > 1 ? UP_DIV(out_height, thread_count) : out_height;
+  if (task_id_stride <= 0) {
+    return;
+  }
+
+  for (int n = 0; n < out_batch; n++) {
+    int h = task_id * task_id_stride;
+    if (h >= out_height) {
+      return;
+    }
+    const float16_t *in_ptr = input + (n + para->in_offset_[0]) * in_height + h + para->in_offset_[1];
+    float16_t *out_ptr = output + n * out_height + h;
+    int64_t out_dist_stride = MSMIN(out_height - task_id * task_id_stride, task_id_stride);
+    memcpy(out_ptr, in_ptr, sizeof(float16_t) * out_dist_stride);
+  }
+}
+
+void Crop3D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+  const int in_height = para->in_shape_[1];
+  const int in_width = para->in_shape_[2];
+
+  const int out_batch = para->out_shape_[0];
+  const int out_height = para->out_shape_[1];
+  const int out_width = para->out_shape_[2];
+
+  const int thread_count = para->thread_count_;
+  int64_t task_id_stride = thread_count > 1 ? UP_DIV(out_height, thread_count) : out_height;
+  if (task_id_stride <= 0) {
+    return;
+  }
+
+  const int in_stride_h = in_width;
+  const int in_stride_n = in_stride_h * in_height;
+
+  const int out_stride_h = out_width;
+  const int out_stride_n = out_stride_h * out_height;
+
+  for (int n = 0; n < out_batch; n++) {
+    for (int t = 0; t < task_id_stride; t++) {
+      int h = t + task_id * task_id_stride;
+      if (h >= out_height) {
+        break;
+      }
+      const float16_t *in_ptr =
+        input + (n + para->in_offset_[0]) * in_stride_n + (h + para->in_offset_[1]) * in_stride_h + para->in_offset_[2];
+      float16_t *out_ptr = output + n * out_stride_n + h * out_stride_h;
+      memcpy(out_ptr, in_ptr, sizeof(float16_t) * out_width);
+    }
+  }
+}
+
+void Crop4D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+  const int in_height = para->in_shape_[1];
+  const int in_width = para->in_shape_[2];
+  const int in_channel = para->in_shape_[3];
+
+  const int out_batch = para->out_shape_[0];
+  const int out_height = para->out_shape_[1];
+  const int out_width = para->out_shape_[2];
+  const int out_channel = para->out_shape_[3];
+
+  const int thread_count = para->thread_count_;
+  int64_t task_id_stride = thread_count > 1 ? UP_DIV(out_height, thread_count) : out_height;
+  if (task_id_stride <= 0) {
+    return;
+  }
+
+  const int in_stride_w = in_channel;
+  const int in_stride_h = in_channel * in_width;
+  const int in_stride_n = in_stride_h * in_height;
+
+  const int out_stride_w = out_channel;
+  const int out_stride_h = out_channel * out_width;
+  const int out_stride_n = out_stride_h * out_height;
+
+  for (int n = 0; n < out_batch; n++) {
+    for (int t = 0; t < task_id_stride; t++) {
+      int h = t + task_id * task_id_stride;
+      if (h >= out_height) {
+        break;
+      }
+      for (int w = 0; w < out_width; w++) {
+        const float16_t *in_ptr = input + (n + para->in_offset_[0]) * in_stride_n +
+                                  (h + para->in_offset_[1]) * in_stride_h + (w + para->in_offset_[2]) * in_stride_w +
+                                  para->in_offset_[3];
+        float16_t *out_ptr = output + n * out_stride_n + h * out_stride_h + w * out_stride_w;
+        memcpy(out_ptr, in_ptr, sizeof(float16_t) * out_channel);
+      }
+    }
+  }
+}
--- a/mindspore/lite/nnacl/fp16/crop_fp16.h
+++ b/mindspore/lite/nnacl/fp16/crop_fp16.h
@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_NNACL_FP16_CROP_FP16_H_
+#define MINDSPORE_LITE_NNACL_FP16_CROP_FP16_H_
+
+#include <arm_neon.h>
+#include "nnacl/op_base.h"
+#include "nnacl/crop_parameter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void Crop(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+void Crop1D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+void Crop2D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+void Crop3D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+void Crop4D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP16_CROP_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc
@ -54,6 +54,58 @@ kernel::LiteKernel *CpuCropInt8KernelCreator(const std::vector<lite::Tensor *> &
  return kernel;
 }

+int CropBaseCPUKernel::ReSize() {
+  auto *input_tensor = in_tensors_.at(kInputIndex);
+  auto input_shape = input_tensor->shape();
+  size_t input_dim = input_shape.size();
+
+  crop_para_->in_shape_ = reinterpret_cast<int *>(malloc(input_dim * sizeof(int)));
+  if (crop_para_->in_shape_ == nullptr) {
+    MS_LOG(ERROR) << "in_shape_ is nullptr";
+    return RET_ERROR;
+  } else {
+    memcpy(reinterpret_cast<void *>(const_cast<int *>(crop_para_->in_shape_)), input_shape.data(),
+           sizeof(int) * input_dim);
+  }
+
+  auto *out_tensor = out_tensors_.at(kOutputIndex);
+  auto output_shape = out_tensor->shape();
+  size_t output_dim = output_shape.size();
+
+  crop_para_->out_shape_ = reinterpret_cast<int *>(malloc(output_dim * sizeof(int)));
+  if (crop_para_->out_shape_ == nullptr) {
+    MS_LOG(ERROR) << "out_shape_ is nullptr";
+    return RET_ERROR;
+  } else {
+    memcpy(reinterpret_cast<void *>(const_cast<int *>(crop_para_->out_shape_)), output_shape.data(),
+           sizeof(int) * output_dim);
+  }
+  MS_ASSERT(input_dim <= CROP_OFFSET_MAX_SIZE);
+  crop_para_->input_dim_ = input_dim;
+  PadOffset(input_dim, crop_para_);
+  return RET_OK;
+}
+
+void CropBaseCPUKernel::PadOffset(int input_dim, CropParameter *crop_para) {
+  auto axis = crop_para->axis_;
+  auto offsets_size = crop_para->offset_size_;
+  MS_ASSERT(axis <= input_dim);
+  if (offsets_size > 1) {
+    MS_ASSERT(axis + offsets_size == input_dim);
+  }
+  for (int i = 0; i < input_dim; i++) {
+    int crop_offset = 0;
+    if (i >= axis) {
+      if (offsets_size == 1) {
+        crop_offset = crop_para->offset_[0];
+      } else if (offsets_size > 1) {
+        crop_offset = crop_para->offset_[i - axis];
+      }
+    }
+    crop_para->in_offset_[i] = crop_offset;
+  }
+}
+
 kernel::LiteKernel *CpuCropInt32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const InnerContext *ctx, const kernel::KernelKey &desc,
--- a/mindspore/lite/src/runtime/kernel/arm/base/crop_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/crop_base.h
@ -29,15 +29,20 @@ class CropBaseCPUKernel : public LiteKernel {
  CropBaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                    const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
                    const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {}
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive), thread_count_(ctx->thread_num_) {
+    crop_para_ = reinterpret_cast<CropParameter *>(op_parameter_);
+    crop_para_->thread_count_ = op_parameter_->thread_num_;
+  }
  ~CropBaseCPUKernel() = default;

  int Init() override;
-  int ReSize() override { return 0; }
+  int ReSize() override;
  int Run() override { return 0; }

 protected:
+  CropParameter *crop_para_;
  int thread_count_;
+  void PadOffset(int input_dim, CropParameter *crop_para);
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
@ -0,0 +1,130 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp16/crop_fp16.h"
+
+#include "include/errorcode.h"
+#include "nnacl/crop_parameter.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "nnacl/fp16/crop_fp16.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/base/crop_base.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Crop;
+
+namespace mindspore::kernel {
+
+int CropFp16CPUKernel::Init() {
+  auto ret = CropBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int CropFp16CPUKernel::ReSize() { return CropBaseCPUKernel::ReSize(); }
+
+int CropFp16CPUKernel::DoExecute(int task_id) {
+  Crop(input_ptr_, output_ptr_, task_id, crop_para_);
+  return RET_OK;
+}
+
+static int CropRun(void *cdata, int task_id) {
+  auto g_kernel = reinterpret_cast<CropFp16CPUKernel *>(cdata);
+  auto ret = g_kernel->DoExecute(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "CropRun error task_id[" << task_id << "] error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int CropFp16CPUKernel::Run() {
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed.";
+    return RET_ERROR;
+  }
+  input_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_);
+  if (input_ptr_ == nullptr) {
+    MS_LOG(ERROR) << "input or output is nullptr";
+    return RET_ERROR;
+  }
+
+  output_ptr_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_);
+  if (output_ptr_ == nullptr) {
+    FreeInputAndOutput();
+    MS_LOG(ERROR) << "input or output is nullptr";
+    return RET_ERROR;
+  }
+
+  ret = ParallelLaunch(this->context_->thread_pool_, CropRun, this, thread_count_);
+
+  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
+    Float16ToFloat32(output_ptr_, reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData()),
+                     out_tensors_.at(kOutputIndex)->ElementsNum());
+  }
+  FreeInputAndOutput();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Crop error error_code[" << ret << "]";
+  }
+  return ret;
+}
+
+void CropFp16CPUKernel::FreeInputAndOutput() {
+  if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) {
+    context_->allocator->Free(input_ptr_);
+    input_ptr_ = nullptr;
+  }
+  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
+    context_->allocator->Free(output_ptr_);
+    output_ptr_ = nullptr;
+  }
+}
+
+kernel::LiteKernel *CpuCropFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                             const InnerContext *ctx, const kernel::KernelKey &desc,
+                                             const mindspore::lite::PrimitiveC *primitive) {
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Input opParameter is nullptr!";
+    return nullptr;
+  }
+  MS_ASSERT(desc.type == schema::PrimitiveType_Crop);
+  auto *kernel = new (std::nothrow) CropFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "new CropFp16CPUKernel fail!";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    return nullptr;
+  }
+  return kernel;
+}
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Crop, CpuCropFp16KernelCreator)
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h
@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CROP_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CROP_H_
+
+#include <arm_neon.h>
+
+#include <vector>
+
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/base/crop_base.h"
+
+namespace mindspore::kernel {
+class CropFp16CPUKernel : public CropBaseCPUKernel {
+ public:
+  CropFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                    const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                    const mindspore::lite::PrimitiveC *primitive)
+      : CropBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {
+    crop_para_ = reinterpret_cast<CropParameter *>(op_parameter_);
+    crop_para_->thread_count_ = op_parameter_->thread_num_;
+  }
+  ~CropFp16CPUKernel() override = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int DoExecute(int task_id);
+
+ private:
+  float16_t *input_ptr_ = nullptr;
+  float16_t *output_ptr_ = nullptr;
+  CropParameter *crop_para_;
+  void FreeInputAndOutput();
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CROP_H_
--- a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc
@ -44,16 +44,6 @@ int CropInt8CPUKernel::Init() {

  crop_para_->quant_arg.output_activation_max_ = std::numeric_limits<int8_t>::max();
  crop_para_->quant_arg.output_activation_min_ = std::numeric_limits<int8_t>::min();
-  crop_para_->in_shape_ = reinterpret_cast<int *>(malloc(input_tensor->shape().size() * sizeof(int)));
-  if (crop_para_->in_shape_ == nullptr) {
-    MS_LOG(ERROR) << "malloc memory failed";
-    return RET_MEMORY_FAILED;
-  }
-  crop_para_->out_shape_ = reinterpret_cast<int *>(malloc(out_tensor->shape().size() * sizeof(int)));
-  if (crop_para_->out_shape_ == nullptr) {
-    MS_LOG(ERROR) << "malloc memory failed";
-    return RET_MEMORY_FAILED;
-  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
@ -72,35 +62,7 @@ CropInt8CPUKernel::~CropInt8CPUKernel() {
  }
 }

-int CropInt8CPUKernel::ReSize() {
-  auto *input_tensor = in_tensors_.at(kInputIndex);
-  auto input_shape = input_tensor->shape();
-  size_t input_dim = input_shape.size();
-
-  if (crop_para_->in_shape_ == nullptr) {
-    MS_LOG(ERROR) << "in_shape_ is nullptr";
-    return RET_ERROR;
-  } else {
-    memcpy(reinterpret_cast<void *>(const_cast<int *>(crop_para_->in_shape_)), input_shape.data(),
-           sizeof(int) * input_dim);
-  }
-
-  auto *out_tensor = out_tensors_.at(kOutputIndex);
-  auto output_shape = out_tensor->shape();
-  size_t output_dim = output_shape.size();
-
-  if (crop_para_->out_shape_ == nullptr) {
-    MS_LOG(ERROR) << "out_shape_ is nullptr";
-    return RET_ERROR;
-  } else {
-    memcpy(reinterpret_cast<void *>(const_cast<int *>(crop_para_->out_shape_)), output_shape.data(),
-           sizeof(int) * output_dim);
-  }
-  MS_ASSERT(input_dim <= CROP_OFFSET_MAX_SIZE);
-  crop_para_->input_dim_ = input_dim;
-  PadOffset(input_dim, crop_para_);
-  return RET_OK;
-}
+int CropInt8CPUKernel::ReSize() { return CropBaseCPUKernel::ReSize(); }

 int CropInt8CPUKernel::Run() {
  auto ret = Prepare();
@ -112,26 +74,6 @@ int CropInt8CPUKernel::Run() {
  return ret;
 }

-void PadOffset(int input_dim, CropParameter *crop_para) {
-  auto axis = crop_para->axis_;
-  auto offsets_size = crop_para->offset_size_;
-  MS_ASSERT(axis <= input_dim);
-  if (offsets_size > 1) {
-    MS_ASSERT(axis + offsets_size == input_dim);
-  }
-  for (int i = 0; i < input_dim; i++) {
-    int crop_offset = 0;
-    if (i >= axis) {
-      if (offsets_size == 1) {
-        crop_offset = crop_para->offset_[0];
-      } else if (offsets_size > 1) {
-        crop_offset = crop_para->offset_[i - axis];
-      }
-    }
-    crop_para->in_offset_[i] = crop_offset;
-  }
-}
-
 int CropInt8Run(void *cdata, int task_id) {
  auto crop = reinterpret_cast<CropInt8CPUKernel *>(cdata);
  crop->DoExecute(task_id);