!7251 [MS][LITE][GPU] arithmetic support broadcast

Merge pull request !7251 from chenzupeng/master-lite
pull/7251/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 832b70961a

File diff suppressed because it is too large Load Diff

@ -42,7 +42,10 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
cl::Kernel kernel_;
bool element_flag_{true};
void *weight_ptr_{nullptr};
float activation_min_{-FLT_MAX};
float activation_max_{FLT_MAX};
std::vector<std::vector<int>> inputs_nhwc_shapes_;
std::vector<void *> inputs_weight_ptrs_;
std::vector<size_t> local_size_;
std::vector<size_t> global_size_;

@ -54,6 +54,12 @@ std::vector<size_t> ScaleOpenCLKernel::InitGlobalSize() const {
void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
local_size_ = {16, 16};
if (out_tensors_[0]->shape().size() == 2) {
size_t H = out_tensors_[0]->shape()[0];
size_t W = UP_DIV(out_tensors_[0]->shape()[1], C4NUM);
global_size_ = {W, H};
return;
}
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t W = out_tensors_[0]->Width();
@ -78,18 +84,23 @@ void ScaleOpenCLKernel::BufferGetWorkGroupSize() {
int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y;
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
im_dst_x = out_tensors_[0]->Width();
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
im_dst_y = out_tensors_[0]->Batch();
im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
if (out_tensors_[0]->shape().size() == 2) {
im_dst_x = UP_DIV(out_tensors_[0]->shape()[1], C4NUM);
im_dst_y = out_tensors_[0]->shape()[0];
} else {
MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
return RET_ERROR;
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
im_dst_x = out_tensors_[0]->Width();
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
im_dst_y = out_tensors_[0]->Batch();
im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
} else {
MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
return RET_ERROR;
}
}
size_t img_dtype = CL_FLOAT;
@ -114,7 +125,7 @@ int ScaleOpenCLKernel::InitBuffer() {
auto allocator = ocl_runtime_->GetAllocator();
std::vector<size_t> img_size;
GetImageSize(0, &img_size);
if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) {
if (scale_C_flag_) {
img_size[1] = 1;
img_size[0] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM);
scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size);
@ -256,8 +267,10 @@ int ScaleOpenCLKernel::Init() {
if (scale_tensor->ElementsNum() == 1) {
element_flag_ = false;
kernel_name = "BoardcastScale";
} else if (axis_ == 3 && scale_shape.size() == 1) {
} else if (((in_shape.size() == 4 && axis_ == 3) || (in_shape.size() == 2 && axis_ == 1)) &&
scale_shape.size() == 1) {
element_flag_ = true;
scale_C_flag_ = true;
kernel_name = "Scale_C";
}
} else {
@ -327,24 +340,9 @@ int ScaleOpenCLKernel::Run() {
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
int H = 0;
int W = 0;
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
W = out_tensors_[0]->Width();
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
H = out_tensors_[0]->Batch();
W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
} else {
MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat();
return RET_ERROR;
}
cl_int2 output_shape{W, H};
cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
if (element_flag_ && axis_ == 3) {
if (element_flag_ && scale_C_flag_) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
}
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);

@ -42,6 +42,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
cl::Kernel kernel_;
bool element_flag_{true};
bool scale_C_flag_{false};
void *scale_ptr_{nullptr};
void *offset_ptr_{nullptr};
int axis_{0};

@ -27,6 +27,8 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Nchw2Nhwc;
using mindspore::schema::PrimitiveType_Nhwc2Nchw;
using mindspore::schema::PrimitiveType_Transpose;
namespace mindspore::kernel {
@ -141,4 +143,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::Tensor
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator)
} // namespace mindspore::kernel

@ -276,4 +276,41 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
}
allocator->UnmapBuffer(origin_data);
}
// Expand a tensor shape of rank 1..4 into an explicit NHWC quadruple
// {N, H, W, C}, with missing dimensions defaulting to 1.
// Rank mapping: 1 -> {1,1,1,C}; 2 -> {N,1,1,C}; 3 -> {N,H,1,C} (no width
// axis); 4 -> {N,H,W,C}. Any other rank yields {1,1,1,1}.
std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape) {
  std::vector<int> nhwc{1, 1, 1, 1};
  switch (tensor_shape.size()) {
    case 1:
      nhwc[3] = tensor_shape[0];
      break;
    case 2:
      nhwc[0] = tensor_shape[0];
      nhwc[3] = tensor_shape[1];
      break;
    case 3:
      nhwc[0] = tensor_shape[0];
      nhwc[1] = tensor_shape[1];
      nhwc[3] = tensor_shape[2];
      break;
    case 4:
      // Already NHWC; copy through unchanged.
      nhwc = tensor_shape;
      break;
    default:
      // Rank 0 or >4: fall back to the all-ones shape, matching the
      // original behavior of leaving every dimension at 1.
      break;
  }
  return nhwc;
}
std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape, schema::Format format) {
if (tensor_shape.size() != 4) {
return {1, 1};
}
size_t image_x, image_y;
image_x = image_y = 1;
if (format == schema::Format_NHWC4) {
image_x = tensor_shape[2] * UP_DIV(tensor_shape[3], C4NUM);
image_y = tensor_shape[0] * tensor_shape[1];
} else if (format == schema::Format_NC4HW4) {
image_x = tensor_shape[2];
image_y = tensor_shape[0] * tensor_shape[1] * UP_DIV(tensor_shape[3], C4NUM);
}
return {image_x, image_y};
}
} // namespace mindspore::kernel

@ -48,6 +48,10 @@ void Write2File(void *mem, const std::string &file_name, int size);
void PrintTensor(lite::Tensor *tensor, int num = 10, const std::string &out_file = "");
std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape);
std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape, schema::Format format);
template <class T1, class T2>
void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
int c4 = UP_DIV(channel, C4NUM);

@ -37,8 +37,7 @@ cp -fr $TEST_DATA_DIR/testPK ./data
./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*"
./lite-test --gtest_filter="TestAvgPoolingOpenCL*"
./lite-test --gtest_filter="TestConv2dTransposeOpenCL*"
./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*"
./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*"
./lite-test --gtest_filter="TestMatMulOpenCL*"
./lite-test --gtest_filter="TestMaxPoolingOpenCL*"
./lite-test --gtest_filter="TestReduceOpenCL*"
./lite-test --gtest_filter="TestReshapeOpenCL*"
@ -46,3 +45,5 @@ cp -fr $TEST_DATA_DIR/testPK ./data
./lite-test --gtest_filter="TestTransposeOpenCL*"
./lite-test --gtest_filter="TestArithmeticOpenCL*"
./lite-test --gtest_filter="TestScaleOpenCL*"
./lite-test --gtest_filter="TestFullConnectionOpenCL*"
./lite-test --gtest_filter="TestResizeOpenCL*"

Loading…
Cancel
Save