!6088 optimize opencl convolution fp16 kernel

Merge pull request !6088 from 王东旭/opencl_convolution_support_fp16
5 years ago · 37561b1b4c
parent 4d4b19059e 4e87458ddb
commit 37561b1b4c
5 changed files with 299 additions and 236 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@ -310,3 +310,13 @@ __kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data,
  }
  dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
 }
+__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
+                                                int4 shape) {  // copies texels of src_data into dst_data as half4; shape is not read in this kernel
+  int X = get_global_id(0);  // X, Y, Z are this work-item's global ids in dimensions 0, 1 and 2
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {  // work-items outside size.x/y/z write nothing
+    return;
+  }  // below: image coordinate (Y * size.z + Z, X) is stored at linear index (X * size.y + Y) * size.z + Z
+  dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@ -42,27 +42,35 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
 private:
  bool use_fp16_ = false;

-  int CI;
-  int IH;
-  int IW;
-  int CO;
-  int OH;
-  int OW;
-  int CI_SLICES;
-  int CO_SLICES;
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
  void *packed_weight_ = nullptr;
  void *packed_bias_ = nullptr;

  bool use_winograd_ = false;
-  int TILES_X;
-  int TILES_Y;
-  int TILES_XY;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
  void *winograd_mem0_ = nullptr;
  void *winograd_mem1_ = nullptr;

-  cl::Kernel kernel_4x4to36;
-  cl::Kernel kernel_conv;
-  cl::Kernel kernel_36to4x4;
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+
+  int InitWeight();
+  int InitBias();
+  int RearrangeWinogradWeight();
+  template <typename SRC_T, typename DST_T>
+  int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);

  std::string CodeGenConvolutionNHWC4();
  std::string CodeGenConvolutionNC4HW4();
@ -72,16 +80,18 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  std::string CodeGenWinograd36To4x4();
  int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);

+  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }  // sizeof(float16_t) when use_fp16_ is set, otherwise sizeof(float)
+
  bool UseWinograd4x4To6x6() {  // returns true only when all three checks computed below hold
    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
                            param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;  // 3x3 kernel, dilation 1, stride 1
-    const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
-    const bool hw_good = TILES_X * TILES_Y >= 16;
+    const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;  // both slice counts must be at least 12
+    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;  // product of the two tile counts must be at least 16
    return attr_valid && channel_good && hw_good;
  }

-  std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
+  static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
    std::vector<float> C(M * K);
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < K; ++j) {
--- a/mindspore/lite/test/run_test.sh
+++ b/mindspore/lite/test/run_test.sh
@ -28,3 +28,6 @@ cp -fr $TEST_DATA_DIR/testPK ./data
 ./lite-test --gtest_filter=TestDeconvInt8.*

 ./lite-test --gtest_filter="TestTfliteParser*"
+
+# for GPU OpenCL
+./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*"
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@ -21,19 +21,18 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
 #include "nnacl/pack.h"
-#include "nnacl/fp32/common_func.h"

 using mindspore::kernel::ConvolutionOpenCLKernel;
 using mindspore::kernel::LiteKernel;
 using mindspore::kernel::SubGraphOpenCLKernel;
 using mindspore::lite::Tensor;
 using mindspore::schema::Format;
-using mindspore::schema::Format_KHWC;
-using mindspore::schema::Format_NC4HW4;
-using mindspore::schema::Format_NCHW;
-using mindspore::schema::Format_NHWC;
-using mindspore::schema::Format_NHWC4;
 using mindspore::schema::NodeType_ValueNode;
+using mindspore::schema::Format::Format_KHWC;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;

 namespace mindspore {

@ -41,26 +40,25 @@ class TestConvolutionOpenCL : public mindspore::CommonTest {};

 void LoadData(Tensor *tensor, const float *src) {  // fills the tensor's buffer from the float array src
  if (tensor->data_type() == kNumberTypeFloat16) {
-    auto num = tensor->Size() / 2;
-    auto tensor_data = reinterpret_cast<uint16_t *>(tensor->MutableData());
+    auto num = tensor->Size() / sizeof(float16_t);  // element count = Size() divided by the fp16 element size
+    auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
    for (int i = 0; i < num; ++i) {
-      tensor_data[i] = Float32ToShort(src[i]);
+      tensor_data[i] = static_cast<float16_t>(src[i]);  // per-element float -> float16_t conversion
    }
  } else {
-    memcpy(tensor->MutableData(), src, tensor->Size());
+    memcpy(tensor->data_c(), src, tensor->Size());  // any other data type: raw copy of Size() bytes from src
  }
 }

 void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
-  auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
+  auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);  // 2 bytes per fp16 element, 4 otherwise
  std::vector<float> output_data(num);
  if (output->data_type() == kNumberTypeFloat16) {
-    auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->MutableData());
    for (int i = 0; i < output_data.size(); ++i) {
-      output_data[i] = ShortToFloat32((output_data_fp16[i]));
+      output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);  // widen fp16 to float for comparison
    }
  } else {
-    memcpy(output_data.data(), output->MutableData(), output->Size());
+    memcpy(output_data.data(), output->data_c(), output->Size());
  }

  printf("output:");
@ -69,9 +67,9 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
  }
  printf("\n");

-  float max_err = 0.0f;
+  float max_err = -1.0f;  // below any fabs() result, so the first compared element always becomes the maximum
  std::array<int, 5> idx_5d{};
-  int idx = -1;
+  int max_err_idx = -1, first_err_idx = -1;  // -1 means "not found yet"
  auto SLICES = UP_DIV(output->Channel(), 4);
  int I = 1, J = 1, K = 1, L = 1, M = 1;
  switch (output->GetFormat()) {
@ -98,10 +96,13 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
        for (int l = 0; l < L; ++l) {
          for (int m = 0; m < M; ++m) {
            auto err = std::fabs(output_data[cn] - expect_data[cn]);
+            if (first_err_idx == -1 && err > atol) {  // test this element's error, not the running max, so the index is the first failing element
+              first_err_idx = cn;
+            }
            if (err > max_err) {
              max_err = err;
              idx_5d = {i, j, k, l, m};
-              idx = cn;
+              max_err_idx = cn;
            }
            cn++;
          }
@ -110,18 +111,19 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
    }
  }

-  float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
+  if (max_err > atol) {  // some element exceeded atol, so first_err_idx was set inside the loop
+    printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
+           output_data[first_err_idx]);
+    FAIL();
+  } else {
+    float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
    if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
      printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
    } else {
      printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
    }
-  printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
-         max_err, relative_err * 100);
-
-  if (max_err > atol) {
-    FAIL();
-  } else {
+    printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
+           output_data[max_err_idx], max_err, relative_err * 100);
    printf("COMPARE SUCCESS!\n\n");
  }
 }