opencl_conv_optimize_performance

pull/8153/head
wangdongxu 4 years ago
parent 9ae5f96988
commit 351c21eaf2

File diff suppressed because it is too large Load Diff

@ -0,0 +1,187 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define ActType_Relu 1
#define ActType_Relu6 3
// B^T: the 6x6 input-transform matrix used by Winograd4x4To36 (Winograd
// F(4x4, 3x3)), stored row-major.  The entries are float32 roundings of the
// analytic transform constants (e.g. 0.9428091 ~ 2*sqrt(2)/3,
// 0.47140449 ~ sqrt(2)/3); the tiny residues such as -0.0000001192 are
// artifacts of that rounding and are kept verbatim for bit-consistency.
constant FLT Bt[36] = {
1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,
0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,
0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,
0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,
0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,
0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,
};
// Winograd F(4x4, 3x3) input transform.  For each 4x4 output tile, reads the
// corresponding 6x6 input patch d (the tile plus a 1-pixel halo, PAD = 1) and
// computes one row of B^T*d, then multiplies by B to emit that row of B^T*d*B.
// Work-item layout: gid0 = tile index (row-major over the H/4 x W/4 tile grid),
// gid1 = transform row in [0, 6), gid2 = input-channel slice.
// input  image layout (from the read below):  x = iw*CI_SLICES + slice, y = ih
// output image layout (from the write below): x = tile_xy, y = slice*36 + row*6 + col
__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, // N H W CI_SLICES
const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES
#define PAD 1
int tile_xy = get_global_id(0);
int row = get_global_id(1);
int slice = get_global_id(2);
int TILE_XY = output_shape.z;
int SLICES = input_shape.w;
// Guard: the launch grid may be rounded up past the real extents.
if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) {
return;
}
int IH = input_shape.y, IW = input_shape.z;
int TILE_X = UP_DIV(IW, 4);
int tile_x = tile_xy % TILE_X;
int tile_y = tile_xy / TILE_X;
// Bt_row points at row `row` of B^T; BtD_row accumulates that row of B^T*d.
constant FLT *Bt_row = Bt + row * 6;
FLT4 BtD_row[6] = {0};
// Top-left corner of the 6x6 patch; may be -1 because of the halo.
int ih = tile_y * 4 - PAD;
int iw = tile_x * 4 - PAD;
for (int y = 0; y < 6; y++) {
int x_idx = iw * SLICES + slice;
for (int x = 0; x < 6; x++) {
// No explicit range check on iw is needed: slice is in [0, SLICES), so
// iw < 0 makes x_idx < 0 and iw >= IW makes x_idx >= IW*SLICES; in both
// cases the CLK_ADDRESS_CLAMP sampler returns the border color (zeros
// for RGBA-style formats), which is exactly the zero padding we want.
// if (iw < 0 || iw >= IW) { continue; }
BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih));
x_idx += SLICES;
}
ih++;
}
// Second stage: (B^T*d)*B.  Element (row, y) of the result is
// sum_x BtD_row[x] * B(x, y) = sum_x BtD_row[x] * Bt[y*6 + x].
int y_idx = slice * 36 + row * 6;
for (int y = 0; y < 6; y++) {
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int x = 0; x < 6; x++) {
acc += BtD_row[x] * Bt[y * 6 + x];
}
WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // 36 rows per channel slice
}
#undef PAD
}
// Winograd element-wise multiply stage.  For each of the 36 transform
// positions h, accumulates transformed-input x transformed-weight products
// over all input-channel slices.  Each work-item produces a 2-wide (tile
// dimension) by 2-deep (output-channel-slice dimension) patch of results.
// weight layout (from the pointer arithmetic below): FLT16 blocks indexed as
// [co_slice/2][36][ci_slice][2]; each FLT16 is a 4(ci) x 4(co) matrix stored
// row-major in .s0123 / .s4567 / .s89ab / .scdef.
__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
                                  const int4 input_shape,    // N 36 H/4*W/4 CI_SLICES
                                  const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES
#define H 36
  int w = get_global_id(0) * 2;
  int h = get_global_id(1);
  int co_slice = get_global_id(2) * 2;
  int CI_SLICES = input_shape.w;
  int W = input_shape.z;
  int CO_SLICES = output_shape.w;
  // Guard: the launch grid may be rounded up past the real extents.
  if (h >= H || w >= W || co_slice >= CO_SLICES) {
    return;
  }
  // acc[co][wi]: accumulator for output-channel-slice offset co (0..1) and
  // tile-column offset wi (0..1).
  FLT4 acc[2][2];
  acc[0][0] = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  acc[0][1] = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  acc[1][0] = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  acc[1][1] = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  __global FLT16 *wgt_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
  for (int ci = 0, y_idx = h; ci < CI_SLICES; ++ci, y_idx += 36, wgt_ptr += 2) {
    FLT4 in[2];
    in[0] = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx));
    in[1] = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx));
    for (int co = 0; co < 2; ++co) {
      FLT16 flt = wgt_ptr[co];
      for (int wi = 0; wi < 2; ++wi) {
        // 4x4 mat-vec: each input channel component scales one FLT4 row.
        acc[co][wi] += in[wi].x * flt.s0123;
        acc[co][wi] += in[wi].y * flt.s4567;
        acc[co][wi] += in[wi].z * flt.s89ab;
        acc[co][wi] += in[wi].w * flt.scdef;
      }
    }
  }
  // Write the 2x2 patch; the +1 neighbors may fall past the image extents.
  WRITE_IMAGE(output, (int2)(w + 0, co_slice * H + h), acc[0][0]);
  if (w + 1 < W) {
    WRITE_IMAGE(output, (int2)(w + 1, co_slice * H + h), acc[0][1]);
  }
  if (co_slice + 1 < CO_SLICES) {
    WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), acc[1][0]);
    if (w + 1 < W) {
      WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), acc[1][1]);
    }
  }
#undef H
}
// A^T: the 4x6 output-transform matrix used by Winograd36To4x4 (Winograd
// F(4x4, 3x3)), stored row-major (4 rows x 6 columns).  Entries are float32
// roundings of the analytic constants (e.g. 0.70710677 ~ 1/sqrt(2),
// 1.4142135 ~ sqrt(2), 2.8284271 ~ 2*sqrt(2)).
constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,
0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
// Winograd F(4x4, 3x3) output transform: Y = A^T * M * A, where M is the 6x6
// stack of products produced by WinogradConvolution.  Each work-item computes
// one row (of 4) of one 4x4 output tile for one output-channel slice, adds
// bias (if non-NULL) and applies the optional activation.
// Work-item layout: gid0 = tile index, gid1 = output row in [0, 4),
// gid2 = output-channel slice.
// input  image layout (from the read below):  x = tile_xy, y = slice*36 + idx
// output image layout (from the write below): x = ow*CO_SLICES + slice, y = oh
__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
                              const int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
                              const int4 output_shape,  // N H W CO_SLICES
                              const int act_type) {
  int tile_xy = get_global_id(0);
  int row = get_global_id(1);
  int slice = get_global_id(2);
  int TILE_XY = input_shape.z;
  int SLICES = input_shape.w;
  int OH = output_shape.y;
  int OW = output_shape.z;
  // Guard: the launch grid may be rounded up past the real extents.
  if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) {
    return;
  }
  int TILE_X = UP_DIV(OW, 4);
  int tile_x = tile_xy % TILE_X;
  int tile_y = tile_xy / TILE_X;
  int oh = tile_y * 4 + row;
  int ow = tile_x * 4;
  // FIX: when OH is not a multiple of 4, rows of the bottom edge tiles map
  // past the output image; out-of-bounds image writes are undefined behavior
  // in OpenCL, so bail out before doing any work.
  if (oh >= OH) {
    return;
  }
  // First stage: AtM_row = (row `row` of A^T) * M, accumulated column by
  // column while streaming the 36 values of M for this slice.
  constant FLT *At_row = At + row * 6;
  FLT4 AtM_row[6] = {0};
  for (int y = 0, idx = slice * 36; y < 6; y++) {
    for (int x = 0; x < 6; x++, idx++) {
      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx));
    }
  }
  // Second stage: multiply by A.  Column x of A is row x of A^T, so element
  // (row, x) of Y is dot(AtM_row, At[x*6 .. x*6+5]); idx walks At linearly.
  int x_idx = ow * SLICES + slice;
  for (int x = 0, idx = 0; x < 4; x++) {
    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    for (int y = 0; y < 6; y++, idx++) {
      acc += AtM_row[y] * At[idx];
    }
    if (bias) {
      acc += bias[slice];
    }
    if (act_type == ActType_Relu) {
      acc = max(acc, (FLT4)(0.0f));
    } else if (act_type == ActType_Relu6) {
      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
    }
    // FIX: skip columns past the right edge when OW is not a multiple of 4
    // (same out-of-bounds image write issue as the oh check above).
    if (ow + x < OW) {
      WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
    }
    x_idx += SLICES;
  }
}

@ -31,7 +31,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
public:
ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
: OpenCLKernel(parameter, inputs, outputs) {}
: OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
~ConvolutionOpenCLKernel() override = default;
int Init() override;
@ -39,26 +39,32 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int InitBuffer() override;
private:
void SetBlockSize();
void SetGlobalLocal();
int InitWeight();
int InitBias();
int GenerateWinogradWeight();
int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
bool UseWinograd4x4To6x6() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
IW_ == OW_ && batch_size_ == 1;
const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 &&
param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 &&
param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1;
const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good;
}
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_;
std::vector<size_t> global_;
std::vector<size_t> local_;
bool use_fp16_{false};
size_t sizeof_FLT_{4};
ConvParameter *param_{nullptr};
int batch_size_{};
int CI_{};
int IH_{};
@ -81,9 +87,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
void *winograd_mem0_{nullptr};
void *winograd_mem1_{nullptr};
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_;
struct {
int H{1};
int W{1};
int C{1};
} block_size_;
};
} // namespace mindspore::kernel

@ -38,30 +38,29 @@ struct OpenCLToFormatParameter {
struct Image2DInfo {
explicit Image2DInfo(const lite::Tensor *tensor) {
if (tensor != nullptr) {
auto shape = tensor->shape();
if (shape.size() == 1) {
N = shape[0];
} else if (shape.size() == 2) {
N = shape[0];
C = shape[1];
} else if (shape.size() == 3) {
N = shape[0];
W = shape[1];
C = shape[2];
} else if (shape.size() == 4) {
N = shape[0];
H = shape[1];
W = shape[2];
C = shape[3];
} else if (shape.size() >= 5) {
MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
}
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
} else {
FLT_size = sizeof(cl_float);
if (tensor == nullptr) {
return;
}
auto shape = tensor->shape();
if (shape.size() == 1) {
N = shape[0];
} else if (shape.size() == 2) {
N = shape[0];
C = shape[1];
} else if (shape.size() == 3) {
N = shape[0];
W = shape[1];
C = shape[2];
} else if (shape.size() == 4) {
N = shape[0];
H = shape[1];
W = shape[2];
C = shape[3];
} else if (shape.size() >= 5) {
MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
}
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
FLT4_size = FLT_size * 4;
Slice = UP_DIV(C, C4NUM);
if (W * Slice <= MAX_IMAGE2D_SIZE) {
@ -72,16 +71,19 @@ struct Image2DInfo {
width = N * H * Slice;
}
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
ElementsNum = N * H * W * C;
ElementsC4Num = N * H * W * Slice * C4NUM;
OriginSize = ElementsNum * FLT_size;
Image2DSize = height * width * FLT4_size;
}
size_t RowPitch() const {
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
return row_pitch;
}
size_t N{1};
size_t H{1};
size_t W{1};
@ -89,9 +91,8 @@ struct Image2DInfo {
size_t Slice{};
size_t width{};
size_t height{};
size_t FLT_size{};
size_t FLT4_size{};
size_t row_pitch{};
size_t FLT_size{4};
size_t FLT4_size{16};
size_t ElementsNum{};
size_t ElementsC4Num{};
size_t OriginSize{};

@ -262,7 +262,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons
auto row_size = img_info.width * img_info.FLT4_size;
for (int i = 0; i < img_info.height; ++i) {
memcpy(reinterpret_cast<char *>(data.data()) + i * row_size,
static_cast<char *>(tensor->data_c()) + i * img_info.row_pitch, row_size);
static_cast<char *>(tensor->data_c()) + i * img_info.RowPitch(), row_size);
}
}
allocator->UnmapBuffer(tensor->data_c());

Loading…
Cancel
Save