!7793 【MSLITE】speed up gpu winorgad kernel

Merge pull request !7793 from wangdongxu6/opencl_winograd_speedup
4 years ago · 051e21df68
parent 9588bd8bd4 e4c34c0c93
commit 051e21df68
3 changed files with 34 additions and 39 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
@ -123,36 +123,27 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_

  constant FLT *Bt_row = Bt + row * 6;
  FLT4 BtD_row[6] = {0};
-  for (int y = 0; y < 6; y++) {
-    int ih = tile_y * 4 - PAD + y;
-
-    // Format_NHWC4
-    int y_idx = ih;
-    // Format_NC4HW4
-    // if (ih < 0 || ih >= IH) { continue;}
-    // int y_idx = slice * IH + ih;

+  int ih = tile_y * 4 - PAD;
+  int iw = tile_x * 4 - PAD;
+  for (int y = 0; y < 6; y++) {
+    int x_idx = iw * SLICES + slice;
    for (int x = 0; x < 6; x++) {
-      int iw = tile_x * 4 - PAD + x;
-
-      // Format_NHWC4
-      if (iw < 0 || iw >= IW) {
-        continue;
-      }
-      int x_idx = iw * SLICES + slice;
-      // Format_NC4HW4
-      // int x_idx = iw;
-
-      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, y_idx));
+      // no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES
+      // if (iw < 0 || iw >= IW) { continue; }
+      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih));
+      x_idx += SLICES;
    }
+    ih++;
  }

+  int y_idx = slice * 36 + row * 6;
  for (int y = 0; y < 6; y++) {
    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    for (int x = 0; x < 6; x++) {
      acc += BtD_row[x] * Bt[y * 6 + x];
    }
-    WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc);  // CH W  H=36
+    WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc);  // CH W  H=36
  }
 #undef PAD
 }
@ -247,36 +238,36 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_

  constant FLT *At_row = At + row * 6;
  FLT4 AtM_row[6] = {0};
-  for (int y = 0; y < 6; y++) {
-    for (int x = 0; x < 6; x++) {
-      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, slice * 36 + y * 6 + x));
+  for (int y = 0, idx = slice * 36; y < 6; y++) {
+    for (int x = 0; x < 6; x++, idx++) {
+      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx));
    }
  }

  int TILE_X = UP_DIV(OW, 4);
-  for (int x = 0; x < 4; x++) {
+  int tile_x = tile_xy % TILE_X;
+  int tile_y = tile_xy / TILE_X;
+  int oh = tile_y * 4 + row;
+  int ow = tile_x * 4;
+  int x_idx = ow * SLICES + slice;
+
+  for (int x = 0, idx = 0; x < 4; x++) {
    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    for (int y = 0; y < 6; y++) {
-      acc += AtM_row[y] * At[x * 6 + y];
+    for (int y = 0; y < 6; y++, idx++) {
+      acc += AtM_row[y] * At[idx];
    }
+
    if (bias) {
      acc += bias[slice];
    }
+
    if (act_type == ActType_Relu) {
      acc = max(acc, (FLT4)(0.0f));
    } else if (act_type == ActType_Relu6) {
      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
    }
-    int tile_x = tile_xy % TILE_X;
-    int tile_y = tile_xy / TILE_X;
-    int ow = tile_x * 4 + x;
-    int oh = tile_y * 4 + row;
-
-    // Format_NHWC4
-    if (ow < OW) {
-      WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);
-    }
-    // Format_NC4HW4
-    //    if (oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}
+
+    WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
+    x_idx += SLICES;
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@ -52,7 +52,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
                            IW_ == OW_ && batch_size_ == 1;
-    const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
+    const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
    return attr_valid && channel_good && hw_good;
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@ -87,7 +87,11 @@ int ToFormatOpenCLKernel::InitNHWC() {
 int ToFormatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
-  std::vector<size_t> local = {16, 8, 1};
+  std::vector<size_t> local = {8, 16, 3};
+  size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
+  if (max_work_group_size < 384) {
+    local[2] = 1;
+  }
  cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
  cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};