From e4c34c0c9369f6666dc6df320198584524d52127 Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Mon, 26 Oct 2020 22:02:51 +0800 Subject: [PATCH] opencl_winograd_speedup --- .../runtime/kernel/opencl/cl/convolution.cl | 65 ++++++++----------- .../kernel/opencl/kernel/convolution.h | 2 +- .../runtime/kernel/opencl/kernel/to_format.cc | 6 +- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl index 47cf881505..f7025adb11 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl @@ -123,36 +123,27 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_ constant FLT *Bt_row = Bt + row * 6; FLT4 BtD_row[6] = {0}; - for (int y = 0; y < 6; y++) { - int ih = tile_y * 4 - PAD + y; - - // Format_NHWC4 - int y_idx = ih; - // Format_NC4HW4 - // if (ih < 0 || ih >= IH) { continue;} - // int y_idx = slice * IH + ih; + int ih = tile_y * 4 - PAD; + int iw = tile_x * 4 - PAD; + for (int y = 0; y < 6; y++) { + int x_idx = iw * SLICES + slice; for (int x = 0; x < 6; x++) { - int iw = tile_x * 4 - PAD + x; - - // Format_NHWC4 - if (iw < 0 || iw >= IW) { - continue; - } - int x_idx = iw * SLICES + slice; - // Format_NC4HW4 - // int x_idx = iw; - - BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, y_idx)); + // no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES + // if (iw < 0 || iw >= IW) { continue; } + BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); + x_idx += SLICES; } + ih++; } + int y_idx = slice * 36 + row * 6; for (int y = 0; y < 6; y++) { FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); for (int x = 0; x < 6; x++) { acc += BtD_row[x] * Bt[y * 6 + x]; } - WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36 + WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 } #undef PAD } @@ -247,36 +238,36 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_ constant FLT *At_row = At + row * 6; FLT4 AtM_row[6] = {0}; - for (int y = 0; y < 6; y++) { - for (int x = 0; x < 6; x++) { - AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, slice * 36 + y * 6 + x)); + for (int y = 0, idx = slice * 36; y < 6; y++) { + for (int x = 0; x < 6; x++, idx++) { + AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); } } int TILE_X = UP_DIV(OW, 4); - for (int x = 0; x < 4; x++) { + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + int oh = tile_y * 4 + row; + int ow = tile_x * 4; + int x_idx = ow * SLICES + slice; + + for (int x = 0, idx = 0; x < 4; x++) { FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int y = 0; y < 6; y++) { - acc += AtM_row[y] * At[x * 6 + y]; + for (int y = 0; y < 6; y++, idx++) { + acc += AtM_row[y] * At[idx]; } + if (bias) { acc += bias[slice]; } + if (act_type == ActType_Relu) { acc = max(acc, (FLT4)(0.0f)); } else if (act_type == ActType_Relu6) { acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); } - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - int ow = tile_x * 4 + x; - int oh = tile_y * 4 + row; - - // Format_NHWC4 - if (ow < OW) { - WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc); - } - // Format_NC4HW4 - // if (oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);} + + WRITE_IMAGE(output, (int2)(x_idx, oh), acc); + x_idx += SLICES; } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index 75c1094193..bd989f3cd2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -52,7 +52,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 && param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1; - const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12; + const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8; const bool hw_good = TILES_X_ * TILES_Y_ >= 16; return attr_valid && channel_good && hw_good; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 369ae8a967..83dae3327a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -87,7 +87,11 @@ int ToFormatOpenCLKernel::InitNHWC() { int ToFormatOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; std::vector global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; - std::vector local = {16, 8, 1}; + std::vector local = {8, 16, 3}; + size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); + if (max_work_group_size < 384) { + local[2] = 1; + } cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};