From 351c21eaf2749b9f294abef362f210246a405d4f Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Tue, 3 Nov 2020 12:28:46 +0800 Subject: [PATCH] opencl_conv_optimize_performance --- .../runtime/kernel/opencl/cl/convolution.cl | 580 +++++++++++------- .../src/runtime/kernel/opencl/cl/winograd.cl | 187 ++++++ .../kernel/opencl/kernel/convolution.cc | 184 +++--- .../kernel/opencl/kernel/convolution.h | 32 +- .../src/runtime/kernel/opencl/opencl_kernel.h | 59 +- .../lite/src/runtime/kernel/opencl/utils.cc | 2 +- 6 files changed, 716 insertions(+), 328 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl index 7ea3edea22..cbed06393a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl @@ -3,268 +3,420 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #define CI_TILE 4 -#define CO_TILE 4 #define MAX_IMAGE2D_SIZE 65535 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_No 0 #define ActType_Relu 1 -#define ActType_Sigmod 2 #define ActType_Relu6 3 -__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int N = input_shape.x; - const int IH = input_shape.y; - const int IW = input_shape.z; - const int CI_SLICES = input_shape.w; - - const int OH = output_shape.y; - const int OW = output_shape.z; - const int CO_SLICES = output_shape.w; - - const int KH = kernel_stride.x; - const int KW = kernel_stride.y; - const int strideH = kernel_stride.z; - const int strideW = kernel_stride.w; - - const int padTop = pad.x; - const int padLeft = pad.z; - - const 
int dilationH = dilation.x; - const int dilationW = dilation.y; - - int n_oh = get_global_id(0); // [0, N*OH) - int ow = get_global_id(1); // [0, OW) - int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) - int n; - int oh; - if (N == 1) { - n = 0; - oh = n_oh; - } else { - n = n_oh / OH; - oh = n_oh % OH; - } - if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { - return; +#define DEFINE_ARGS \ + const int N = input_shape.x; \ + const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ + const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ + const int KH = kernel_stride.x, KW = kernel_stride.y; \ + const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ + const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ + const int dilationH = dilation.x, dilationW = dilation.y; \ + \ + const int n_oh = get_global_id(0); \ + const int ow = get_global_id(1) * BlockW; \ + const int co_slice = get_global_id(2) * BlockC; \ + const int OH_SLICES = UP_DIV(OH, BlockH); \ + const int n = n_oh / OH_SLICES; \ + const int oh = (n_oh % OH_SLICES) * BlockH; \ + if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ + return; \ } - FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE; +__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 1; + const int BlockW = 1; + const int BlockC = 1; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int n_oh0 = n * OH + oh0; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * 
BlockC * CI_TILE; + for (int kh = 0; kh < KH; ++kh) { - int ih = kh * dilationH + oh * strideH - padTop; + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + for (int kw = 0; kw < KW; ++kw) { - int iw = kw * dilationW + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); - out_c4 += w_ic1_oc4[0] * in_c4.x; - out_c4 += w_ic1_oc4[1] * in_c4.y; - out_c4 += w_ic1_oc4[2] * in_c4.z; - out_c4 += w_ic1_oc4[3] * in_c4.w; - w_ic1_oc4 += 4; - } - } else { - w_ic1_oc4 += 4 * CI_SLICES; + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + + weight_ptr += 4; } } } - if (bias != 0) { - out_c4 = out_c4 + bias[co_slice]; + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; } - // activation if (act_type == ActType_Relu) { - out_c4 = max(out_c4, (FLT4)(0.0f)); + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); } else if (act_type == ActType_Relu6) { - out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { - WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4); - + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); } else { - WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); } } -constant FLT Bt[36] = { - 1.0000000000f, 
0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, - 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, - 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, - 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, - 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, - 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, -}; - -__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, // N H W CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES -#define PAD 1 - int tile_xy = get_global_id(0); - int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = output_shape.z; - int SLICES = input_shape.w; - if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { - return; - } +__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 1; + const int BlockC = 1; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; - int IW = input_shape.z; - int TILE_X = UP_DIV(IW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - - constant FLT *Bt_row = Bt + row * 6; - FLT4 BtD_row[6] = {0}; - - int ih = tile_y * 4 - PAD; - int iw = tile_x * 4 - PAD; - for (int y = 0; y < 6; y++) { - int x_idx = iw * SLICES + slice; - for (int x = 0; x < 6; x++) { - // no need to check iw: because slice is in [0, SLICES). 
when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES - // if (iw < 0 || iw >= IW) { continue; } - BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); - x_idx += SLICES; + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; + + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + + weight_ptr += 4; + } } - ih++; } - int y_idx = slice * 36 + row * 6; - for (int y = 0; y < 6; y++) { - FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int x = 0; x < 6; x++) { - acc += BtD_row[x] * Bt[y * 6 + x]; - } - WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + } + + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, 
(FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } + + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + } // end if (oh1 < OH) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + } // end (oh1 < OH) } -#undef PAD } -__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - const int4 input_shape, // N 36 H/4*W/4 CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES -#define H 36 - int w = get_global_id(0) * 2; - int h = get_global_id(1); - int co_slice = get_global_id(2) * 2; +__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 1; + const int BlockC = 2; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; + const int co_slice1 = co_slice + 1; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; - int CI_SLICES = input_shape.w; - int W = input_shape.z; - int CO_SLICES = 
output_shape.w; + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; - if (h >= H || w >= W || co_slice >= CO_SLICES) { - return; + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + + out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; + out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; + out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; + out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; + out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; + out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; + out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; + out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; + + weight_ptr += 8; + } + } } - FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - - int y_idx = h; - __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); - FLT4 in1 = 
READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); - y_idx += 36; - - FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; - weight_ptr += 2; - - out00 += in0.x * weight0.s0123; - out00 += in0.y * weight0.s4567; - out00 += in0.z * weight0.s89ab; - out00 += in0.w * weight0.scdef; - - out01 += in1.x * weight0.s0123; - out01 += in1.y * weight0.s4567; - out01 += in1.z * weight0.s89ab; - out01 += in1.w * weight0.scdef; - - out10 += in0.x * weight1.s0123; - out10 += in0.y * weight1.s4567; - out10 += in0.z * weight1.s89ab; - out10 += in0.w * weight1.scdef; - - out11 += in1.x * weight1.s0123; - out11 += in1.y * weight1.s4567; - out11 += in1.z * weight1.s89ab; - out11 += in1.w * weight1.scdef; + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + out_h0_w0_c1 += bias[co_slice1]; + out_h1_w0_c1 += bias[co_slice1]; } - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); + out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); + out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); } - if (co_slice + 1 < CO_SLICES) { - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); - } + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + } // end if (oh1 < OH) + 
if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + } // end (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) } -#undef H } -constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, - 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, - 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, - 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; - -__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, - const int4 input_shape, // N 36 H/4*W/4 CO_SLICES - const int4 output_shape, // N H W CO_SLICES - const int act_type) { - int tile_xy = get_global_id(0); - int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = input_shape.z; - int SLICES = input_shape.w; - int OW = output_shape.z; - - if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { - return; - } +__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 2; + const 
int BlockC = 2; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int ow1 = ow + 1; + const int co_slice0 = co_slice + 0; + const int co_slice1 = co_slice + 1; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; - constant FLT *At_row = At + row * 6; - FLT4 AtM_row[6] = {0}; - for (int y = 0, idx = slice * 36; y < 6; y++) { - for (int x = 0; x < 6; x++, idx++) { - AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); - } - } + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; - int TILE_X = UP_DIV(OW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - int oh = tile_y * 4 + row; - int ow = tile_x * 4; - int x_idx = ow * SLICES + slice; - - for (int x = 0, idx = 0; x < 4; x++) { - FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int y = 0; y < 6; y++, idx++) { - acc += AtM_row[y] * At[idx]; + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw1 = (ow1 < OW) ? 
kw * dilationW + ow1 * strideW - padLeft : -2; + int x_idx0 = iw0 * CI_SLICES; + int x_idx1 = iw1 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1)); + x_idx0++; + x_idx1++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w; + + out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; + out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x; + out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; + out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x; + out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; + out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y; + out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; + out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y; + out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; + out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z; + out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; + out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z; + out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; + out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w; + out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; + out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w; + + weight_ptr += 8; + } } + } - if (bias 
!= 0) { - acc += bias[slice]; - } + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h0_w1_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + out_h1_w1_c0 += bias[co_slice0]; + out_h0_w0_c1 += bias[co_slice1]; + out_h0_w1_c1 += bias[co_slice1]; + out_h1_w0_c1 += bias[co_slice1]; + out_h1_w1_c1 += bias[co_slice1]; + } - if (act_type == ActType_Relu) { - acc = max(acc, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { - acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); - } + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); + out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f)); + out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); + out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); + out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); + out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } - WRITE_IMAGE(output, (int2)(x_idx, oh), acc); - x_idx += SLICES; + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); + } // end if (oh1 < OH) + if (co_slice1 < 
CO_SLICES) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0); + } // end (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl new file mode 100644 index 0000000000..8e2029532a --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl @@ -0,0 +1,187 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) + +#define ActType_Relu 1 +#define ActType_Relu6 3 + +constant FLT Bt[36] = { + 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, + 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, 
-0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, +}; + +__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, + const int4 input_shape, // N H W CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES +#define PAD 1 + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = output_shape.z; + int SLICES = input_shape.w; + if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { + return; + } + + int IH = input_shape.y, IW = input_shape.z; + int TILE_X = UP_DIV(IW, 4); + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + + constant FLT *Bt_row = Bt + row * 6; + FLT4 BtD_row[6] = {0}; + + int ih = tile_y * 4 - PAD; + int iw = tile_x * 4 - PAD; + for (int y = 0; y < 6; y++) { + int x_idx = iw * SLICES + slice; + for (int x = 0; x < 6; x++) { + // no need to check iw: because slice is in [0, SLICES). 
when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES + // if (iw < 0 || iw >= IW) { continue; } + BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); + x_idx += SLICES; + } + ih++; + } + + int y_idx = slice * 36 + row * 6; + for (int y = 0; y < 6; y++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int x = 0; x < 6; x++) { + acc += BtD_row[x] * Bt[y * 6 + x]; + } + WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 + } +#undef PAD +} + +__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, + const int4 input_shape, // N 36 H/4*W/4 CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES +#define H 36 + int w = get_global_id(0) * 2; + int h = get_global_id(1); + int co_slice = get_global_id(2) * 2; + + int CI_SLICES = input_shape.w; + int W = input_shape.z; + int CO_SLICES = output_shape.w; + + if (h >= H || w >= W || co_slice >= CO_SLICES) { + return; + } + + FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + int y_idx = h; + __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); + FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); + y_idx += 36; + + FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; + weight_ptr += 2; + + out00 += in0.x * weight0.s0123; + out00 += in0.y * weight0.s4567; + out00 += in0.z * weight0.s89ab; + out00 += in0.w * weight0.scdef; + + out01 += in1.x * weight0.s0123; + out01 += in1.y * weight0.s4567; + out01 += in1.z * weight0.s89ab; + out01 += in1.w * weight0.scdef; + + out10 += in0.x * weight1.s0123; + out10 += in0.y * weight1.s4567; + out10 += in0.z * weight1.s89ab; + out10 += in0.w * weight1.scdef; + + out11 += in1.x * weight1.s0123; 
+ out11 += in1.y * weight1.s4567; + out11 += in1.z * weight1.s89ab; + out11 += in1.w * weight1.scdef; + } + + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + } + + if (co_slice + 1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); + } + } +#undef H +} + +constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, + 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, + 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, + 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; + +__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, + const int4 input_shape, // N 36 H/4*W/4 CO_SLICES + const int4 output_shape, // N H W CO_SLICES + const int act_type) { + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = input_shape.z; + int SLICES = input_shape.w; + int OH = output_shape.y; + int OW = output_shape.z; + + if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { + return; + } + + constant FLT *At_row = At + row * 6; + FLT4 AtM_row[6] = {0}; + for (int y = 0, idx = slice * 36; y < 6; y++) { + for (int x = 0; x < 6; x++, idx++) { + AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); + } + } + + int TILE_X = UP_DIV(OW, 4); + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + int oh = tile_y * 4 + row; + int ow = tile_x * 4; + int x_idx = ow * SLICES + slice; + + for (int x = 0, idx = 0; x < 4; x++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int y = 0; y < 6; y++, idx++) { + acc += AtM_row[y] * At[idx]; + } + + if 
(bias) { + acc += bias[slice]; + } + + if (act_type == ActType_Relu) { + acc = max(acc, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); + } + + WRITE_IMAGE(output, (int2)(x_idx, oh), acc); + x_idx += SLICES; + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index bbacb71bfc..8ce376a204 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -24,6 +24,7 @@ #include "src/kernel_registry.h" #include "include/errorcode.h" #include "src/runtime/kernel/opencl/cl/convolution.cl.inc" +#include "src/runtime/kernel/opencl/cl/winograd.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -38,14 +39,11 @@ constexpr size_t CI_TILE = C4NUM; constexpr size_t CO_TILE = C4NUM; int ConvolutionOpenCLKernel::Init() { - auto allocator = ocl_runtime_->GetAllocator(); - auto param = reinterpret_cast(op_parameter_); - std::set build_options; use_fp16_ = ocl_runtime_->GetFp16Enable(); + sizeof_FLT_ = use_fp16_ ? 
sizeof(float16_t) : sizeof(float); auto input_tensor = in_tensors_[0]; auto output_tensor = out_tensors_[0]; - batch_size_ = input_tensor->Batch(); CI_ = input_tensor->Channel(); IH_ = input_tensor->Height(); @@ -55,8 +53,8 @@ int ConvolutionOpenCLKernel::Init() { OW_ = output_tensor->Width(); CI_SLICES_ = UP_DIV(CI_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM); - KH_ = param->kernel_h_; - KW_ = param->kernel_w_; + KH_ = param_->kernel_h_; + KW_ = param_->kernel_w_; has_bias_ = in_tensors_.size() == 3; // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true @@ -65,28 +63,39 @@ int ConvolutionOpenCLKernel::Init() { TILES_XY_ = TILES_X_ * TILES_Y_; use_winograd_ = UseWinograd4x4To6x6(); + if (!use_winograd_) { + SetBlockSize(); + SetGlobalLocal(); + } + // build kernel - std::string program_name = "Convolution"; - ocl_runtime_->LoadSource(program_name, convolution_source); + std::set build_options; if (use_winograd_) { MS_LOG(DEBUG) << "use winograd"; + std::string program_name = "Winograd"; + ocl_runtime_->LoadSource(program_name, winograd_source); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options); ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options); } else { - ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options); + std::string program_name = "Convolution"; + std::string kernel_name = "Convolution_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + + "C" + std::to_string(block_size_.C); + ocl_runtime_->LoadSource("Convolution", convolution_source); + ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name, build_options); } // allocate winograd memory if (use_winograd_) { + auto allocator = ocl_runtime_->GetAllocator(); size_t img_dtype = use_fp16_ ? 
CL_HALF_FLOAT : CL_FLOAT; - size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT(); + size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT_; size_t width = TILES_XY_; size_t height = CI_SLICES_ * 36; winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); - size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT(); + size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT_; width = TILES_XY_; height = CO_SLICES_ * 36; winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); @@ -156,9 +165,9 @@ int ConvolutionOpenCLKernel::InitWeight() { // allocate memory size_t packed_weight_size; if (use_winograd_) { - packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT(); + packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT_; } else { - packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT(); + packed_weight_size = UP_ROUND(CO_SLICES_, block_size_.C) * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT_; } packed_weight_ = allocator->Malloc(packed_weight_size); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); @@ -171,15 +180,19 @@ int ConvolutionOpenCLKernel::InitWeight() { auto weight_tensor = in_tensors_[1]; if (weight_tensor->data_type() == kNumberTypeFloat16) { if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } else { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } } else { if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } else { - 
ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } } } @@ -193,14 +206,14 @@ int ConvolutionOpenCLKernel::InitBias() { // align bias from C to C4 auto bias_tensor = in_tensors_[2]; - size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT(); + size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_; packed_bias_ = allocator->Malloc(packed_bias_size); allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true); memset(packed_bias_, 0x00, packed_bias_size); if (bias_tensor->data_type() == kNumberTypeFloat16) { if (use_fp16_) { - memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT()); + memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_); } else { auto packed_bias_fp32 = reinterpret_cast(packed_bias_); auto origin_bias_fp16 = reinterpret_cast(bias_tensor->data_c()); @@ -216,7 +229,7 @@ int ConvolutionOpenCLKernel::InitBias() { packed_bias_fp16[i] = static_cast(origin_bias_fp32[i]); } } else { - memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT()); + memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_); } } allocator->UnmapBuffer(packed_bias_); @@ -231,6 +244,66 @@ int ConvolutionOpenCLKernel::InitBuffer() { return RET_OK; } +void ConvolutionOpenCLKernel::SetBlockSize() { + auto task_size = static_cast(batch_size_ * OH_ * OW_ * CO_SLICES_); + auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits(); + int block_size; + if (task_size_per_cu <= 256) { + block_size = 1; + } else if (task_size_per_cu <= 256 * 4) { + block_size = 2; + } else if (task_size_per_cu <= (use_fp16_ ? 
256 * 8 : FLT_MAX)) { + block_size = 4; + } else { + block_size = 8; + } + + bool w_kernel_is_1 = + KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0; + bool h_kernel_is_1 = + KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0; + if (!w_kernel_is_1 || !h_kernel_is_1) { + block_size = std::min(block_size, 4); + } + + if (block_size == 8) { + block_size_ = {2, 2, 2}; + } else if (block_size == 4) { + block_size_ = {2, 1, 2}; + } else if (block_size == 2) { + block_size_ = {2, 1, 1}; + } else { + block_size_ = {1, 1, 1}; + } +} + +void ConvolutionOpenCLKernel::SetGlobalLocal() { + size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H); + size_t global_w = UP_DIV(OW_, block_size_.W); + size_t global_c = UP_DIV(CO_SLICES_, block_size_.C); + + constexpr int local_c_max = 16; + constexpr int local_hw_max = 256; + constexpr int OH_threshold = 100; + constexpr int OW_threshold = 100; + constexpr int OC_threshold = 64; + size_t local_c = GetMaxDivisor(global_c, local_c_max); + local_c = std::max(local_c, 1); + size_t local_hw = local_hw_max / local_c; + size_t local_h; + size_t local_w; + if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h + local_w = std::min(global_w, local_hw); + local_h = std::min(local_hw / local_w, global_h); + } else { // c -> h -> w + local_h = std::min(global_h, local_hw); + local_w = std::min(local_hw / local_h, global_w); + } + + global_ = {global_h, global_w, global_c}; + local_ = {local_h, local_w, local_c}; +} + int ConvolutionOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto param = reinterpret_cast(op_parameter_); @@ -292,48 +365,9 @@ int ConvolutionOpenCLKernel::Run() { nullptr); ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr); } else { - std::vector global, local; - SetGlobalLocalConv(&global, &local); - 
ocl_runtime_->RunKernel(kernel_conv_, global, local, nullptr); - } - - return RET_OK; -} - -int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std::vector *local) { - constexpr size_t work_group_size[] = {4, 4, 1}; - auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize(); - size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())()); - const size_t max_z_size = std::min(16, max_work_item_sizes[2]); - - size_t global_nh = UP_DIV(batch_size_ * OH_, work_group_size[0]) * work_group_size[0]; - size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; - size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; - - size_t local_c = GetMaxDivisor(global_c, max_z_size); - if (local_c == 0) { - MS_LOG(ERROR) << "Divide by zero"; - return mindspore::lite::RET_ERROR; - } - size_t local_hw_size = std::min(256, max_work_group_size) / local_c; - size_t local_w = std::min(global_w, local_hw_size); - size_t local_nh = std::min(local_hw_size / local_w, global_nh); - if (local_nh == global_nh && global_nh % 2 == 0) { - local_nh = global_nh / 2; + ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr); } - if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) { - local_w = 4; - } - - global->clear(); - global->push_back(UP_DIV(batch_size_ * OH_, local_nh) * local_nh); - global->push_back(UP_DIV(OW_, local_w) * local_w); - global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c); - local->clear(); - local->push_back(local_nh); - local->push_back(local_w); - local->push_back(local_c); return RET_OK; } @@ -349,37 +383,43 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vectorstride_w_ == 1 && conv_param->pad_u_ == 0 && conv_param->pad_d_ == 0 && conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1; + + OpParameter *real_param; if (is_hw1 && is_pad_stride_ok) { - auto param = 
static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-    if (param == nullptr) {
+    auto fc_param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
+    if (fc_param == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel param failed!";
       return nullptr;
     }
-    param->op_parameter_.type_ = PrimitiveType_FullConnection;
-    param->a_transpose_ = false;
-    param->b_transpose_ = true;
-    param->act_type_ = conv_param->act_type_;
-    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+    fc_param->op_parameter_.type_ = PrimitiveType_FullConnection;
+    fc_param->a_transpose_ = false;
+    fc_param->b_transpose_ = true;
+    fc_param->act_type_ = conv_param->act_type_;
+    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(fc_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(fc_param);
     if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel failed!";
-      free(param);
-      free(opParameter);
+      free(fc_param);
+      free(conv_param);
       return nullptr;
     } else {
-      free(opParameter);
+      free(conv_param);
     }
   } else {
-    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(conv_param);
     if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!";
-      free(opParameter);
+      free(conv_param);
       return nullptr;
     }
   }
+
   auto ret = kernel->Init();
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
     delete kernel;
+    free(real_param);
     return nullptr;
   }
   return kernel;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index bd989f3cd2..d273e5ec2b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -31,7 +31,7 
@@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  public:
   ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                           const std::vector<lite::Tensor *> &outputs)
-      : OpenCLKernel(parameter, inputs, outputs) {}
+      : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
   ~ConvolutionOpenCLKernel() override = default;
 
   int Init() override;
@@ -39,26 +39,32 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int InitBuffer() override;
 
  private:
+  void SetBlockSize();
+  void SetGlobalLocal();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
-  int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
-
-  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
 
   bool UseWinograd4x4To6x6() {
-    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
-                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
-                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
-                            IW_ == OW_ && batch_size_ == 1;
+    const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
+                            param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 &&
+                            param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 &&
+                            param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1;
     const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
 
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+  std::vector<size_t> global_;
+  std::vector<size_t> local_;
+  bool use_fp16_{false};
+  size_t sizeof_FLT_{4};
+  ConvParameter *param_{nullptr};
   int batch_size_{};
   int CI_{};
   int IH_{};
@@ -81,9 +87,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   void *winograd_mem0_{nullptr};
   void 
*winograd_mem1_{nullptr}; - cl::Kernel kernel_4x4to36_; - cl::Kernel kernel_conv_; - cl::Kernel kernel_36to4x4_; + struct { + int H{1}; + int W{1}; + int C{1}; + } block_size_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index abe32e558d..f29e2965a1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -38,30 +38,29 @@ struct OpenCLToFormatParameter { struct Image2DInfo { explicit Image2DInfo(const lite::Tensor *tensor) { - if (tensor != nullptr) { - auto shape = tensor->shape(); - if (shape.size() == 1) { - N = shape[0]; - } else if (shape.size() == 2) { - N = shape[0]; - C = shape[1]; - } else if (shape.size() == 3) { - N = shape[0]; - W = shape[1]; - C = shape[2]; - } else if (shape.size() == 4) { - N = shape[0]; - H = shape[1]; - W = shape[2]; - C = shape[3]; - } else if (shape.size() >= 5) { - MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size(); - } - FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float); - } else { - FLT_size = sizeof(cl_float); + if (tensor == nullptr) { + return; } + auto shape = tensor->shape(); + if (shape.size() == 1) { + N = shape[0]; + } else if (shape.size() == 2) { + N = shape[0]; + C = shape[1]; + } else if (shape.size() == 3) { + N = shape[0]; + W = shape[1]; + C = shape[2]; + } else if (shape.size() == 4) { + N = shape[0]; + H = shape[1]; + W = shape[2]; + C = shape[3]; + } else if (shape.size() >= 5) { + MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size(); + } + FLT_size = tensor->data_type() == kNumberTypeFloat16 ? 
sizeof(cl_half) : sizeof(cl_float); FLT4_size = FLT_size * 4; Slice = UP_DIV(C, C4NUM); if (W * Slice <= MAX_IMAGE2D_SIZE) { @@ -72,16 +71,19 @@ struct Image2DInfo { width = N * H * Slice; } - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); - row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size; - ElementsNum = N * H * W * C; ElementsC4Num = N * H * W * Slice * C4NUM; OriginSize = ElementsNum * FLT_size; Image2DSize = height * width * FLT4_size; } + size_t RowPitch() const { + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); + size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size; + return row_pitch; + } + size_t N{1}; size_t H{1}; size_t W{1}; @@ -89,9 +91,8 @@ struct Image2DInfo { size_t Slice{}; size_t width{}; size_t height{}; - size_t FLT_size{}; - size_t FLT4_size{}; - size_t row_pitch{}; + size_t FLT_size{4}; + size_t FLT4_size{16}; size_t ElementsNum{}; size_t ElementsC4Num{}; size_t OriginSize{}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index 641d6b7031..8c3957307b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -262,7 +262,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons auto row_size = img_info.width * img_info.FLT4_size; for (int i = 0; i < img_info.height; ++i) { memcpy(reinterpret_cast(data.data()) + i * row_size, - static_cast(tensor->data_c()) + i * img_info.row_pitch, row_size); + static_cast(tensor->data_c()) + i * img_info.RowPitch(), row_size); } } allocator->UnmapBuffer(tensor->data_c());