diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl index 43a5a0306b..47cf881505 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl @@ -1,150 +1,282 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + #define CI_TILE 4 #define CO_TILE 4 +#define MAX_IMAGE2D_SIZE 65535 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -// #define __global -// #pragma OPENCL EXTENSION cl_arm_printf : enable -__kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias, - __global float *output, - const int4 input_shape, // NHWC - const int4 output_shape, // NHWC - const int4 kernel_stride, // kernelHW_strideHW - const int4 pad) { - int ow = get_global_id(0); - int oh = get_global_id(1); - int co_slice = get_global_id(2); - - int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; - int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; - int KH = kernel_stride.x, KW = kernel_stride.y; - int strideH = kernel_stride.z, strideW = kernel_stride.w; - int padTop = pad.x, padLeft = pad.z; - int CI_SLICES = UP_DIV(CI, CI_TILE); - int CO_SLICES = UP_DIV(CO, CO_TILE); - - if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) return; - - float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + +#define ActType_No 0 +#define ActType_Relu 1 +#define ActType_Sigmod 2 +#define ActType_Relu6 3 + +__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int N = input_shape.x; + const int IH = input_shape.y; + const int IW = input_shape.z; + const int CI_SLICES = input_shape.w; + + const int OH = output_shape.y; + const int OW = output_shape.z; + const int CO_SLICES = output_shape.w; + + const int KH = kernel_stride.x; + const int KW = kernel_stride.y; + const int strideH = kernel_stride.z; + const int strideW = kernel_stride.w; + + const int padTop = pad.x; + const int padBottom = pad.y; + const int padLeft = pad.z; + const int padRight = pad.w; + + const int dilationH = dilation.x; + const int dilationW = dilation.y; + + int n_oh = get_global_id(0); // [0, N*OH) + int ow = get_global_id(1); // [0, OW) + int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) + int n; + int oh; + if (N == 1) { + n = 0; + oh = n_oh; + } else { + n = n_oh / OH; + oh = n_oh % OH; + } + if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { + return; + } + + FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + __global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - int ih = kh + oh * strideH - padTop; + int ih = kh * dilationH + oh * strideH - padTop; for (int kw = 0; kw < KW; ++kw) { - int iw = kw + ow * strideW - padLeft; - for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) { - for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) { - int ci = ci_slice * CI_TILE + ci_inner; - if (ci >= CI) break; - - int input_idx = ih * IW * CI + iw * CI + ci; - float value = 0; - if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) - value = 0; - else - value = input[input_idx]; - - int CO_OFFSET = KH * KW * CI; - int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci; - acc.x += weight[weight_idx + 0 * CO_OFFSET] * value; - acc.y += weight[weight_idx + 1 * CO_OFFSET] * value; - acc.z += weight[weight_idx + 2 * CO_OFFSET] * value; - acc.w += weight[weight_idx + 3 * CO_OFFSET] * value; + int iw = kw * dilationW + ow * strideW - padLeft; + if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); + out_c4 += w_ic1_oc4[0] * in_c4.x; + out_c4 += w_ic1_oc4[1] * in_c4.y; + out_c4 += w_ic1_oc4[2] * in_c4.z; + out_c4 += w_ic1_oc4[3] * in_c4.w; + w_ic1_oc4 += 4; } + } else { + w_ic1_oc4 += 4 * CI_SLICES; } } } - int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE); - if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; - output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3]; - } else if (CO % CO_TILE == 1) { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - } else if (CO % CO_TILE == 2) { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - } else if (CO % CO_TILE == 3) { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; + + if (bias) { + out_c4 = out_c4 + bias[co_slice]; + } + + // activation + if (act_type == ActType_Relu) { + out_c4 = max(out_c4, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f)); + } + + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4); + + } else { + WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4); } } -// #pragma OPENCL EXTENSION cl_khr_fp16 : enable -// #define FLT4 half4 -#define FLT4 float4 -__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias, - __global FLT4 *output, - const int4 input_shape, // NHWC - const int4 output_shape, // NHWC - const int4 kernel_stride, // kernelHW_strideHW - const int4 pad) { - int oh = get_global_id(0); // [0, OH) - int ow = get_global_id(1); // [0, OW) - int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) +constant FLT Bt[36] = { + 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, + 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, +}; - int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; - int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; - int CI_SLICES = UP_DIV(CI, CI_TILE); - int CO_SLICES = UP_DIV(CO, CO_TILE); - int KH = kernel_stride.x, KW = kernel_stride.y; - int strideH = kernel_stride.z, strideW = kernel_stride.w; - int padTop = pad.x, padLeft = pad.z; - - if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) return; - if (2 * co_slice + 1 >= CO_SLICES) { - FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; - for (int kh = 0; kh < KH; ++kh) { - int ih = kh + oh * strideH - padTop; - for (int kw = 0; kw < KW; ++kw) { - int iw = kw + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice]; - out0_c4 += w0_ic1_oc4[0] * in_c4.x; - out0_c4 += w0_ic1_oc4[1] * in_c4.y; - out0_c4 += w0_ic1_oc4[2] * in_c4.z; - out0_c4 += w0_ic1_oc4[3] * in_c4.w; - w0_ic1_oc4 += 4; - } - } else { - w0_ic1_oc4 += 4 * CI_SLICES; - } +__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, + const int4 input_shape, // N H W CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES +#define PAD 1 + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = output_shape.z; + int SLICES = input_shape.w; + if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { + return; + } + + int IH = input_shape.y, IW = input_shape.z; + int TILE_X = UP_DIV(IW, 4); + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + + constant FLT *Bt_row = Bt + row * 6; + FLT4 BtD_row[6] = {0}; + for (int y = 0; y < 6; y++) { + int ih = tile_y * 4 - PAD + y; + + // Format_NHWC4 + int y_idx = ih; + // Format_NC4HW4 + // if (ih < 0 || ih >= IH) { continue;} + // int y_idx = slice * IH + ih; + + for (int x = 0; x < 6; x++) { + int iw = tile_x * 4 - PAD + x; + + // Format_NHWC4 + if (iw < 0 || iw >= IW) { + continue; } + int x_idx = iw * SLICES + slice; + // Format_NC4HW4 + // int x_idx = iw; + + BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, y_idx)); } - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; - } else { - FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; - __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE; - for (int kh = 0; kh < KH; ++kh) { - int ih = kh + oh * strideH - padTop; - for (int kw = 0; kw < KW; ++kw) { - int iw = kw + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { - int idx = ih * IW * CI_SLICES + iw * CI_SLICES; - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in_c4 = input[idx + ci_slice]; - - out0_c4 += w0_ic1_oc4[0] * in_c4.x; - out0_c4 += w0_ic1_oc4[1] * in_c4.y; - out0_c4 += w0_ic1_oc4[2] * in_c4.z; - out0_c4 += w0_ic1_oc4[3] * in_c4.w; - w0_ic1_oc4 += 4; - - out1_c4 += w1_ic1_oc4[0] * in_c4.x; - out1_c4 += w1_ic1_oc4[1] * in_c4.y; - out1_c4 += w1_ic1_oc4[2] * in_c4.z; - out1_c4 += w1_ic1_oc4[3] * in_c4.w; - w1_ic1_oc4 += 4; - } - } else { - w0_ic1_oc4 += 4 * CI_SLICES; - w1_ic1_oc4 += 4 * CI_SLICES; - } - } + } + + for (int y = 0; y < 6; y++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int x = 0; x < 6; x++) { + acc += BtD_row[x] * Bt[y * 6 + x]; + } + WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36 + } +#undef PAD +} + +__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, + const int4 input_shape, // N 36 H/4*W/4 CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES +#define H 36 + int w = get_global_id(0) * 2; + int h = get_global_id(1); + int co_slice = get_global_id(2) * 2; + + int CI_SLICES = input_shape.w; + int W = input_shape.z; + int CO_SLICES = output_shape.w; + + if (h >= H || w >= W || co_slice >= CO_SLICES) { + return; + } + + FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + int y_idx = h; + __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); + FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); + y_idx += 36; + + FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; + weight_ptr += 2; + + out00 += in0.x * weight0.s0123; + out00 += in0.y * weight0.s4567; + out00 += in0.z * weight0.s89ab; + out00 += in0.w * weight0.scdef; + + out01 += in1.x * weight0.s0123; + out01 += in1.y * weight0.s4567; + out01 += in1.z * weight0.s89ab; + out01 += in1.w * weight0.scdef; + + out10 += in0.x * weight1.s0123; + out10 += in0.y * weight1.s4567; + out10 += in0.z * weight1.s89ab; + out10 += in0.w * weight1.scdef; + + out11 += in1.x * weight1.s0123; + out11 += in1.y * weight1.s4567; + out11 += in1.z * weight1.s89ab; + out11 += in1.w * weight1.scdef; + } + + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + } + + if (co_slice + 1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); + } + } +#undef H +} + +constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, + 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, + 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, + 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; + +__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, + const int4 input_shape, // N 36 H/4*W/4 CO_SLICES + const int4 output_shape, // N H W CO_SLICES + const int act_type) { + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = input_shape.z; + int SLICES = input_shape.w; + int OH = output_shape.y; + int OW = output_shape.z; + + if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { + return; + } + + constant FLT *At_row = At + row * 6; + FLT4 AtM_row[6] = {0}; + for (int y = 0; y < 6; y++) { + for (int x = 0; x < 6; x++) { + AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, slice * 36 + y * 6 + x)); + } + } + + int TILE_X = UP_DIV(OW, 4); + for (int x = 0; x < 4; x++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int y = 0; y < 6; y++) { + acc += AtM_row[y] * At[x * 6 + y]; + } + if (bias) { + acc += bias[slice]; + } + if (act_type == ActType_Relu) { + acc = max(acc, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); + } + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + int ow = tile_x * 4 + x; + int oh = tile_y * 4 + row; + + // Format_NHWC4 + if (ow < OW) { + WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc); } - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1]; + // Format_NC4HW4 + // if (oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);} } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 6a8c552333..bfe799f970 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -22,14 +22,13 @@ #include "src/runtime/kernel/opencl/utils.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "src/runtime/kernel/opencl/cl/convolution.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Conv2D; -using mindspore::schema::Format::Format_NC4HW4; -using mindspore::schema::Format::Format_NHWC4; namespace mindspore::kernel { @@ -65,25 +64,14 @@ int ConvolutionOpenCLKernel::Init() { use_winograd_ = UseWinograd4x4To6x6(); // build kernel - auto code_id = get_code_id(); - std::string program_name; + std::string program_name = "Convolution"; + ocl_runtime_->LoadSource(program_name, convolution_source); if (use_winograd_) { MS_LOG(DEBUG) << "use winograd"; - program_name = "Winograd4x4To36" + code_id; - ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36()); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options); - - program_name = "WinogradConvolution" + code_id; - ocl_runtime_->LoadSource(program_name, CodeGenWinogradConvolution()); ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options); - - program_name = "Winograd36To4x4" + code_id; - ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4()); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options); } else { - program_name = "Convolution" + code_id; - std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4(); - ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options); } @@ -243,15 +231,23 @@ int ConvolutionOpenCLKernel::InitBuffer() { int ConvolutionOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; + auto param = reinterpret_cast(op_parameter_); + cl_int act_type = 0; + if (param->act_type_ == ActType_Relu) { + act_type = 1; + } else if (param->act_type_ == ActType_Relu6) { + act_type = 3; + } + cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; + cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; - int arg_cn = 0; + int arg_cn; if (use_winograd_) { arg_cn = 0; - cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_}; cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_}; ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG); ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG); - ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape); + ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape); ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape); arg_cn = 0; @@ -265,28 +261,27 @@ int ConvolutionOpenCLKernel::Run() { arg_cn = 0; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; - cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_}; ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG); - if (has_bias_) { - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); - } + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, act_type); } else { arg_cn = 0; + cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; + cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; + cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG); ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG); ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); - if (has_bias_) { - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); - } - if (op_format_ == Format_NC4HW4) { - cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_}; - cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_}; - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape); - } + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation); + ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, act_type); } if (use_winograd_) { @@ -303,549 +298,6 @@ int ConvolutionOpenCLKernel::Run() { return RET_OK; } -std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { - auto param = reinterpret_cast(op_parameter_); - const size_t CI_ALIGN = CI_SLICES_ * C4NUM; - const size_t CO_ALIGN = CO_SLICES_ * C4NUM; - const size_t strideH = param->stride_h_; - const size_t strideW = param->stride_w_; - const size_t padTop = param->pad_u_; - const size_t padBottom = param->pad_d_; - const size_t padLeft = param->pad_l_; - const size_t padRight = param->pad_r_; - - std::string code; - code += "#define CI_TILE 4\n"; - code += "#define CO_TILE 4\n\n"; - code += "#define N " + std::to_string(batch_size_) + "\n"; - code += "#define N_OH " + std::to_string(batch_size_ * OH_) + "\n"; - code += "#define CI " + std::to_string(CI_ALIGN) + "\n"; - code += "#define IH " + std::to_string(IH_) + "\n"; - code += "#define IW " + std::to_string(IW_) + "\n"; - code += "#define CO " + std::to_string(CO_ALIGN) + "\n"; - code += "#define OH " + std::to_string(OH_) + "\n"; - code += "#define OW " + std::to_string(OW_) + "\n"; - code += "#define KH " + std::to_string(KH_) + "\n"; - code += "#define KW " + std::to_string(KW_) + "\n"; - code += "#define strideH " + std::to_string(strideH) + "\n"; - code += "#define strideW " + std::to_string(strideW) + "\n"; - code += "#define padTop " + std::to_string(padTop) + "\n"; - code += "#define padBottom " + std::to_string(padBottom) + "\n"; - code += "#define padLeft " + std::to_string(padLeft) + "\n"; - code += "#define padRight " + std::to_string(padRight) + "\n"; - code += "#define dilationH " + std::to_string(param->dilation_h_) + "\n"; - code += "#define dilationW " + std::to_string(param->dilation_w_) + "\n"; - code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n"; - code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n"; - - if (use_fp16_) { - code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } - - code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n"; - - code += - "__kernel void Convolution(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n"; - if (has_bias_) { - code += - " __global FLT4 *weight,\n" - " __global FLT4 *bias) {\n"; - } else { - code += " __global FLT4 *weight) {\n"; - } - - code += " int n_oh = get_global_id(0); // [0, N*OH)\n"; - if (batch_size_ == 1) { - code += " #define n 0\n"; - code += " int oh = n_oh;\n"; - } else { - code += " int n = n_oh / " + std::to_string(OH_) + ";\n"; - code += " int oh = n_oh % " + std::to_string(OH_) + ";\n"; - } - - code += - " int ow = get_global_id(1); // [0, OW)\n" - " int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n" - "\n" - " if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n" - " return;\n" - " }\n" - "\n" - " FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n"; - - code += - " for (int kh = 0; kh < KH; ++kh)\n" - " {\n" - " int ih = kh * dilationH + oh * strideH - padTop;\n" - " for (int kw = 0; kw < KW; ++kw)\n" - " {\n" - " int iw = kw * dilationW + ow * strideW - padLeft;\n" - " if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)\n" - " {\n" - " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" - " {\n"; - - code += - "FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); // NHWC4: NH WC\n\n"; - - code += - " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" - " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" - " out0_c4 += w0_ic1_oc4[2] * in_c4.z;\n" - " out0_c4 += w0_ic1_oc4[3] * in_c4.w;\n" - " w0_ic1_oc4 += 4;\n" - " }\n" - " }\n" - " else\n" - " {\n" - " w0_ic1_oc4 += 4 * CI_SLICES;\n" - " }\n" - " }\n" - " }\n\n"; - - if (has_bias_) { - code += " out0_c4 = out0_c4 + bias[co_slice];\n"; - } - - if (param->act_type_ == ActType_Relu) { - code += " out0_c4 = max(out0_c4, (FLT4)(0.0f));\n"; - } else if (param->act_type_ == ActType_Relu6) { - code += " out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n"; - } - if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) { - code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}"; - } else { - code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}"; - } - return code; -} - -std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { - auto param = reinterpret_cast(op_parameter_); - const size_t strideH = param->stride_h_; - const size_t strideW = param->stride_w_; - const size_t padTop = param->pad_u_; - const size_t padBottom = param->pad_d_; - const size_t padLeft = param->pad_l_; - - std::string code; - - if (use_fp16_) { - code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } - - code += - "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" - "\n" - "__kernel void Convolution(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n" - " __global FLT4 *weight,\n"; - if (has_bias_) { - code += " __global FLT4 *bias,\n"; - } - code += - " const int4 input_shape,\n" - " const int4 output_shape)\n" - "{\n"; - - code += " int n_oh = get_global_id(0); // [0, N*OH)\n"; - if (batch_size_ == 1) { - code += " #define n 0\n"; - code += " int oh = n_oh;\n"; - } else { - code += " int n = n_oh / " + std::to_string(OH_) + ";\n"; - code += " int oh = n_oh % " + std::to_string(OH_) + ";\n"; - } - - code += - " int ow = get_global_id(1) * 2;\n" - " int co_slice = get_global_id(2);\n" - "\n" - " int CI_SLICES = input_shape.w;\n" - " int CO_SLICES = output_shape.w;\n\n"; - - code += " #define N " + std::to_string(batch_size_) + "\n"; - code += " #define N_OH " + std::to_string(batch_size_ * OH_) + "\n"; - code += " #define IH " + std::to_string(IH_) + "\n"; - code += " #define IW " + std::to_string(IW_) + "\n"; - code += " #define OH " + std::to_string(OH_) + "\n"; - code += " #define OW " + std::to_string(OW_) + "\n"; - code += " #define KH " + std::to_string(KH_) + "\n"; - code += " #define KW " + std::to_string(KW_) + "\n"; - code += " #define strideH " + std::to_string(strideH) + "\n"; - code += " #define strideW " + std::to_string(strideW) + "\n"; - code += " #define padTop " + std::to_string(padTop) + "\n"; - code += " #define padLeft " + std::to_string(padLeft) + "\n"; - code += " #define dilationH " + std::to_string(param->dilation_h_) + "\n"; - code += " #define dilationW " + std::to_string(param->dilation_w_) + "\n"; - - code += - " if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n" - " return;\n" - " }\n"; - - bool check_ow = (OW_ % 2) == 1; - if (check_ow) { - code += - " int last_is_double = 1;\n" - " if (ow + 1 >= OW)\n" - " last_is_double = 0;\n\n"; - } - - code += - " FLT4 out0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " FLT4 out1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " __global FLT4 *w = weight + co_slice * KH * KW * CI_SLICES * 4;\n" - "\n" - " for (int kh = 0; kh < KH; ++kh)\n" - " {\n" - " int ih = kh * dilationH + oh * strideH - padTop;\n" - " for (int kw = 0; kw < KW; ++kw)\n" - " {\n"; - - if (padTop || padBottom) { - code += - "if (ih >= 0 && ih < IH)\n" - "{\n"; - } - - code += " int iw0 = kw * dilationW + (ow + 0) * strideW - padLeft;\n"; - if (check_ow) { - code += - " if (last_is_double)\n" - " {\n"; - } - - code += - " int iw1 = kw * dilationW + (ow + 1) * strideW - padLeft;\n" - " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" - " {\n" - " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n" - " out0 += w[0] * in0.x;\n" - " out0 += w[1] * in0.y;\n" - " out0 += w[2] * in0.z;\n" - " out0 += w[3] * in0.w;\n" - " FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(iw1, (n * CI_SLICES + ci_slice) * IH + ih));\n" - " out1 += w[0] * in1.x;\n" - " out1 += w[1] * in1.y;\n" - " out1 += w[2] * in1.z;\n" - " out1 += w[3] * in1.w;\n" - " w += 4;\n" - " }\n"; - if (check_ow) { - code += - " }\n" - " else\n" - " {\n" - " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" - " {\n" - " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n" - " out0 += w[0] * in0.x;\n" - " out0 += w[1] * in0.y;\n" - " out0 += w[2] * in0.z;\n" - " out0 += w[3] * in0.w;\n" - " w += 4;\n" - " }\n" - " }\n"; - } - if (padTop || padBottom) { - code += - "}\n" - "else\n" - "{\n" - " w += CI_SLICES * 4;\n" - "}\n"; - } - code += - " }\n" - " }\n\n"; - - if (has_bias_) { - code += " out0 = out0 + bias[co_slice];\n"; - } - - if (param->act_type_ == ActType_Relu) { - code += " out0 = max(out0, (FLT4)(0.0f));\n"; - } else if (param->act_type_ == ActType_Relu6) { - code += " out0 = clamp(out0, (FLT4)(0.0f), (FLT4)(6.0f));\n"; - } - code += " WRITE_IMAGE(output, (int2)(ow + 0, (n * CO_SLICES + co_slice) * OH + oh), out0);\n"; - - if (check_ow) { - code += - " if (last_is_double)" - " {\n"; - } - if (has_bias_) { - code += " out1 = out1 + bias[co_slice];\n"; - } - if (param->act_type_ == ActType_Relu) { - code += " out1 = max(out1, (FLT4)(0.0f));\n"; - } else if (param->act_type_ == ActType_Relu6) { - code += " out1 = clamp(out1, (FLT4)(0.0f), (FLT4)(6.0f));\n"; - } - code += " WRITE_IMAGE(output, (int2)(ow + 1, (n * CO_SLICES + co_slice) * OH + oh), out1);\n"; - if (check_ow) { - code += "}\n"; - } - code += "}\n"; - - return code; -} - -std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { - std::string code; - code += - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" - "#define PAD 1\n" - "\n" - "__constant sampler_t\n" - "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" - "\n" - "constant FLT Bt[36] = {\n" - " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" - " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" - " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" - " 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n" - " 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n" - " 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n" - "};\n" - "\n" - "__kernel void Winograd4x4To36(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n" - " int4 input_shape, // N H W CI_SLICES\n" - " int4 output_shape) // N 36 H/4*W/4 CI_SLICES\n" - "{\n" - " int tile_xy = get_global_id(0);\n" - " int row = get_global_id(1);\n" - " int slice = get_global_id(2);\n" - "\n" - " int TILE_XY = output_shape.z;\n" - " int SLICES = input_shape.w;\n" - " if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n" - " {\n" - " return;\n" - " }\n" - "\n" - " int IH = input_shape.y, IW = input_shape.z;\n" - " int TILE_X = UP_DIV(IW, 4);\n" - " int tile_x = tile_xy % TILE_X;\n" - " int tile_y = tile_xy / TILE_X;\n" - "\n" - " constant FLT *Bt_row = Bt + row * 6;\n" - " FLT4 BtD_row[6] = {0};\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " int ih = tile_y * 4 - PAD + y;\n"; - - if (op_format_ == Format_NHWC4) { - code += " int y_idx = ih;\n"; - } else if (op_format_ == Format_NC4HW4) { - code += - " if(ih < 0 || ih >= IH) {continue;}\n" - " int y_idx = slice * IH + ih;\n"; - } - - code += - " for (int x = 0; x < 6; x++)\n" - " {\n" - " int iw = tile_x * 4 - PAD + x;\n"; - - if (op_format_ == Format_NHWC4) { - code += - " if(iw < 0 || iw >= IW) {continue;}\n" - " int x_idx = iw * SLICES + slice;\n"; - } else if (op_format_ == Format_NC4HW4) { - code += " int x_idx = iw;\n"; - } - - code += - " BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n" - " }\n" - " }\n" - "\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " for (int x = 0; x < 6; x++)\n" - " {\n" - " acc += BtD_row[x] * Bt[y * 6 + x];\n" - " }\n" - " WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" - " }\n" - "}"; - return code; -} - -std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { - return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define CI_TILE 4\n" - "#define H 36\n" - "__constant sampler_t\n" - "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" - "\n" - "__kernel void WinogradConvolution(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n" - " __global FLT16 *weight,\n" - " int4 input_shape, // N 36 H/4*W/4 CI_SLICES\n" - " int4 output_shape) // N 36 H/4*W/4 CO_SLICES\n" - "{\n" - " int w = get_global_id(0) * 2;\n" - " int h = get_global_id(1);\n" - " int co_slice = get_global_id(2) * 2;\n" - "\n" - " int CI_SLICES = input_shape.w;\n" - " int W = input_shape.z;\n" - " int CO_SLICES = output_shape.w;\n" - "\n" - " if (h >= H || w >= W || co_slice >= CO_SLICES)\n" - " {\n" - " return;\n" - " }\n" - "\n" - " FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - "\n" - " int y_idx = h;\n" - " __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n" - " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" - " {\n" - " FLT4 in0 = READ_IMAGE(input, smp_none, (int2)(w + 0, y_idx));\n" - " FLT4 in1 = READ_IMAGE(input, smp_none, (int2)(w + 1, y_idx));\n" - " y_idx += 36;\n" - "\n" - " FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n" - " weight_ptr += 2;\n" - "\n" - "\n" - " out00 += in0.x * weight0.s0123;\n" - " out00 += in0.y * weight0.s4567;\n" - " out00 += in0.z * weight0.s89ab;\n" - " out00 += in0.w * weight0.scdef;\n" - "\n" - " out01 += in1.x * weight0.s0123;\n" - " out01 += in1.y * weight0.s4567;\n" - " out01 += in1.z * weight0.s89ab;\n" - " out01 += in1.w * weight0.scdef;\n" - "\n" - " out10 += in0.x * weight1.s0123;\n" - " out10 += in0.y * weight1.s4567;\n" - " out10 += in0.z * weight1.s89ab;\n" - " out10 += in0.w * weight1.scdef;\n" - "\n" - " out11 += in1.x * weight1.s0123;\n" - " out11 += in1.y * weight1.s4567;\n" - " out11 += in1.z * weight1.s89ab;\n" - " out11 += in1.w * weight1.scdef;\n" - " }\n" - "\n" - " WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n" - " if (w + 1 < W)\n" - " {\n" - " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n" - " }\n" - "\n" - " if (co_slice + 1 < CO_SLICES)\n" - " {\n" - " WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n" - " if (w + 1 < W)\n" - " {\n" - " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n" - " }\n" - " }\n" - "}"; -} - -std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { - std::string code = - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" - "\n" - "__constant sampler_t\n" - "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" - "\n" - "constant FLT At[24] = {\n" - " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" - " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" - " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" - " 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n" - "};\n" - "\n" - "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n"; - if (has_bias_) { - code += " __global FLT4 *bias,\n"; - } - code += - " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" - " int4 output_shape) // N H W CO_SLICES\n" - "{\n" - " int tile_xy = get_global_id(0);\n" - " int row = get_global_id(1);\n" - " int slice = get_global_id(2);\n" - "\n" - " int TILE_XY = input_shape.z;\n" - " int SLICES = input_shape.w;\n" - " int OH = output_shape.y;\n" - " int OW = output_shape.z;\n" - "\n" - " if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n" - " {\n" - " return;\n" - " }\n" - "\n" - " constant FLT *At_row = At + row * 6;\n" - " FLT4 AtM_row[6] = {0};\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " for (int x = 0; x < 6; x++)\n" - " {\n" - " AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + x));\n" - " }\n" - " }\n" - "\n" - " int TILE_X = UP_DIV(OW, 4);\n" - " for (int x = 0; x < 4; x++)\n" - " {\n" - " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " acc += AtM_row[y] * At[x * 6 + y];\n" - " }\n"; - if (has_bias_) { - code += " acc += bias[slice];\n"; - } - - auto param = reinterpret_cast(op_parameter_); - if (param->act_type_ == ActType_Relu) { - code += " acc = max(acc, (FLT4)(0.0f));\n\n"; - } else if (param->act_type_ == ActType_Relu6) { - code += " acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n\n"; - } - - code += - " int tile_x = tile_xy % TILE_X;\n" - " int tile_y = tile_xy / TILE_X;\n" - " int ow = tile_x * 4 + x;\n" - " int oh = tile_y * 4 + row;\n"; - - if (op_format_ == Format_NHWC4) { - code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n"; - } else if (op_format_ == Format_NC4HW4) { - code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n"; - } - - code += - " }\n" - "}"; - return code; -} - int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std::vector *local) { constexpr size_t work_group_size[] = {4, 4, 1}; auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize(); @@ -868,10 +320,8 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std local_nh = global_nh / 2; } - if (op_format_ == Format_NHWC4) { - if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) { - local_w = 4; - } + if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) { + local_w = 4; } global->clear(); @@ -882,14 +332,6 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std local->push_back(local_nh); local->push_back(local_w); local->push_back(local_c); - - if (op_format_ == Format_NC4HW4) { - // calculate 2 FLT4 along width per work-item - global->at(1) = UP_DIV(global->at(1), 2); - if (local->at(1) > global->at(1)) { - local->at(1) = global->at(1); - } - } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index 6191872d64..75c1094193 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -42,11 +42,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int InitWeight(); int InitBias(); int GenerateWinogradWeight(); - std::string CodeGenConvolutionNHWC4(); - std::string CodeGenConvolutionNC4HW4(); - std::string CodeGenWinograd4x4To36(); - std::string CodeGenWinogradConvolution(); - std::string CodeGenWinograd36To4x4(); int SetGlobalLocalConv(std::vector *global, std::vector *local); size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); } @@ -62,38 +57,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { return attr_valid && channel_good && hw_good; } - std::string get_code_id() { - auto param = reinterpret_cast(op_parameter_); - std::vector vpara{batch_size_, - CI_, - IH_, - IW_, - CO_, - OH_, - OW_, - KH_, - KW_, - param->stride_h_, - param->stride_w_, - param->pad_u_, - param->pad_l_, - param->pad_d_, - param->pad_r_, - param->dilation_h_, - param->dilation_w_, - has_bias_, - use_fp16_, - op_format_, - param->act_type_}; - std::string code_id; - for (auto &iv : vpara) { - code_id += "_" + std::to_string(iv); - } - return code_id; - } - bool use_fp16_{false}; - const schema::Format op_format_{schema::Format_NHWC4}; int batch_size_{}; int CI_{}; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc index 67185d8718..da72fe4b21 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc @@ -16,7 +16,6 @@ #include #include "src/common/log_adapter.h" #include "common/common_test.h" -#include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h" @@ -29,10 +28,7 @@ using mindspore::lite::Tensor; using mindspore::schema::Format; using mindspore::schema::NodeType_ValueNode; using mindspore::schema::Format::Format_KHWC; -using mindspore::schema::Format::Format_NC4HW4; -using mindspore::schema::Format::Format_NCHW; using mindspore::schema::Format::Format_NHWC; -using mindspore::schema::Format::Format_NHWC4; namespace mindspore { @@ -67,70 +63,38 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) { } printf("\n"); - float max_err = -1.0f; - std::array idx_5d{}; - int max_err_idx = -1, first_err_idx = -1; - auto SLICES = UP_DIV(output->Channel(), 4); - int I = 1, J = 1, K = 1, L = 1, M = 1; - switch (output->GetFormat()) { - case Format_NHWC: - I = output->Batch(), J = output->Height(), K = output->Width(), L = output->Channel(); - break; - case Format_NCHW: - I = output->Batch(), J = output->Channel(), K = output->Height(), L = output->Width(); - break; - case Format_NHWC4: - I = output->Batch(), J = output->Height(), K = output->Width(), L = SLICES, M = 4; - break; - case Format_NC4HW4: - I = output->Batch(), J = SLICES, K = output->Height(), L = output->Width(), M = 4; - break; - default: - break; - } - - int cn = 0; - for (int i = 0; i < I; ++i) { - for (int j = 0; j < J; ++j) { - for (int k = 0; k < K; ++k) { - for (int l = 0; l < L; ++l) { - for (int m = 0; m < M; ++m) { - auto err = std::fabs(output_data[cn] - expect_data[cn]); - if (first_err_idx == -1 && err > atol) { - first_err_idx = cn; - } - if (err > max_err) { - max_err = err; - idx_5d = {i, j, k, l, m}; - max_err_idx = cn; - } - cn++; + bool not_equal = false; + int idx = 0; + std::array idx_4d{}; + auto N = output->Batch(), H = output->Height(), W = output->Width(), C = output->Channel(); + for (int i = 0, cn = 0; i < N; ++i) { + for (int j = 0; j < H; ++j) { + for (int k = 0; k < W; ++k) { + for (int l = 0; l < C; ++l) { + auto err = std::fabs(output_data[cn] - expect_data[cn]); + if (err > atol) { + not_equal = true; + idx_4d = {i, j, k, l}; + goto End; } + cn++; } } } } - if (max_err > atol) { - printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx], - output_data[first_err_idx]); +End: + if (not_equal) { + printf("first error at [%d %d %d %d] expect=%.3f output=%.3f\n", idx_4d[0], idx_4d[1], idx_4d[2], idx_4d[3], + expect_data[idx], output_data[idx]); FAIL(); } else { - float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx])); - if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) { - printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]); - } else { - printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]); - } - printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx], - output_data[max_err_idx], max_err, relative_err * 100); printf("COMPARE SUCCESS!\n\n"); } } -void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format, - const TypeId data_type, const float atol, const float *input_data, const float *weight_data, - const float *bias_data, const float *expect_data) { +void TEST_MAIN(const std::string &attr, const TypeId data_type, const float atol, const float *input_data, + const float *weight_data, const float *bias_data, const float *expect_data) { auto param = static_cast(malloc(sizeof(ConvParameter))); if (param == nullptr) { MS_LOG(ERROR) << "ConvParameter create error."; @@ -145,7 +109,8 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma ¶m->dilation_h_, ¶m->dilation_w_); MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + auto ocl_runtime = runtime_wrapper.GetInstance(); ocl_runtime->Init(); ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); auto allocator = ocl_runtime->GetAllocator(); @@ -155,19 +120,24 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma std::vector weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector bias_shape = {param->output_channel_}; std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; - auto input = Tensor(data_type, input_shape, input_format, lite::TensorCategory(NodeType_ValueNode)); + auto input = Tensor(data_type, input_shape, Format_NHWC, lite::TensorCategory(NodeType_ValueNode)); auto weight = Tensor(data_type, weight_shape, Format_KHWC, lite::TensorCategory(NodeType_ValueNode)); auto bias = Tensor(data_type, bias_shape, Format_KHWC, lite::TensorCategory(NodeType_ValueNode)); - auto output = Tensor(data_type, output_shape, output_format, lite::TensorCategory(NodeType_ValueNode)); + auto output = Tensor(data_type, output_shape, Format_NHWC, lite::TensorCategory(NodeType_ValueNode)); MS_LOG(DEBUG) << "allocate memory and initialize weight/bias"; weight.MallocData(); - bias.MallocData(); LoadData(&weight, weight_data); - LoadData(&bias, bias_data); + if (bias_data) { + bias.MallocData(); + LoadData(&bias, bias_data); + } MS_LOG(DEBUG) << "create OpenCL Kernel"; - std::vector inputs{&input, &weight, &bias}; + std::vector inputs{&input, &weight}; + if (bias_data) { + inputs.push_back(&bias); + } std::vector outputs{&output}; auto kernel = std::make_unique(reinterpret_cast(param), inputs, outputs); kernel->Init(); @@ -186,132 +156,56 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma MS_LOG(DEBUG) << "release resources"; weight.FreeData(); - bias.FreeData(); - input.SetData(nullptr); - output.SetData(nullptr); + if (bias_data) { + bias.FreeData(); + } delete sub_graph; } -void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format, - const TypeId data_type, const float atol, const std::string &data_path) { - auto testcase_path = data_path + "/" + attr + "/"; - std::map format_str{ - {Format_NCHW, "NCHW"}, {Format_NHWC, "NHWC"}, {Format_NHWC4, "NHWC4"}, {Format_NC4HW4, "NC4HW4"}}; - auto input_file = testcase_path + "input_" + format_str[input_format] + ".bin"; - auto weight_file = testcase_path + "weight_OHWI.bin"; - auto bias_file = testcase_path + "bias_C.bin"; - auto expect_file = testcase_path + "expect_" + format_str[output_format] + ".bin"; - MS_LOG(DEBUG) << "input_file :" << input_file; - MS_LOG(DEBUG) << "weight_file :" << weight_file; - MS_LOG(DEBUG) << "bias_file :" << bias_file; - MS_LOG(DEBUG) << "expect_file :" << expect_file; - - size_t dst_size; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_file.c_str(), &dst_size)); - auto weight_data = reinterpret_cast(mindspore::lite::ReadFile(weight_file.c_str(), &dst_size)); - auto bias_data = reinterpret_cast(mindspore::lite::ReadFile(bias_file.c_str(), &dst_size)); - auto expect_data = reinterpret_cast(mindspore::lite::ReadFile(expect_file.c_str(), &dst_size)); - printf("input [0-3]: %7.3f %7.3f %7.3f\n", input_data[0], input_data[1], input_data[2]); - printf("weight[0-3]: %7.3f %7.3f %7.3f\n", weight_data[0], weight_data[1], weight_data[2]); - printf("bias [0-3]: %7.3f %7.3f %7.3f\n", bias_data[0], bias_data[1], bias_data[2]); - printf("expect[0-3]: %7.3f %7.3f %7.3f\n", expect_data[0], expect_data[1], expect_data[2]); - - TEST_MAIN(attr, input_format, output_format, op_format, data_type, atol, input_data, weight_data, bias_data, - expect_data); -} - -TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { - std::string attr = - "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" - "1x1"; - // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, - // "testcases/mobilenetv2_fp32/"); TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, - // 2e-2f, "testcases/mobilenetv2_fp32/"); - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/"); - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/"); -} - -TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { - std::string attr = - "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_" - "1x1"; - // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); - // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); -} - -TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) { +TEST_F(TestConvolutionOpenCL, test0) { std::string attr = "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float bias_data[] = {0.0f, 0.0f}; float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f}; - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); -} - -TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) { - std::string attr = - "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; - float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f}; - float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - float bias_data[] = {0.0f, 0.0f}; - float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f}; - TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) { +TEST_F(TestConvolutionOpenCL, test0_no_bias) { std::string attr = "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; - float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f}; + float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - float bias_data[] = {0.0f, 0.0f}; - float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f, - 9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f}; - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, - bias_data, expect_data); - TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, - bias_data, expect_data); - TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, - bias_data, expect_data); - TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, - bias_data, expect_data); + float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f}; + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, nullptr, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, nullptr, expect_data); } -TEST_F(TestConvolutionOpenCL, simple_test1) { +TEST_F(TestConvolutionOpenCL, test1) { std::string attr = "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; float bias_data[] = {0.5f, -0.5f}; float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f}; - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, simple_test2) { +TEST_F(TestConvolutionOpenCL, test2) { std::string attr = "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1"; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float bias_data[] = {0.0f}; float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f}; - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, simple_test3) { +TEST_F(TestConvolutionOpenCL, test3) { std::string attr = "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1"; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; @@ -319,13 +213,11 @@ TEST_F(TestConvolutionOpenCL, simple_test3) { 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; float bias_data[] = {0.5f, -0.5f}; float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f}; - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, simple_test3_batch2) { +TEST_F(TestConvolutionOpenCL, test3_batch2) { std::string attr = "inputNHWC_2x2x2x2_outputNHWC_2x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1"; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; @@ -334,14 +226,8 @@ TEST_F(TestConvolutionOpenCL, simple_test3_batch2) { float bias_data[] = {0.5f, -0.5f}; float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f, 168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f}; - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, - expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, - bias_data, expect_data); - TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, - bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } } // namespace mindspore