From 04e4cba6f00604996bd2c29537e4c7be9ff9622d Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Fri, 31 Jul 2020 23:57:36 +0800 Subject: [PATCH] update lite gpu opencl convolution kernel --- .../kernel/opencl/cl/fp32/convolution.cl | 194 ++++++++++++++---- .../kernel/opencl/kernel/convolution.cc | 155 ++++++++------ .../kernel/opencl/kernel/convolution.h | 11 +- 3 files changed, 252 insertions(+), 108 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl index af7f858a87..8007438267 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl @@ -1,87 +1,191 @@ #define CI_TILE 4 #define CO_TILE 4 - #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) +//#define __global //#pragma OPENCL EXTENSION cl_arm_printf : enable __kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias, __global float *output, - const uint4 input_shape, // NHWC - const uint4 weight_shape, // OHWI - const uint4 output_shape, // NHWC - const uint2 stride, // HW - const uint4 pad) // top bottom left right + const int4 input_shape, // NHWC + const int4 output_shape, // NHWC + const int4 kernel_stride, // kernelHW_strideHW + const int4 pad) // top bottom left right { - uint ow = get_global_id(0); - uint oh = get_global_id(1); - uint co_outer = get_global_id(2); + int ow = get_global_id(0); + int oh = get_global_id(1); + int co_slice = get_global_id(2); + + int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; + int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; + int KH = kernel_stride.x, KW = kernel_stride.y; + int strideH = kernel_stride.z, strideW = kernel_stride.w; + int padTop = pad.x, padLeft = pad.z; + int CI_SLICES = UP_DIV(CI, CI_TILE); + int CO_SLICES = UP_DIV(CO, CO_TILE); - uint CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; - uint CO = output_shape.w, OW = output_shape.z; - uint KH = weight_shape.y, KW = weight_shape.z; - uint stride_h = stride.x, stride_w = stride.y; - uint pad_top = pad.x, pad_left = pad.z; - uint CI_TILE_NUM = UP_DIV(CI, CI_TILE); - uint CO_TILE_NUM = UP_DIV(CO, CO_TILE); + if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) + return; float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); - for (uint kh = 0; kh < KH; ++kh) + for (int kh = 0; kh < KH; ++kh) { - uint ih = kh + oh * stride_h - pad_top; - for (uint kw = 0; kw < KW; ++kw) + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) { - uint iw = kw + ow * stride_w - pad_left; - for (uint ci_outer = 0; ci_outer < CI_TILE_NUM; ++ci_outer) + int iw = kw + ow * strideW - padLeft; + for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) { - for (uint ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) + for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) { - uint ci = ci_outer * CI_TILE + ci_inner; + int ci = ci_slice * CI_TILE + ci_inner; if (ci >= CI) break; - uint input_idx = ih * IW * CI + iw * CI + ci; + int input_idx = ih * IW * CI + iw * CI + ci; float value = 0; if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) value = 0; else value = input[input_idx]; - uint CO_TILE_OFFSET = KH * KW * CI; - uint weight_idx = (co_outer * CO_TILE) * CO_TILE_OFFSET + - kh * KW * CI + - kw * CI + - ci; - acc.x += weight[weight_idx + 0 * CO_TILE_OFFSET] * value; - acc.y += weight[weight_idx + 1 * CO_TILE_OFFSET] * value; - acc.z += weight[weight_idx + 2 * CO_TILE_OFFSET] * value; - 
acc.w += weight[weight_idx + 3 * CO_TILE_OFFSET] * value; + int CO_OFFSET = KH * KW * CI; + int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + + kh * KW * CI + + kw * CI + + ci; + acc.x += weight[weight_idx + 0 * CO_OFFSET] * value; + acc.y += weight[weight_idx + 1 * CO_OFFSET] * value; + acc.z += weight[weight_idx + 2 * CO_OFFSET] * value; + acc.w += weight[weight_idx + 3 * CO_OFFSET] * value; } } } } - uint output_idx = oh * OW * CO + ow * CO + (co_outer * CO_TILE); - if (co_outer < CO_TILE_NUM - 1 || CO % CO_TILE == 0) + int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE); + if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) { - output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2]; - output[output_idx + 3] = acc.w + bias[co_outer * CO_TILE + 3]; + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; + output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; + output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3]; } else if (CO % CO_TILE == 1) { - output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0]; + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; } else if (CO % CO_TILE == 2) { - output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1]; + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; } else if (CO % CO_TILE == 3) { - output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2]; + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; + output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; + } +} + + + +//#pragma OPENCL EXTENSION cl_khr_fp16 : enable +//#define FLT4 half4 +#define FLT4 float4 +__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, + __global FLT4 *weight, + __global FLT4 *bias, + __global FLT4 *output, + const int4 input_shape, // NHWC + const int4 output_shape, // NHWC + const int4 kernel_stride, // kernelHW_strideHW + const int4 pad) // top bottom left right +{ + int oh = get_global_id(0); // [0, OH) + int ow = get_global_id(1); // [0, OW) + int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) + + int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; + int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; + int CI_SLICES = UP_DIV(CI, CI_TILE); + int CO_SLICES = UP_DIV(CO, CO_TILE); + int KH = kernel_stride.x, KW = kernel_stride.y; + int strideH = kernel_stride.z, strideW = kernel_stride.w; + int padTop = pad.x, padLeft = pad.z; + + if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) + return; + if (2 * co_slice + 1 >= CO_SLICES) + { + FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; + for (int kh = 0; kh < KH; ++kh) + { + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) + { + int iw = kw + ow * strideW - padLeft; + if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) + { + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) + { + FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + 
ci_slice]; + out0_c4 += w0_ic1_oc4[0] * in_c4.x; + out0_c4 += w0_ic1_oc4[1] * in_c4.y; + out0_c4 += w0_ic1_oc4[2] * in_c4.z; + out0_c4 += w0_ic1_oc4[3] * in_c4.w; + w0_ic1_oc4 += 4; + } + } + else + { + w0_ic1_oc4 += 4 * CI_SLICES; + } + } + } + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; + } + else + { + FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; + __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE; + for (int kh = 0; kh < KH; ++kh) + { + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) + { + int iw = kw + ow * strideW - padLeft; + if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) + { + int idx = ih * IW * CI_SLICES + iw * CI_SLICES; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) + { + FLT4 in_c4 = input[idx + ci_slice]; + + out0_c4 += w0_ic1_oc4[0] * in_c4.x; + out0_c4 += w0_ic1_oc4[1] * in_c4.y; + out0_c4 += w0_ic1_oc4[2] * in_c4.z; + out0_c4 += w0_ic1_oc4[3] * in_c4.w; + w0_ic1_oc4 += 4; + + out1_c4 += w1_ic1_oc4[0] * in_c4.x; + out1_c4 += w1_ic1_oc4[1] * in_c4.y; + out1_c4 += w1_ic1_oc4[2] * in_c4.z; + out1_c4 += w1_ic1_oc4[3] * in_c4.w; + w1_ic1_oc4 += 4; + } + } + else + { + w0_ic1_oc4 += 4 * CI_SLICES; + w1_ic1_oc4 += 4 * CI_SLICES; + } + } + } + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1]; } } \ No newline at end of file diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index fbb1f2f58c..3e9a2e7d04 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -14,16 +14,12 @@ * limitations under the License. 
 */
 
-#include "src/runtime/kernel/opencl/kernel/convolution.h"
-#include 
 #include 
 #include 
-#include "schema/model_generated.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/opencl/opencl_runtime.h"
-#ifndef PROGRAM_WITH_IL
+#include 
+#include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
-#endif
+#include "src/kernel_registry.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() {
     MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!";
   }
 
-  outputs_[0]->SetFormat(schema::Format_NHWC4);
-  io_dataformat_ = outputs_[0]->GetFormat();
+  auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC;
+  auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4;
+  if (!io_NHWC && !io_NHWC4) {
+    MS_LOG(ERROR) << "input and output data_format is invalid!";
+  }
+  io_dataformat_ = inputs_[0]->GetFormat();
 
   if (inputs_[1]->GetFormat() != schema::Format_KHWC) {
     MS_LOG(ERROR) << "weight data_format is invalid!";
   }
 
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  std::string kernel_name = "convolution_NHWC_OHWI";
-#ifdef PROGRAM_WITH_IL
-  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
-#else
   std::set<std::string> build_options;
   std::string source = convolution_source_fp32;
   std::string program_name = "convolution";
+  std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI";
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
-#endif
-  this->InitBuffer();
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+
   return 0;
 }
 
 int ConvolutionOpenCLKernel::InitBuffer() {
@@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() {
     memcpy_s(packed_bias_, bias_tensor->Size(), bias_tensor->Data(), bias_tensor->Size());
     allocator->UnmapBuffer(packed_bias_);
   } else if (io_dataformat_ == schema::Format_NHWC4) {
+    // OHWI -> OHWIIO
     auto weight_shape = weight_tensor->shape();
     size_t CO = weight_shape[0];
     size_t KH = weight_shape[1];
    size_t KW = weight_shape[2];
     size_t CI = weight_shape[3];
-    size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
-    size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
-    size_t weight_size_tiled = CO_ALIGN * KH * KW * CI_ALIGN * sizeof(float);
+    size_t CI_SLICES = UP_DIV(CI, C4NUM);
+    size_t CO_SLICES = UP_DIV(CO, C4NUM);
+    constexpr size_t CI_TILE = C4NUM;
+    constexpr size_t CO_TILE = C4NUM;
+    size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
 
-    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_size_tiled));
+    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
     packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
-    memset_s(packed_weight_, weight_size_tiled, 0x00, weight_size_tiled);
+    memset_s(packed_weight_, packed_weight_size, 0x00, packed_weight_size);
     auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
     for (int co = 0; co < CO; ++co) {
       for (int kh = 0; kh < KH; ++kh) {
         for (int kw = 0; kw < KW; ++kw) {
           for (int ci = 0; ci < CI; ++ci) {
-            packed_weight_[co * KH * KW * CI_ALIGN + kh * KW * CI_ALIGN + kw * CI_ALIGN + ci] =
-              weight_data[co * KH * KW * CI + kh * KW * CI + kw * CI + ci];
+            auto co_outer = co / CO_TILE;
+            auto co_inner = co % CO_TILE;
+            auto ci_outer = ci / CI_TILE;
+            auto ci_inner = ci % CI_TILE;
+            packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
+                           co_inner] = *(weight_data++);
           }
         }
       }
     }
     allocator->UnmapBuffer(packed_weight_);
-
-    size_t bias_size_tiled = CO_ALIGN * sizeof(float);
-    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_size_tiled));
+    size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
+    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
     packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
-    memset_s(packed_bias_, bias_size_tiled, 0x00, bias_size_tiled);
+    memset_s(packed_bias_, packed_bias_size, 0x00, packed_bias_size);
     auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
     for (int co = 0; co < CO; ++co) {
       packed_bias_[co] = bias_data[co];
@@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() {
   }
 
   return 0;
-}
+}  // namespace mindspore::kernel
 
 int ConvolutionOpenCLKernel::ReSize() { return 0; }
 
+static int GetBiggestDivider(int x, int y) {
+  for (int i = y; i != 0; i--) {
+    if (x % i == 0) {
+      return i;
+    }
+  }
+  return 1;
+}
+
+static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
+  constexpr size_t work_group_size[] = {4, 4, 1};
+  constexpr size_t max_work_item_sizes[] = {512, 512, 512};
+  constexpr size_t max_work_group_size = 512;
+  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
+
+  // initialize global from OH, OW and CO_SLICES first, aligned up to the 4/4/1 work-group size
+  size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
+  size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
+  size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];
+
+  // then derive local from global using the sizing strategy
+  size_t local_c = GetBiggestDivider(global_c, max_z_size);
+  size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
+  size_t local_w = std::min(global_w, local_hw_size);
+  size_t local_h = std::min(local_hw_size / local_w, global_h);
+  if (local_h == global_h && global_h % 2 == 0) {
+    local_h = global_h / 2;
+  }
+
+  global->clear();
+  global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
+  global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
+  global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c);
+  local->clear();
+  local->push_back(local_h);
+  local->push_back(local_w);
+  local->push_back(local_c);
+}
+
 int ConvolutionOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->Name() << " Running!";
+  MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto param = reinterpret_cast<ConvParameter *>(opParameter);
 
   auto input0_shape = inputs_[0]->shape();   // NHWC
   auto input1_shape = inputs_[1]->shape();   // OHWI
   auto outpu0_shape = outputs_[0]->shape();  // NHWC
-  cl_uint N = input0_shape[0];
-  cl_uint CI = input0_shape[3];
-  cl_uint IH = input0_shape[1];
-  cl_uint IW = input0_shape[2];
-  cl_uint CO = outpu0_shape[3];
-  cl_uint OH = outpu0_shape[1];
-  cl_uint OW = outpu0_shape[2];
-  cl_uint KH = input1_shape[1];
-  cl_uint KW = input1_shape[2];
-  cl_uint CI_TILE_NUM = UP_DIV(CI, C4NUM);
-  cl_uint CO_TILE_NUM = UP_DIV(CO, C4NUM);
-  cl_uint CI_ALIGN = CI_TILE_NUM * C4NUM;
-  cl_uint CO_ALIGN = CO_TILE_NUM * C4NUM;
-
-  cl_uint4 input_shape;
-  cl_uint4 weight_shape;
-  cl_uint4 output_shape;
+  cl_int N = input0_shape[0];
+  cl_int CI = input0_shape[3];
+  cl_int IH = input0_shape[1];
+  cl_int IW = input0_shape[2];
+  cl_int CO = outpu0_shape[3];
+  cl_int OH = outpu0_shape[1];
+  cl_int OW = outpu0_shape[2];
+  cl_int KH = input1_shape[1];
+  cl_int KW = input1_shape[2];
+  cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
+  cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
+
+  cl_int4 input_shape;
+  cl_int4 output_shape;
   if (io_dataformat_ == schema::Format_NHWC) {
     input_shape = {N, IH, IW, CI};
-    weight_shape = {CO, KH, KW, CI};
     output_shape = {N, OH, OW, CO};
   } else if (io_dataformat_ == schema::Format_NHWC4) {
     input_shape = {N, IH, IW, CI_ALIGN};
-    weight_shape = {CO_ALIGN, KH, KW, CI_ALIGN};
     output_shape = {N, OH, OW, CO_ALIGN};
   }
-  cl_uint2 stride = {static_cast<cl_uint>(param->stride_h_), static_cast<cl_uint>(param->stride_w_)};
-  cl_uint4 pad = {static_cast<cl_uint>(param->pad_u_), static_cast<cl_uint>(param->pad_d_),
-                  static_cast<cl_uint>(param->pad_l_), static_cast<cl_uint>(param->pad_r_)};
+  cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_};
+  cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
 
   int arg_cn = 0;
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
@@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() {
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, weight_shape);
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, stride);
+  ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride);
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad);
 
-  std::vector<size_t> global = {OW, OH, CO_TILE_NUM};
-  std::vector<size_t> local = {1, 1, CO_TILE_NUM};
-
+  std::vector<size_t> global;
+  std::vector<size_t> local;
+  GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local);
+  // float8 per thread: each work-item writes two CO slices, so halve the Z dimension
+  if (io_dataformat_ == schema::Format_NHWC4) {
+    local[2] = UP_DIV(local[2], 2);
+    global[2] = UP_DIV(global[2], 2);
+    global[2] = UP_DIV(global[2], local[2]) * local[2];
+  }
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
 
   return 0;
@@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector
-#include "src/runtime/kernel/arm/fp32/convolution.h"
+#include "src/ir/tensor.h"
+#include "src/lite_kernel.h"
+#include "schema/model_generated.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/arm/opclib/conv_parameter.h"
@@ -44,5 +46,4 @@ class ConvolutionOpenCLKernel : public LiteKernel {
 };
 }  // namespace mindspore::kernel
 
-#endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
-
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
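
For reference, the NHWC4 branch of InitBuffer() above repacks the OHWI weights into 4x4 channel tiles so that convolution_NHWC4_OHWIIO_float8 can read one FLT4 of input channels and four FLT4s of weights per step. The following host-side C++ sketch mirrors the index arithmetic of the patch but is not part of it; PackOHWIToOHWIIO, UpDiv and kTile are hypothetical names introduced only for this illustration.

// Illustrative sketch (not from the patch): OHWI -> "OHWIIO" repacking, i.e. a
// [CO_SLICES][KH][KW][CI_SLICES][CI_TILE][CO_TILE] layout with CI_TILE = CO_TILE = 4.
#include <cstddef>
#include <vector>

constexpr int kTile = 4;  // corresponds to C4NUM in the MindSpore sources
inline int UpDiv(int x, int y) { return (x + y - 1) / y; }

std::vector<float> PackOHWIToOHWIIO(const float *src, int CO, int KH, int KW, int CI) {
  const int ci_slices = UpDiv(CI, kTile);
  const int co_slices = UpDiv(CO, kTile);
  // zero-initialized so the padded tail channels stay zero, as memset_s does in InitBuffer()
  std::vector<float> dst(static_cast<size_t>(co_slices) * KH * KW * ci_slices * kTile * kTile, 0.0f);
  for (int co = 0; co < CO; ++co) {
    const int co_outer = co / kTile, co_inner = co % kTile;
    for (int kh = 0; kh < KH; ++kh) {
      for (int kw = 0; kw < KW; ++kw) {
        for (int ci = 0; ci < CI; ++ci) {
          const int ci_outer = ci / kTile, ci_inner = ci % kTile;
          // same index expression as the patch
          const size_t dst_idx =
            ((((static_cast<size_t>(co_outer) * KH + kh) * KW + kw) * ci_slices + ci_outer) * kTile + ci_inner) *
              kTile +
            co_inner;
          dst[dst_idx] = *(src++);  // src is consumed in plain OHWI order, as in InitBuffer()
        }
      }
    }
  }
  return dst;
}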
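
The NHWC4 launch path also halves the Z dimension of the ND-range, since each work-item of convolution_NHWC4_OHWIIO_float8 produces two CO slices ("float8 per thread"). A minimal sketch of that adjustment follows, assuming the intent of the final line is to keep the global size a multiple of the local size, as OpenCL requires when an explicit local size is passed to the runtime; AdjustForTwoSlicesPerThread and UpDiv are hypothetical names.

// Sketch only: work-size adjustment for the two-slices-per-thread kernel variant.
#include <cstddef>
#include <vector>

inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

void AdjustForTwoSlicesPerThread(std::vector<size_t> *global, std::vector<size_t> *local) {
  (*local)[2] = UpDiv((*local)[2], 2);    // half as many work-items along Z
  (*global)[2] = UpDiv((*global)[2], 2);  // half as many CO-slice pairs to cover
  (*global)[2] = UpDiv((*global)[2], (*local)[2]) * (*local)[2];  // re-align global to local
}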