update lite gpu opencl convolution kernel

pull/3830/head
wangdongxu 5 years ago
parent 6c4ee3f3d1
commit 04e4cba6f0

@ -1,87 +1,191 @@
#define CI_TILE 4
#define CO_TILE 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
//#define __global
//#pragma OPENCL EXTENSION cl_arm_printf : enable
__kernel void convolution_NHWC_OHWI(__global float *input,
__global float *weight,
__global float *bias,
__global float *output,
const uint4 input_shape, // NHWC
const uint4 weight_shape, // OHWI
const uint4 output_shape, // NHWC
const uint2 stride, // HW
const uint4 pad) // top bottom left right
const int4 input_shape, // NHWC
const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) // top bottom left right
{
uint ow = get_global_id(0);
uint oh = get_global_id(1);
uint co_outer = get_global_id(2);
int ow = get_global_id(0);
int oh = get_global_id(1);
int co_slice = get_global_id(2);
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z;
int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE);
uint CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
uint CO = output_shape.w, OW = output_shape.z;
uint KH = weight_shape.y, KW = weight_shape.z;
uint stride_h = stride.x, stride_w = stride.y;
uint pad_top = pad.x, pad_left = pad.z;
uint CI_TILE_NUM = UP_DIV(CI, CI_TILE);
uint CO_TILE_NUM = UP_DIV(CO, CO_TILE);
if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)
return;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for (uint kh = 0; kh < KH; ++kh)
for (int kh = 0; kh < KH; ++kh)
{
uint ih = kh + oh * stride_h - pad_top;
for (uint kw = 0; kw < KW; ++kw)
int ih = kh + oh * strideH - padTop;
for (int kw = 0; kw < KW; ++kw)
{
uint iw = kw + ow * stride_w - pad_left;
for (uint ci_outer = 0; ci_outer < CI_TILE_NUM; ++ci_outer)
int iw = kw + ow * strideW - padLeft;
for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
{
for (uint ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
{
uint ci = ci_outer * CI_TILE + ci_inner;
int ci = ci_slice * CI_TILE + ci_inner;
if (ci >= CI)
break;
uint input_idx = ih * IW * CI + iw * CI + ci;
int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0;
else
value = input[input_idx];
uint CO_TILE_OFFSET = KH * KW * CI;
uint weight_idx = (co_outer * CO_TILE) * CO_TILE_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_TILE_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_TILE_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_TILE_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_TILE_OFFSET] * value;
int CO_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
}
}
uint output_idx = oh * OW * CO + ow * CO + (co_outer * CO_TILE);
if (co_outer < CO_TILE_NUM - 1 || CO % CO_TILE == 0)
int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0)
{
output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_outer * CO_TILE + 3];
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
}
else if (CO % CO_TILE == 1)
{
output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
}
else if (CO % CO_TILE == 2)
{
output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
}
else if (CO % CO_TILE == 3)
{
output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2];
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
}
//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//#define FLT4 half4
#define FLT4 float4
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
__global FLT4 *weight,
__global FLT4 *bias,
__global FLT4 *output,
const int4 input_shape, // NHWC
const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) // top bottom left right
{
int oh = get_global_id(0); // [0, OH)
int ow = get_global_id(1); // [0, OW)
int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE);
int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z;
if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES)
return;
if (2 * co_slice + 1 >= CO_SLICES)
{
FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
for (int kh = 0; kh < KH; ++kh)
{
int ih = kh + oh * strideH - padTop;
for (int kw = 0; kw < KW; ++kw)
{
int iw = kw + ow * strideW - padLeft;
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
{
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
{
FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
out0_c4 += w0_ic1_oc4[0] * in_c4.x;
out0_c4 += w0_ic1_oc4[1] * in_c4.y;
out0_c4 += w0_ic1_oc4[2] * in_c4.z;
out0_c4 += w0_ic1_oc4[3] * in_c4.w;
w0_ic1_oc4 += 4;
}
}
else
{
w0_ic1_oc4 += 4 * CI_SLICES;
}
}
}
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
}
else
{
FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
__global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
for (int kh = 0; kh < KH; ++kh)
{
int ih = kh + oh * strideH - padTop;
for (int kw = 0; kw < KW; ++kw)
{
int iw = kw + ow * strideW - padLeft;
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
{
int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
{
FLT4 in_c4 = input[idx + ci_slice];
out0_c4 += w0_ic1_oc4[0] * in_c4.x;
out0_c4 += w0_ic1_oc4[1] * in_c4.y;
out0_c4 += w0_ic1_oc4[2] * in_c4.z;
out0_c4 += w0_ic1_oc4[3] * in_c4.w;
w0_ic1_oc4 += 4;
out1_c4 += w1_ic1_oc4[0] * in_c4.x;
out1_c4 += w1_ic1_oc4[1] * in_c4.y;
out1_c4 += w1_ic1_oc4[2] * in_c4.z;
out1_c4 += w1_ic1_oc4[3] * in_c4.w;
w1_ic1_oc4 += 4;
}
}
else
{
w0_ic1_oc4 += 4 * CI_SLICES;
w1_ic1_oc4 += 4 * CI_SLICES;
}
}
}
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
}
}

@ -14,16 +14,12 @@
* limitations under the License.
*/
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include <vector>
#include <string>
#include <set>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#ifndef PROGRAM_WITH_IL
#include <algorithm>
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
#endif
#include "src/kernel_registry.h"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() {
MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!";
}
outputs_[0]->SetFormat(schema::Format_NHWC4);
io_dataformat_ = outputs_[0]->GetFormat();
auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC;
auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4;
if (!io_NHWC && !io_NHWC4) {
MS_LOG(ERROR) << "input and output data_format is invalid!";
}
io_dataformat_ = inputs_[0]->GetFormat();
if (inputs_[1]->GetFormat() != schema::Format_KHWC) {
MS_LOG(ERROR) << "weight data_format is invalid!";
}
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
std::string kernel_name = "convolution_NHWC_OHWI";
#ifdef PROGRAM_WITH_IL
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
#else
std::set<std::string> build_options;
std::string source = convolution_source_fp32;
std::string program_name = "convolution";
std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->LoadSource(program_name, source);
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
this->InitBuffer();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return 0;
}
int ConvolutionOpenCLKernel::InitBuffer() {
@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() {
memcpy_s(packed_bias_, bias_tensor->Size(), bias_tensor->Data(), bias_tensor->Size());
allocator->UnmapBuffer(packed_bias_);
} else if (io_dataformat_ == schema::Format_NHWC4) {
// OHWI -> OHWIIO
auto weight_shape = weight_tensor->shape();
size_t CO = weight_shape[0];
size_t KH = weight_shape[1];
size_t KW = weight_shape[2];
size_t CI = weight_shape[3];
size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
size_t weight_size_tiled = CO_ALIGN * KH * KW * CI_ALIGN * sizeof(float);
size_t CI_SLICES = UP_DIV(CI, C4NUM);
size_t CO_SLICES = UP_DIV(CO, C4NUM);
constexpr size_t CI_TILE = C4NUM;
constexpr size_t CO_TILE = C4NUM;
size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_size_tiled));
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
memset_s(packed_weight_, weight_size_tiled, 0x00, weight_size_tiled);
memset_s(packed_weight_, packed_weight_size, 0x00, packed_weight_size);
auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
for (int co = 0; co < CO; ++co) {
for (int kh = 0; kh < KH; ++kh) {
for (int kw = 0; kw < KW; ++kw) {
for (int ci = 0; ci < CI; ++ci) {
packed_weight_[co * KH * KW * CI_ALIGN + kh * KW * CI_ALIGN + kw * CI_ALIGN + ci] =
weight_data[co * KH * KW * CI + kh * KW * CI + kw * CI + ci];
auto co_outer = co / CO_TILE;
auto co_inner = co % CO_TILE;
auto ci_outer = ci / CI_TILE;
auto ci_inner = ci % CI_TILE;
packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
co_inner] = *(weight_data++);
}
}
}
}
allocator->UnmapBuffer(packed_weight_);
size_t bias_size_tiled = CO_ALIGN * sizeof(float);
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_size_tiled));
size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
memset_s(packed_bias_, bias_size_tiled, 0x00, bias_size_tiled);
memset_s(packed_bias_, packed_bias_size, 0x00, packed_bias_size);
auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
for (int co = 0; co < CO; ++co) {
packed_bias_[co] = bias_data[co];
@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() {
}
return 0;
}
} // namespace mindspore::kernel
int ConvolutionOpenCLKernel::ReSize() { return 0; }
static int GetBiggestDivider(int x, int y) {
for (int i = y; i != 0; i--) {
if (x % i == 0) {
return i;
}
}
return 1;
}
static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
constexpr size_t work_group_size[] = {4, 4, 1};
constexpr size_t max_work_item_sizes[] = {512, 512, 512};
constexpr size_t max_work_group_size = 512;
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
// 先用OH OW CO_SLICES初始化global并且441对齐
size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];
// 使用策略计算local
size_t local_c = GetBiggestDivider(global_c, max_z_size);
size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
size_t local_w = std::min(global_w, local_hw_size);
size_t local_h = std::min(local_hw_size / local_w, global_h);
if (local_h == global_h && global_h % 2 == 0) {
local_h = global_h / 2;
}
global->clear();
global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c);
local->clear();
local->push_back(local_h);
local->push_back(local_w);
local->push_back(local_c);
}
int ConvolutionOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->Name() << " Running!";
MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto param = reinterpret_cast<ConvParameter *>(opParameter);
auto input0_shape = inputs_[0]->shape(); // NHWC
auto input1_shape = inputs_[1]->shape(); // OHWI
auto outpu0_shape = outputs_[0]->shape(); // NHWC
cl_uint N = input0_shape[0];
cl_uint CI = input0_shape[3];
cl_uint IH = input0_shape[1];
cl_uint IW = input0_shape[2];
cl_uint CO = outpu0_shape[3];
cl_uint OH = outpu0_shape[1];
cl_uint OW = outpu0_shape[2];
cl_uint KH = input1_shape[1];
cl_uint KW = input1_shape[2];
cl_uint CI_TILE_NUM = UP_DIV(CI, C4NUM);
cl_uint CO_TILE_NUM = UP_DIV(CO, C4NUM);
cl_uint CI_ALIGN = CI_TILE_NUM * C4NUM;
cl_uint CO_ALIGN = CO_TILE_NUM * C4NUM;
cl_uint4 input_shape;
cl_uint4 weight_shape;
cl_uint4 output_shape;
cl_int N = input0_shape[0];
cl_int CI = input0_shape[3];
cl_int IH = input0_shape[1];
cl_int IW = input0_shape[2];
cl_int CO = outpu0_shape[3];
cl_int OH = outpu0_shape[1];
cl_int OW = outpu0_shape[2];
cl_int KH = input1_shape[1];
cl_int KW = input1_shape[2];
cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
cl_int4 input_shape;
cl_int4 output_shape;
if (io_dataformat_ == schema::Format_NHWC) {
input_shape = {N, IH, IW, CI};
weight_shape = {CO, KH, KW, CI};
output_shape = {N, OH, OW, CO};
} else if (io_dataformat_ == schema::Format_NHWC4) {
input_shape = {N, IH, IW, CI_ALIGN};
weight_shape = {CO_ALIGN, KH, KW, CI_ALIGN};
output_shape = {N, OH, OW, CO_ALIGN};
}
cl_uint2 stride = {static_cast<cl_uint>(param->stride_h_), static_cast<cl_uint>(param->stride_w_)};
cl_uint4 pad = {static_cast<cl_uint>(param->pad_u_), static_cast<cl_uint>(param->pad_d_),
static_cast<cl_uint>(param->pad_l_), static_cast<cl_uint>(param->pad_r_)};
cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_};
cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
int arg_cn = 0;
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() {
ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, weight_shape);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, stride);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad);
std::vector<size_t> global = {OW, OH, CO_TILE_NUM};
std::vector<size_t> local = {1, 1, CO_TILE_NUM};
std::vector<size_t> global;
std::vector<size_t> local;
GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local);
// float8 per thread
if (io_dataformat_ == schema::Format_NHWC4) {
local[2] = UP_DIV(local[2], 2);
global[2] = UP_DIV(global[2], 2);
global[2] = UP_DIV(global[2], global[2]) * global[2];
}
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return 0;
@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso
REG_KERNEL(kGPU, PrimitiveType_Conv2D, OpenCLConvolutionKernelCreator)
} // namespace mindspore::kernel

@ -14,11 +14,13 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
#define MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/convolution.h"
#include "src/ir/tensor.h"
#include "src/lite_kernel.h"
#include "schema/model_generated.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
@ -44,5 +46,4 @@ class ConvolutionOpenCLKernel : public LiteKernel {
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_

Loading…
Cancel
Save