From 9674491150b478e88a9189dfc5902ae36f45d667 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Tue, 1 Sep 2020 19:56:50 +0800 Subject: [PATCH] fix bug in pooling and matmul --- .../runtime/kernel/opencl/cl/avg_pool2d.cl | 31 ++++++++++--------- .../runtime/kernel/opencl/cl/max_pool2d.cl | 23 ++++++++------ .../runtime/kernel/opencl/kernel/matmul.cc | 7 ++--- .../runtime/kernel/opencl/kernel/pooling2d.cc | 10 +++--- .../runtime/kernel/opencl/kernel/pooling2d.h | 1 + .../src/runtime/kernel/opencl/opencl_kernel.h | 2 ++ 6 files changed, 41 insertions(+), 33 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl index c5fd9e84ca..7728a10210 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl @@ -1,4 +1,7 @@ -__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif +__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); @@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, return; } - float4 r = (float4)(0.0f); - float window_size = 0.0f; - int xs = X * stride.x + padding.x; - int ys = Y * stride.y + padding.y; + FLT4 r = (FLT4)(0.0f); + FLT window_size = 0.0f; + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = xs + kx; @@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, for (int ky = 0; ky < kernel_size.y; ++ky) { int y_c = ys + ky; bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; - r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f); + r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f); window_size += !outside ? 1.0f : 0.0f; } } - float4 result = convert_float4(r / window_size); + FLT4 result = TO_FLT4(r / window_size); output[(output_shape.y * X + Y) * output_shape.w + Z] = result; } @@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d return; } - float4 r = (float4)(0.0f); - float window_size = 0.0f; - int xs = X * stride.x + padding.x; - int ys = Y * stride.y + padding.y; + FLT4 r = (FLT4)(0.0f); + FLT window_size = 0.0f; + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; for (int ky = 0; ky < kernel_size.y; ++ky) { int y_c = ys + ky; @@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = xs + kx; bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; - r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); + r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f); window_size += !outside ? 1.0f : 0.0f; } } - float4 result = convert_float4(r / window_size); - write_imagef(output, (int2)(Y * output_shape.w + Z, X), result); + FLT4 result = TO_FLT4(r / window_size); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl index c64d40667e..f7b2542360 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl @@ -1,4 +1,7 @@ -__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif +__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); @@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, return; } - float4 maximum = (float4)(-10000.0f); - int xs = X * stride.x + padding.x; - int ys = Y * stride.y + padding.y; + FLT4 maximum = (FLT4)(-10000.0f); + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = xs + kx; @@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, if (y_c < 0 || y_c >= input_shape.y) { continue; } - float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; + FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; maximum = max(src, maximum); } } @@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d return; } - float4 maximum = (float4)(-10000.0f); - int xs = X * stride.x + padding.x; - int ys = Y * stride.y + padding.y; + FLT4 maximum = (FLT4)(-10000.0f); + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; for (int ky = 0; ky < kernel_size.y; ++ky) { int y_c = ys + ky; if (y_c < 0 || y_c >= input_shape.y) continue; for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = xs + kx; if (x_c < 0 || x_c >= input_shape.x) continue; - float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); + FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); maximum = max(src, maximum); } } - write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index 809cd5b1b2..a020dd857d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() { sizeCO = {co, UP_DIV(co, C4NUM)}; PadWeight(); in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); out_ori_format_ = out_tensors_[0]->GetFormat(); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); if (out_tensors_[0]->shape().size() == 2) { - out_ori_format_ = schema::Format_NC; out_tensors_[0]->SetFormat(schema::Format_NC4); - in_ori_format_ = schema::Format_NC; in_tensors_[0]->SetFormat(schema::Format_NC4); + } else { + in_tensors_[0]->SetFormat(schema::Format_NHWC4); + out_tensors_[0]->SetFormat(schema::Format_NHWC4); } MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 1e1c18e6eb..3d87827769 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() { return RET_INVALID_OP_NAME; } auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - + enable_fp16_ = ocl_runtime->GetFp16Enable(); #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else @@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) size_t im_dst_x, im_dst_y; im_dst_x = out_tensors_[0]->Width() * CO4; im_dst_y = out_tensors_[0]->Height(); -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else size_t img_dtype = CL_FLOAT; -#endif + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; @@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector InitGlobalSize() const; PoolingParameter *parameter_; cl::Kernel kernel_; + bool enable_fp16_{false}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index f7acc71fcb..b8b725cfb0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel { } OpenCLMemType GetMemType() { return out_mem_type_; } void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } + void SetFormatType(schema::Format format_type) { op_format_ = format_type; } schema::Format GetInOriFormat() { return in_ori_format_; } schema::Format GetOutOriFormat() { return out_ori_format_; } @@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel { OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; schema::Format in_ori_format_{schema::Format_NHWC}; schema::Format out_ori_format_{schema::Format_NHWC4}; + schema::Format op_format_{schema::Format_NC4HW4}; }; } // namespace mindspore::kernel