From 4ad86f42d0cfd54a633a593a2225a6046caa4f50 Mon Sep 17 00:00:00 2001 From: chenzomi Date: Tue, 11 Aug 2020 11:36:28 +0800 Subject: [PATCH] [MS][LITE] opencl bug fix for pooling --- .../kernel/opencl/cl/fp32/avg_pool2d.cl | 22 ++++++++---------- .../kernel/opencl/cl/fp32/max_pool2d.cl | 23 ++++++++----------- .../runtime/kernel/opencl/kernel/pooling2d.cc | 13 ++++------- 3 files changed, 23 insertions(+), 35 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl index 891b2fcf51..c5fd9e84ca 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl @@ -1,5 +1,5 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { + const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); int Y = get_global_id(1); @@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, - const int2 padding) { + const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); int Y = get_global_id(1); @@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d int xs = X * stride.x + padding.x; int ys = Y * stride.y + padding.y; - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - bool outside_x = x_c < 0 || x_c >= input_shape.x; - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; - - r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z)); + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = ys + ky; + bool outside_y = y_c < 0 || y_c >= input_shape.y; + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = xs + kx; + bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; + r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); window_size += !outside ? 1.0f : 0.0f; } } float4 result = convert_float4(r / window_size); - write_imagef(output, (int2)(X, Y * output_shape.w + Z), result); + write_imagef(output, (int2)(Y * output_shape.w + Z, X), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl index f65e3e06d6..c64d40667e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl @@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; } -__constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { @@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d float4 maximum = (float4)(-10000.0f); int xs = X * stride.x + padding.x; int ys = Y * stride.y + padding.y; - - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = xs + kx; - if (x_c < 0 || x_c >= input_shape.x) { - continue; - } - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = ys + ky; - if (y_c < 0 || y_c >= input_shape.y) { - continue; - } - float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z)); + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = ys + ky; + if (y_c < 0 || y_c >= input_shape.y) continue; + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = xs + kx; + if (x_c < 0 || x_c >= input_shape.x) continue; + float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); maximum = max(src, maximum); } } - write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum); + write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 8a5b05e0a2..0e7d8dc27c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() { std::vector PoolingOpenCLKernel::InitGlobalSize() const { const size_t global_x = outputs_[0]->Height(); const size_t global_y = outputs_[0]->Width(); - const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4); + const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM); std::vector global = {global_x, global_y, global_z}; return global; } @@ -90,13 +90,8 @@ std::vector PoolingOpenCLKernel::InitGlobalSize() const { int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); size_t im_dst_x, im_dst_y; - if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { - im_dst_x = outputs_[0]->Height(); - im_dst_y = outputs_[0]->Width() * CO4; - } else { - im_dst_y = outputs_[0]->Width(); - im_dst_x = outputs_[0]->Height() * CO4; - } + im_dst_x = outputs_[0]->Width() * CO4; + im_dst_y = outputs_[0]->Height(); #ifdef ENABLE_FP16 size_t img_dtype = CL_HALF_FLOAT; #else @@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); // attribute - int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4); + int slices = UP_DIV(outputs_[0]->Channel(), C4NUM); cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices}; cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices}; cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};