From 4ad86f42d0cfd54a633a593a2225a6046caa4f50 Mon Sep 17 00:00:00 2001
From: chenzomi <chenzomi12@gmail.com>
Date: Tue, 11 Aug 2020 11:36:28 +0800
Subject: [PATCH] [MS][LITE] opencl bug fix for pooling

---
 .../kernel/opencl/cl/fp32/avg_pool2d.cl       | 22 ++++++++----------
 .../kernel/opencl/cl/fp32/max_pool2d.cl       | 23 ++++++++-----------
 .../runtime/kernel/opencl/kernel/pooling2d.cc | 13 ++++-------
 3 files changed, 23 insertions(+), 35 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
index 891b2fcf51..c5fd9e84ca 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
@@ -1,5 +1,5 @@
 __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
-                           const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
+                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
 __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                                  const int4 output_shape, const int2 stride, const int2 kernel_size,
-                                  const int2 padding) {
+                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
   int xs = X * stride.x + padding.x;
   int ys = Y * stride.y + padding.y;
 
-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    bool outside_x = x_c < 0 || x_c >= input_shape.x;
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
-
-      r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z));
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    bool outside_y = y_c < 0 || y_c >= input_shape.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
+      r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
       window_size += !outside ? 1.0f : 0.0f;
     }
   }
   float4 result = convert_float4(r / window_size);
-  write_imagef(output, (int2)(X, Y * output_shape.w + Z), result);
+  write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
index f65e3e06d6..c64d40667e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
@@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
   output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
 }
 
-__constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 
 __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                                const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
@@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
   float4 maximum = (float4)(-10000.0f);
   int xs = X * stride.x + padding.x;
   int ys = Y * stride.y + padding.y;
-
-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    if (x_c < 0 || x_c >= input_shape.x) {
-      continue;
-    }
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      if (y_c < 0 || y_c >= input_shape.y) {
-        continue;
-      }
-      float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z));
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
       maximum = max(src, maximum);
     }
   }
-  write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum);
+  write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index 8a5b05e0a2..0e7d8dc27c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() {
 std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
   const size_t global_x = outputs_[0]->Height();
   const size_t global_y = outputs_[0]->Width();
-  const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
+  const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM);
   std::vector<size_t> global = {global_x, global_y, global_z};
   return global;
 }
@@ -90,13 +90,8 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
 int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
   size_t im_dst_x, im_dst_y;
-  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
-    im_dst_x = outputs_[0]->Height();
-    im_dst_y = outputs_[0]->Width() * CO4;
-  } else {
-    im_dst_y = outputs_[0]->Width();
-    im_dst_x = outputs_[0]->Height() * CO4;
-  }
+  im_dst_x = outputs_[0]->Width() * CO4;
+  im_dst_y = outputs_[0]->Height();
 #ifdef ENABLE_FP16
   size_t img_dtype = CL_HALF_FLOAT;
 #else
@@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
 
   // attribute
-  int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
+  int slices = UP_DIV(outputs_[0]->Channel(), C4NUM);
   cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices};
   cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices};
   cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};