Fix OpenCL winograd convolution bug when output height/width (OH/OW) is not a multiple of 4

pull/6220/head
Pengyongrong 5 years ago
parent 0a9938ddc8
commit 0b4c480474

@ -54,6 +54,62 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
                                                int4 shape) {
  // Converts an NCHW float buffer into an NHWC4 image.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices of the
  // destination image. NOTE(review): the addressing ignores the batch index,
  // so N is presumably 1 — confirm against callers.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global float *src_addr = (__global float *)src_data;
  // In NCHW, consecutive channels are shape.y * shape.z elements apart, so the
  // four channels of one quad must be gathered one scalar at a time. The old
  // "fast path" did a float4 load at channel Z*4, which actually read four
  // consecutive W positions of a single channel and produced wrong data
  // whenever H * W > 1.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    data.x = src_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 2) {
    data.y = src_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 3) {
    data.z = src_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 4) {
    data.w = src_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y];
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
                                               int4 shape) {
  // Half-precision variant of to_format_NCHW_to_NHWC4_IMG_float.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global half *src_addr = (__global half *)src_data;
  // Channels are shape.y * shape.z elements apart in NCHW, so gather each quad
  // component individually. The old fast path's half4 load read four
  // consecutive W positions of one channel instead of four channels and was
  // wrong whenever H * W > 1.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    data.x = src_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 2) {
    data.y = src_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 3) {
    data.z = src_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 4) {
    data.w = src_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y];
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
@ -198,6 +254,64 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
}
}
}
__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
                                                int4 shape) {
  // Converts an NHWC4 image back into an NCHW float buffer.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global float *dst_addr = (__global float *)dst_data;
  // Consecutive channels are shape.y * shape.z elements apart in NCHW, so each
  // quad component must be scattered individually. The old fast path's float4
  // store at channel Z*4 overwrote four consecutive W positions of one channel
  // instead of four channels. Also removed the unused `offset` computation and
  // `dst_addr += offset` left over from the NHWC variant.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    dst_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y] = data.x;
  }
  if (remain >= 2) {
    dst_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y] = data.y;
  }
  if (remain >= 3) {
    dst_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y] = data.z;
  }
  if (remain >= 4) {
    dst_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y] = data.w;
  }
}
__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
                                               int4 shape) {
  // Half-precision variant of to_format_NHWC4_to_NCHW_BUF_float.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global half *dst_addr = (__global half *)dst_data;
  // Scatter each quad component individually: NCHW channel stride is
  // shape.y * shape.z, so the old half4 fast-path store clobbered four
  // consecutive W positions of one channel instead of writing four channels.
  // The unused `offset` / `dst_addr += offset` leftovers were dropped.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    dst_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y] = data.x;
  }
  if (remain >= 2) {
    dst_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y] = data.y;
  }
  if (remain >= 3) {
    dst_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y] = data.z;
  }
  if (remain >= 4) {
    dst_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y] = data.w;
  }
}
__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);

@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
void ActivationOpenClKernel::InitBuffer() {}
int ActivationOpenClKernel::Init() {
in_size_ = in_tensors_[0]->shape().size();
out_size_ = out_tensors_[0]->shape().size();

@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel {
int Run() override;
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
cl_int4 GetImg2dShape();
void InitBuffer();
void InitBuffer() {}
private:
cl::Kernel kernel_;

@ -16,10 +16,10 @@
#include <cstring>
#include <algorithm>
#include <set>
#include<string>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() {
int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. Callers divide by the result,
// so it must never be 0 (the old fallback returned RET_OK, i.e. 0, which
// risked a divide-by-zero downstream).
int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && max_divider >= 8) {
    return number / 8;
  }
  if (number % 4 == 0 && max_divider >= 4) {
    return number / 4;
  }
  if (number % 2 == 0 && max_divider >= 2) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();

@ -20,6 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/batchnorm.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
*img_size = vec;
return RET_OK;
}
int BatchNormOpenCLKernel::Init() {
auto in_format = op_format_;
if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() {
int BatchNormOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. The result is used as a
// divider by the caller, so it must never be 0 (the old fallback returned
// RET_OK, i.e. 0, risking a later divide-by-zero).
int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && 8 <= max_divider) {
    return number / 8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(y);
local->push_back(z);
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);

@ -14,12 +14,12 @@
* limitations under the License.
*/
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/concat.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int
return RET_OK;
}
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. The result is used as a
// divider by the caller, so it must never be 0 (the old fallback returned
// RET_OK, i.e. 0, risking a later divide-by-zero).
int ConcatGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && max_divider >= 8) {
    return number / 8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
local->push_back(y);
local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);

@ -19,6 +19,7 @@
#include <algorithm>
#include "src/common/utils.h"
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() {
return RET_OK;
}
int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
}
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6,
6, CI_, 2);
} else {
OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6,
CI_, 2);
}
return RET_OK;
}
template <typename SRC_T, typename DST_T>
int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
// OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
for (size_t kh = 0; kh < KH; ++kh) {
for (size_t kw = 0; kw < KW; ++kw) {
for (size_t ci = 0; ci < CI_; ++ci) {
size_t co_outer = co / (CO_TILE * OGroup);
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
size_t co_inner = co % CO_TILE;
size_t ci_outer = ci / CI_TILE;
size_t ci_inner = ci % CI_TILE;
size_t dst_idx =
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
}
}
}
}
return RET_OK;
}
int ConvolutionOpenCLKernel::InitWeight() {
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() {
// rearrange weight
if (use_winograd_) {
RearrangeWinogradWeight();
GenerateWinogradWeight();
} else {
auto weight_tensor = in_tensors_[1];
if (weight_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
} else {
OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
}
} else {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
} else {
OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
}
}
}
@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
" }\n"
"\n"
" int IH = input_shape.y, IW = input_shape.z;\n"
" int TILE_X = IW / 4;\n"
" int TILE_X = UP_DIV(IW, 4);\n"
" int tile_x = tile_xy % TILE_X;\n"
" int tile_y = tile_xy / TILE_X;\n"
"\n"
@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
std::string code =
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
"\n"
"__constant sampler_t\n"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"\n"
@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
" }\n"
" }\n"
"\n"
" int TILE_X = UP_DIV(OW, 4);\n"
" for (int x = 0; x < 4; x++)\n"
" {\n"
" FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
@ -822,14 +801,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
}
code +=
" int TILE_X = OW / 4;\n"
" int tile_x = tile_xy % TILE_X * 4;\n"
" int tile_y = tile_xy / TILE_X * 4;\n";
" int tile_x = tile_xy % TILE_X;\n"
" int tile_y = tile_xy / TILE_X;\n"
" int ow = tile_x * 4 + x;\n"
" int oh = tile_y * 4 + row;\n";
if (op_format_ == Format_NHWC4) {
code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n";
} else if (op_format_ == Format_NC4HW4) {
code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n";
}
code +=
@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
size_t local_c = GetBiggestDivider(global_c, max_z_size);
size_t local_c = GetMaxDivisor(global_c, max_z_size);
if (local_c == 0) {
MS_LOG(ERROR) << "Divide by zero";
return RET_ERROR;

@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int InitWeight();
int InitBias();
int RearrangeWinogradWeight();
template <typename SRC_T, typename DST_T>
int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
int GenerateWinogradWeight();
std::string CodeGenConvolutionNHWC4();
std::string CodeGenConvolutionNC4HW4();
@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good;
}
static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
std::vector<float> C(M * K);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {
float s = 0.0f;
for (int k = 0; k < N; ++k) {
s += A[i * N + k] * B[k * K + j];
}
C[i * K + j] = s;
}
}
return C;
}
static int GetBiggestDivider(int x, int y) {
for (int i = y; i != 0; i--) {
if (x % i == 0) {
return i;
}
}
return 1;
}
};
} // namespace mindspore::kernel

@ -20,6 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
*img_size = vec;
return RET_OK;
}
int SliceOpenCLKernel::Init() {
std::string kernel_name = "slice";
auto in_format = op_format_;
@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() {
int SliceOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the biggest d in {8, 4, 2} that divides number and
// fits under max_divider; otherwise the largest plain divisor of number that
// is <= max_divider, or 1 as a last resort.
int SliceGetBiggestDividerWithPriority(int number, int max_divider) {
  const int priorities[] = {8, 4, 2};
  for (int d : priorities) {
    if (number % d == 0 && d <= max_divider) {
      return number / d;
    }
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
local->push_back(y);
local->push_back(z);
}
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
} // namespace mindspore::kernel

@ -16,7 +16,6 @@
#include "src/runtime/kernel/opencl/utils.h"
#include <algorithm>
#include <string>
#include <vector>
#include "src/kernel_registry.h"
@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
}
} // namespace mindspore::lite
namespace mindspore {
namespace kernel {
namespace mindspore::kernel {
// Largest integer in [1, divisor] that evenly divides x; returns 1 when the
// scan finds nothing (e.g. divisor <= 0).
int GetMaxDivisor(int x, int divisor) {
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
// Prefers the power-of-two dividers 8, 4, 2 (largest first) when one of them
// divides x and fits under divisor; otherwise returns the largest plain
// divisor of x that is <= divisor (1 if none).
int GetMaxDivisorStrategy0(int x, int divisor) {
  const int preferred[] = {8, 4, 2};
  for (int p : preferred) {
    if (divisor >= p && x % p == 0) {
      return p;
    }
  }
  // Fallback: the same descending scan GetMaxDivisor performs, inlined here.
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
// Like GetMaxDivisorStrategy0, but returns the quotient x / d for the chosen
// power-of-two divider d in {8, 4, 2}; otherwise the largest plain divisor of
// x that is <= divisor (1 if none).
int GetMaxDivisorStrategy1(int x, int divisor) {
  const int preferred[] = {8, 4, 2};
  for (int p : preferred) {
    if (divisor >= p && x % p == 0) {
      return x / p;
    }
  }
  // Fallback: the same descending scan GetMaxDivisor performs, inlined here.
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
std::vector<size_t> result(3, 1);
std::vector<size_t> result(3);
for (int i = 0; i < 3; ++i) {
result[i] = AlignByN(global[i], local[i]);
result[i] = UP_ROUND(global[i], local[i]);
}
return result;
}
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
size_t wg_z = GetBiggestDividerWithPriority(global[2], 8);
if (wg_z == 0) {
size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
if (local_z == 0) {
MS_LOG(ERROR) << "Divide by zero";
return {};
}
size_t wg_xy_size = max_size / wg_z;
size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size);
size_t wg_y = std::min(wg_xy_size / wg_x, global[1]);
std::vector<size_t> local = {wg_x, wg_y, wg_z};
size_t local_xy = max_size / local_z;
size_t local_x = std::min(UP_DIV(global[0], 2), local_xy);
size_t local_y = std::min(local_xy / local_x, global[1]);
std::vector<size_t> local = {local_x, local_y, local_z};
return local;
}
@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) {
return "Unknown OpenCL error code";
}
}
} // namespace kernel
} // namespace mindspore
} // namespace mindspore::kernel

@ -23,7 +23,7 @@
#include "utils/log_adapter.h"
#include "nnacl/op_base.h"
#include "src/lite_kernel.h"
#include "src/common//utils.h"
#include "src/common/utils.h"
namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
namespace mindspore::kernel {
/**
 * Returns the highest-priority divider of number: 8, 4 or 2 when one of them
 * divides number and fits under max_divider, otherwise the largest plain
 * divisor of number that is <= max_divider (1 as a last resort).
 * @param number value to divide
 * @param max_divider upper bound on the returned divider
 * @return the chosen divider, cast to T
 */
template <typename T, typename N>
T GetBiggestDividerWithPriority(T number, N max_divider) {
  const int priorities[] = {8, 4, 2};
  for (int p : priorities) {
    if (number % p == 0 && p <= max_divider) {
      return (T)p;
    }
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return (T)i;
    }
  }
  return (T)1;
}
int GetMaxDivisor(int x, int divisor);
/**
 * Ceiling division: the smallest q such that q * divisor >= n.
 * Uses the remainder form rather than (n + d - 1) / d so it cannot overflow
 * for n near the maximum of T.
 * @param n must be non negative
 * @param divisor must be greater than zero
 */
template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T d = static_cast<T>(divisor);
  const T quotient = n / d;
  const T remainder = n % d;
  return remainder == 0 ? quotient : quotient + 1;
}
int GetMaxDivisorStrategy0(int x, int divisor);
/**
 * Rounds number up to the nearest multiple of n (ceil-divide then multiply).
 * @param number value to align, must be non negative
 * @param n alignment, must be greater than zero
 */
template <typename T, typename N>
T AlignByN(T number, N n) {
  const T d = static_cast<T>(n);
  const T rounded = (number % d == 0) ? (number / d) : (number / d + 1);
  return rounded * d;
}
int GetMaxDivisorStrategy1(int x, int divisor);
// GetGlobalSize
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);
// GetLocalSize
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size);
std::string CLErrorCode(cl_int error_code);
@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
}
}
}
template <class T1, class T2>
void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
int c4 = UP_DIV(channel, C4NUM);
@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co
}
}
}
template <class T1, class T2>
void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
int c4 = UP_DIV(channel, C4NUM);
@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
}
}
// Multiplies the M x N matrix A by the N x K matrix B (both row-major flat
// arrays) and returns the M x K product as a flat row-major vector.
template <class T>
std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
  std::vector<T> C(M * K);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < K; ++j) {
      // Accumulate in T: the old hard-coded `float s` silently truncated
      // precision for T = double (and broke non-floating T).
      T s = T();
      for (int k = 0; k < N; ++k) {
        s += A[i * N + k] * B[k * K + j];
      }
      C[i * K + j] = s;
    }
  }
  return C;
}
// Repacks convolution weights from 4-D OHWI order into the tiled 7-D layout
// O/(CO_TILE*OGroup) x KH x KW x CI/CI_TILE x OGroup x CI_TILE x CO_TILE
// consumed by the OpenCL convolution kernels.
// src: source weights in OHWI order; dst: pre-allocated destination buffer
// (caller-managed; assumed large enough for the padded tiled layout — TODO
// confirm the caller rounds CO/CI up to the tile sizes).
// OGroup interleaves output-channel groups; the winograd path calls this with
// OGroup = 2, the plain path uses the default of 1.
template <typename SRC_T, typename DST_T>
void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
size_t CI_TILE = 4, size_t CO_TILE = 4) {
auto origin_weight = reinterpret_cast<SRC_T *>(src);
auto packed_weight = reinterpret_cast<DST_T *>(dst);
// Number of CI_TILE-wide input-channel slices (rounded up).
auto CI_SLICES = UP_DIV(CI, CI_TILE);
// src_idx walks the source linearly in OHWI order; each element is scattered
// to its tiled destination index.
for (size_t co = 0, src_idx = 0; co < CO; ++co) {
for (size_t kh = 0; kh < KH; ++kh) {
for (size_t kw = 0; kw < KW; ++kw) {
for (size_t ci = 0; ci < CI; ++ci) {
// Decompose co into (outer tile, group within tile, lane within CO_TILE)
// and ci into (outer slice, lane within CI_TILE).
size_t co_outer = co / (CO_TILE * OGroup);
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
size_t co_inner = co % CO_TILE;
size_t ci_outer = ci / CI_TILE;
size_t ci_inner = ci % CI_TILE;
size_t dst_idx =
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
}
}
}
}
}
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_

@ -133,6 +133,8 @@ Format get_op_format(Format input_format) {
case Format_NHWC:
case Format_NHWC4:
return Format_NHWC4;
case Format_NCHW:
return Format_NHWC4;
default:
return Format_NC4HW4;
}
@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
}
TEST_F(TestConvolutionOpenCL, simple_test0) {
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) {
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
// 1x1 convolution, 2 input / 2 output channels, 2x2 spatial grid, NCHW
// buffers. The attr string describes the logical NHWC shapes; the data arrays
// below are laid out in NCHW order. All-ones weights make each output channel
// the per-pixel sum of the two input channels (0+1, 2+3, 4+5, 6+7).
TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f};
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f};
// Both output channels hold the same per-pixel sums {1, 5, 9, 13} in NCHW.
float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
// Same 1x1 convolution as simple_test0, but with the data supplied directly
// in channel-padded 4-slice layouts (NHWC4 and NC4HW4): each pixel's 2 real
// channels are padded with two zeros to fill a 4-channel quad.
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f};
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f};
// Per-pixel channel sums {1, 5, 9, 13}, duplicated into both real output
// channels and padded with zeros to quads.
float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f,
9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
}
TEST_F(TestConvolutionOpenCL, simple_test1) {
std::string attr =

Loading…
Cancel
Save