From 351c21eaf2749b9f294abef362f210246a405d4f Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Tue, 3 Nov 2020 12:28:46 +0800 Subject: [PATCH] opencl_conv_optimize_performance --- .../runtime/kernel/opencl/cl/convolution.cl | 580 +++++++++++------- .../src/runtime/kernel/opencl/cl/winograd.cl | 187 ++++++ .../kernel/opencl/kernel/convolution.cc | 184 +++--- .../kernel/opencl/kernel/convolution.h | 32 +- .../src/runtime/kernel/opencl/opencl_kernel.h | 59 +- .../lite/src/runtime/kernel/opencl/utils.cc | 2 +- 6 files changed, 716 insertions(+), 328 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl index 7ea3edea22..cbed06393a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl @@ -3,268 +3,420 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #define CI_TILE 4 -#define CO_TILE 4 #define MAX_IMAGE2D_SIZE 65535 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_No 0 #define ActType_Relu 1 -#define ActType_Sigmod 2 #define ActType_Relu6 3 -__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int N = input_shape.x; - const int IH = input_shape.y; - const int IW = input_shape.z; - const int CI_SLICES = input_shape.w; - - const int OH = output_shape.y; - const int OW = output_shape.z; - const int CO_SLICES = output_shape.w; - - const int KH = kernel_stride.x; - const int KW = kernel_stride.y; - const int strideH = kernel_stride.z; - const int strideW = kernel_stride.w; - - const int padTop = pad.x; - const int padLeft = pad.z; - - const 
int dilationH = dilation.x; - const int dilationW = dilation.y; - - int n_oh = get_global_id(0); // [0, N*OH) - int ow = get_global_id(1); // [0, OW) - int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) - int n; - int oh; - if (N == 1) { - n = 0; - oh = n_oh; - } else { - n = n_oh / OH; - oh = n_oh % OH; - } - if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { - return; +#define DEFINE_ARGS \ + const int N = input_shape.x; \ + const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ + const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ + const int KH = kernel_stride.x, KW = kernel_stride.y; \ + const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ + const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ + const int dilationH = dilation.x, dilationW = dilation.y; \ + \ + const int n_oh = get_global_id(0); \ + const int ow = get_global_id(1) * BlockW; \ + const int co_slice = get_global_id(2) * BlockC; \ + const int OH_SLICES = UP_DIV(OH, BlockH); \ + const int n = n_oh / OH_SLICES; \ + const int oh = (n_oh % OH_SLICES) * BlockH; \ + if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ + return; \ } - FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE; +__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 1; + const int BlockW = 1; + const int BlockC = 1; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int n_oh0 = n * OH + oh0; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * 
BlockC * CI_TILE; + for (int kh = 0; kh < KH; ++kh) { - int ih = kh * dilationH + oh * strideH - padTop; + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + for (int kw = 0; kw < KW; ++kw) { - int iw = kw * dilationW + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); - out_c4 += w_ic1_oc4[0] * in_c4.x; - out_c4 += w_ic1_oc4[1] * in_c4.y; - out_c4 += w_ic1_oc4[2] * in_c4.z; - out_c4 += w_ic1_oc4[3] * in_c4.w; - w_ic1_oc4 += 4; - } - } else { - w_ic1_oc4 += 4 * CI_SLICES; + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + + weight_ptr += 4; } } } - if (bias != 0) { - out_c4 = out_c4 + bias[co_slice]; + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; } - // activation if (act_type == ActType_Relu) { - out_c4 = max(out_c4, (FLT4)(0.0f)); + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); } else if (act_type == ActType_Relu6) { - out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { - WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4); - + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); } else { - WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); } } -constant FLT Bt[36] = { - 1.0000000000f, 
0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, - 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, - 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, - 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, - 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, - 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, -}; - -__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, // N H W CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES -#define PAD 1 - int tile_xy = get_global_id(0); - int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = output_shape.z; - int SLICES = input_shape.w; - if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { - return; - } +__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 1; + const int BlockC = 1; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; - int IW = input_shape.z; - int TILE_X = UP_DIV(IW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - - constant FLT *Bt_row = Bt + row * 6; - FLT4 BtD_row[6] = {0}; - - int ih = tile_y * 4 - PAD; - int iw = tile_x * 4 - PAD; - for (int y = 0; y < 6; y++) { - int x_idx = iw * SLICES + slice; - for (int x = 0; x < 6; x++) { - // no need to check iw: because slice is in [0, SLICES). 
when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES - // if (iw < 0 || iw >= IW) { continue; } - BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); - x_idx += SLICES; + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; + + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + + weight_ptr += 4; + } } - ih++; } - int y_idx = slice * 36 + row * 6; - for (int y = 0; y < 6; y++) { - FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int x = 0; x < 6; x++) { - acc += BtD_row[x] * Bt[y * 6 + x]; - } - WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + } + + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, 
(FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } + + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + } // end if (oh1 < OH) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + } // end (oh1 < OH) } -#undef PAD } -__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - const int4 input_shape, // N 36 H/4*W/4 CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES -#define H 36 - int w = get_global_id(0) * 2; - int h = get_global_id(1); - int co_slice = get_global_id(2) * 2; +__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 1; + const int BlockC = 2; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int co_slice0 = co_slice + 0; + const int co_slice1 = co_slice + 1; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; - int CI_SLICES = input_shape.w; - int W = input_shape.z; - int CO_SLICES = 
output_shape.w; + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; - if (h >= H || w >= W || co_slice >= CO_SLICES) { - return; + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int x_idx0 = iw0 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + x_idx0++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + + out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; + out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; + out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; + out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; + out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; + out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; + out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; + out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; + + weight_ptr += 8; + } + } } - FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - - int y_idx = h; - __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); - FLT4 in1 = 
READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); - y_idx += 36; - - FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; - weight_ptr += 2; - - out00 += in0.x * weight0.s0123; - out00 += in0.y * weight0.s4567; - out00 += in0.z * weight0.s89ab; - out00 += in0.w * weight0.scdef; - - out01 += in1.x * weight0.s0123; - out01 += in1.y * weight0.s4567; - out01 += in1.z * weight0.s89ab; - out01 += in1.w * weight0.scdef; - - out10 += in0.x * weight1.s0123; - out10 += in0.y * weight1.s4567; - out10 += in0.z * weight1.s89ab; - out10 += in0.w * weight1.scdef; - - out11 += in1.x * weight1.s0123; - out11 += in1.y * weight1.s4567; - out11 += in1.z * weight1.s89ab; - out11 += in1.w * weight1.scdef; + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + out_h0_w0_c1 += bias[co_slice1]; + out_h1_w0_c1 += bias[co_slice1]; } - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); + out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); + out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); } - if (co_slice + 1 < CO_SLICES) { - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); - } + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + } // end if (oh1 < OH) + 
if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + } // end (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) } -#undef H } -constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, - 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, - 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, - 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; - -__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, - const int4 input_shape, // N 36 H/4*W/4 CO_SLICES - const int4 output_shape, // N H W CO_SLICES - const int act_type) { - int tile_xy = get_global_id(0); - int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = input_shape.z; - int SLICES = input_shape.w; - int OW = output_shape.z; - - if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { - return; - } +__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, + __global FLT4 *bias, const int4 input_shape, const int4 output_shape, + const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { + const int BlockH = 2; + const int BlockW = 2; + const 
int BlockC = 2; + DEFINE_ARGS; + + const int oh0 = oh + 0; + const int oh1 = oh + 1; + const int n_oh0 = n * OH + oh0; + const int n_oh1 = n * OH + oh1; + const int ow0 = ow + 0; + const int ow1 = ow + 1; + const int co_slice0 = co_slice + 0; + const int co_slice1 = co_slice + 1; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; - constant FLT *At_row = At + row * 6; - FLT4 AtM_row[6] = {0}; - for (int y = 0, idx = slice * 36; y < 6; y++) { - for (int x = 0; x < 6; x++, idx++) { - AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); - } - } + for (int kh = 0; kh < KH; ++kh) { + const int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + const int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; - int TILE_X = UP_DIV(OW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - int oh = tile_y * 4 + row; - int ow = tile_x * 4; - int x_idx = ow * SLICES + slice; - - for (int x = 0, idx = 0; x < 4; x++) { - FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int y = 0; y < 6; y++, idx++) { - acc += AtM_row[y] * At[idx]; + for (int kw = 0; kw < KW; ++kw) { + const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw1 = (ow1 < OW) ? 
kw * dilationW + ow1 * strideW - padLeft : -2; + int x_idx0 = iw0 * CI_SLICES; + int x_idx1 = iw1 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1)); + x_idx0++; + x_idx1++; + + out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; + out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x; + out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; + out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x; + out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; + out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y; + out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; + out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y; + out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; + out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z; + out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; + out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z; + out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; + out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w; + out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; + out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w; + + out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; + out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x; + out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; + out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x; + out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; + out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y; + out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; + out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y; + out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; + out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z; + out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; + out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z; + out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; + out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w; + out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; + out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w; + + weight_ptr += 8; + } } + } - if (bias 
!= 0) { - acc += bias[slice]; - } + if (bias) { + out_h0_w0_c0 += bias[co_slice0]; + out_h0_w1_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + out_h1_w1_c0 += bias[co_slice0]; + out_h0_w0_c1 += bias[co_slice1]; + out_h0_w1_c1 += bias[co_slice1]; + out_h1_w0_c1 += bias[co_slice1]; + out_h1_w1_c1 += bias[co_slice1]; + } - if (act_type == ActType_Relu) { - acc = max(acc, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { - acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); - } + if (act_type == ActType_Relu) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); + out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f)); + out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); + out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); + out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); + out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } - WRITE_IMAGE(output, (int2)(x_idx, oh), acc); - x_idx += SLICES; + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); + } // end if (oh1 < OH) + if (co_slice1 < 
CO_SLICES) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0); + } // end (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl new file mode 100644 index 0000000000..8e2029532a --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl @@ -0,0 +1,187 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) + +#define ActType_Relu 1 +#define ActType_Relu6 3 + +constant FLT Bt[36] = { + 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, + 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, 
-0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, + 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, + 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, +}; + +__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, + const int4 input_shape, // N H W CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES +#define PAD 1 + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = output_shape.z; + int SLICES = input_shape.w; + if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { + return; + } + + int IH = input_shape.y, IW = input_shape.z; + int TILE_X = UP_DIV(IW, 4); + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + + constant FLT *Bt_row = Bt + row * 6; + FLT4 BtD_row[6] = {0}; + + int ih = tile_y * 4 - PAD; + int iw = tile_x * 4 - PAD; + for (int y = 0; y < 6; y++) { + int x_idx = iw * SLICES + slice; + for (int x = 0; x < 6; x++) { + // no need to check iw: because slice is in [0, SLICES). 
when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES + // if (iw < 0 || iw >= IW) { continue; } + BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); + x_idx += SLICES; + } + ih++; + } + + int y_idx = slice * 36 + row * 6; + for (int y = 0; y < 6; y++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int x = 0; x < 6; x++) { + acc += BtD_row[x] * Bt[y * 6 + x]; + } + WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 + } +#undef PAD +} + +__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, + const int4 input_shape, // N 36 H/4*W/4 CI_SLICES + const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES +#define H 36 + int w = get_global_id(0) * 2; + int h = get_global_id(1); + int co_slice = get_global_id(2) * 2; + + int CI_SLICES = input_shape.w; + int W = input_shape.z; + int CO_SLICES = output_shape.w; + + if (h >= H || w >= W || co_slice >= CO_SLICES) { + return; + } + + FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + int y_idx = h; + __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); + FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); + y_idx += 36; + + FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; + weight_ptr += 2; + + out00 += in0.x * weight0.s0123; + out00 += in0.y * weight0.s4567; + out00 += in0.z * weight0.s89ab; + out00 += in0.w * weight0.scdef; + + out01 += in1.x * weight0.s0123; + out01 += in1.y * weight0.s4567; + out01 += in1.z * weight0.s89ab; + out01 += in1.w * weight0.scdef; + + out10 += in0.x * weight1.s0123; + out10 += in0.y * weight1.s4567; + out10 += in0.z * weight1.s89ab; + out10 += in0.w * weight1.scdef; + + out11 += in1.x * weight1.s0123; 
+ out11 += in1.y * weight1.s4567; + out11 += in1.z * weight1.s89ab; + out11 += in1.w * weight1.scdef; + } + + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + } + + if (co_slice + 1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); + if (w + 1 < W) { + WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); + } + } +#undef H +} + +constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, + 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, + 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, + 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; + +__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, + const int4 input_shape, // N 36 H/4*W/4 CO_SLICES + const int4 output_shape, // N H W CO_SLICES + const int act_type) { + int tile_xy = get_global_id(0); + int row = get_global_id(1); + int slice = get_global_id(2); + + int TILE_XY = input_shape.z; + int SLICES = input_shape.w; + int OH = output_shape.y; + int OW = output_shape.z; + + if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { + return; + } + + constant FLT *At_row = At + row * 6; + FLT4 AtM_row[6] = {0}; + for (int y = 0, idx = slice * 36; y < 6; y++) { + for (int x = 0; x < 6; x++, idx++) { + AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); + } + } + + int TILE_X = UP_DIV(OW, 4); + int tile_x = tile_xy % TILE_X; + int tile_y = tile_xy / TILE_X; + int oh = tile_y * 4 + row; + int ow = tile_x * 4; + int x_idx = ow * SLICES + slice; + + for (int x = 0, idx = 0; x < 4; x++) { + FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int y = 0; y < 6; y++, idx++) { + acc += AtM_row[y] * At[idx]; + } + + if 
(bias) { + acc += bias[slice]; + } + + if (act_type == ActType_Relu) { + acc = max(acc, (FLT4)(0.0f)); + } else if (act_type == ActType_Relu6) { + acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); + } + + WRITE_IMAGE(output, (int2)(x_idx, oh), acc); + x_idx += SLICES; + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index bbacb71bfc..8ce376a204 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -24,6 +24,7 @@ #include "src/kernel_registry.h" #include "include/errorcode.h" #include "src/runtime/kernel/opencl/cl/convolution.cl.inc" +#include "src/runtime/kernel/opencl/cl/winograd.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -38,14 +39,11 @@ constexpr size_t CI_TILE = C4NUM; constexpr size_t CO_TILE = C4NUM; int ConvolutionOpenCLKernel::Init() { - auto allocator = ocl_runtime_->GetAllocator(); - auto param = reinterpret_cast(op_parameter_); - std::set build_options; use_fp16_ = ocl_runtime_->GetFp16Enable(); + sizeof_FLT_ = use_fp16_ ? 
sizeof(float16_t) : sizeof(float); auto input_tensor = in_tensors_[0]; auto output_tensor = out_tensors_[0]; - batch_size_ = input_tensor->Batch(); CI_ = input_tensor->Channel(); IH_ = input_tensor->Height(); @@ -55,8 +53,8 @@ int ConvolutionOpenCLKernel::Init() { OW_ = output_tensor->Width(); CI_SLICES_ = UP_DIV(CI_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM); - KH_ = param->kernel_h_; - KW_ = param->kernel_w_; + KH_ = param_->kernel_h_; + KW_ = param_->kernel_w_; has_bias_ = in_tensors_.size() == 3; // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true @@ -65,28 +63,39 @@ int ConvolutionOpenCLKernel::Init() { TILES_XY_ = TILES_X_ * TILES_Y_; use_winograd_ = UseWinograd4x4To6x6(); + if (!use_winograd_) { + SetBlockSize(); + SetGlobalLocal(); + } + // build kernel - std::string program_name = "Convolution"; - ocl_runtime_->LoadSource(program_name, convolution_source); + std::set build_options; if (use_winograd_) { MS_LOG(DEBUG) << "use winograd"; + std::string program_name = "Winograd"; + ocl_runtime_->LoadSource(program_name, winograd_source); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options); ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options); } else { - ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options); + std::string program_name = "Convolution"; + std::string kernel_name = "Convolution_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + + "C" + std::to_string(block_size_.C); + ocl_runtime_->LoadSource("Convolution", convolution_source); + ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name, build_options); } // allocate winograd memory if (use_winograd_) { + auto allocator = ocl_runtime_->GetAllocator(); size_t img_dtype = use_fp16_ ? 
CL_HALF_FLOAT : CL_FLOAT; - size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT(); + size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT_; size_t width = TILES_XY_; size_t height = CI_SLICES_ * 36; winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); - size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT(); + size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT_; width = TILES_XY_; height = CO_SLICES_ * 36; winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); @@ -156,9 +165,9 @@ int ConvolutionOpenCLKernel::InitWeight() { // allocate memory size_t packed_weight_size; if (use_winograd_) { - packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT(); + packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT_; } else { - packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT(); + packed_weight_size = UP_ROUND(CO_SLICES_, block_size_.C) * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT_; } packed_weight_ = allocator->Malloc(packed_weight_size); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); @@ -171,15 +180,19 @@ int ConvolutionOpenCLKernel::InitWeight() { auto weight_tensor = in_tensors_[1]; if (weight_tensor->data_type() == kNumberTypeFloat16) { if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } else { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } } else { if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } else { - 
ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); + ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, + block_size_.C); } } } @@ -193,14 +206,14 @@ int ConvolutionOpenCLKernel::InitBias() { // align bias from C to C4 auto bias_tensor = in_tensors_[2]; - size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT(); + size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_; packed_bias_ = allocator->Malloc(packed_bias_size); allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true); memset(packed_bias_, 0x00, packed_bias_size); if (bias_tensor->data_type() == kNumberTypeFloat16) { if (use_fp16_) { - memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT()); + memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_); } else { auto packed_bias_fp32 = reinterpret_cast(packed_bias_); auto origin_bias_fp16 = reinterpret_cast(bias_tensor->data_c()); @@ -216,7 +229,7 @@ int ConvolutionOpenCLKernel::InitBias() { packed_bias_fp16[i] = static_cast(origin_bias_fp32[i]); } } else { - memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT()); + memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_); } } allocator->UnmapBuffer(packed_bias_); @@ -231,6 +244,66 @@ int ConvolutionOpenCLKernel::InitBuffer() { return RET_OK; } +void ConvolutionOpenCLKernel::SetBlockSize() { + auto task_size = static_cast(batch_size_ * OH_ * OW_ * CO_SLICES_); + auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits(); + int block_size; + if (task_size_per_cu <= 256) { + block_size = 1; + } else if (task_size_per_cu <= 256 * 4) { + block_size = 2; + } else if (task_size_per_cu <= (use_fp16_ ? 
256 * 8 : FLT_MAX)) { + block_size = 4; + } else { + block_size = 8; + } + + bool w_kernel_is_1 = + KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0; + bool h_kernel_is_1 = + KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0; + if (!w_kernel_is_1 || !h_kernel_is_1) { + block_size = std::min(block_size, 4); + } + + if (block_size == 8) { + block_size_ = {2, 2, 2}; + } else if (block_size == 4) { + block_size_ = {2, 1, 2}; + } else if (block_size == 2) { + block_size_ = {2, 1, 1}; + } else { + block_size_ = {1, 1, 1}; + } +} + +void ConvolutionOpenCLKernel::SetGlobalLocal() { + size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H); + size_t global_w = UP_DIV(OW_, block_size_.W); + size_t global_c = UP_DIV(CO_SLICES_, block_size_.C); + + constexpr int local_c_max = 16; + constexpr int local_hw_max = 256; + constexpr int OH_threshold = 100; + constexpr int OW_threshold = 100; + constexpr int OC_threshold = 64; + size_t local_c = GetMaxDivisor(global_c, local_c_max); + local_c = std::max(local_c, 1); + size_t local_hw = local_hw_max / local_c; + size_t local_h; + size_t local_w; + if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h + local_w = std::min(global_w, local_hw); + local_h = std::min(local_hw / local_w, global_h); + } else { // c -> h -> w + local_h = std::min(global_h, local_hw); + local_w = std::min(local_hw / local_h, global_w); + } + + global_ = {global_h, global_w, global_c}; + local_ = {local_h, local_w, local_c}; +} + int ConvolutionOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto param = reinterpret_cast(op_parameter_); @@ -292,48 +365,9 @@ int ConvolutionOpenCLKernel::Run() { nullptr); ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr); } else { - std::vector global, local; - SetGlobalLocalConv(&global, &local); - 
ocl_runtime_->RunKernel(kernel_conv_, global, local, nullptr); - } - - return RET_OK; -} - -int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std::vector *local) { - constexpr size_t work_group_size[] = {4, 4, 1}; - auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize(); - size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())()); - const size_t max_z_size = std::min(16, max_work_item_sizes[2]); - - size_t global_nh = UP_DIV(batch_size_ * OH_, work_group_size[0]) * work_group_size[0]; - size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; - size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; - - size_t local_c = GetMaxDivisor(global_c, max_z_size); - if (local_c == 0) { - MS_LOG(ERROR) << "Divide by zero"; - return mindspore::lite::RET_ERROR; - } - size_t local_hw_size = std::min(256, max_work_group_size) / local_c; - size_t local_w = std::min(global_w, local_hw_size); - size_t local_nh = std::min(local_hw_size / local_w, global_nh); - if (local_nh == global_nh && global_nh % 2 == 0) { - local_nh = global_nh / 2; + ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr); } - if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) { - local_w = 4; - } - - global->clear(); - global->push_back(UP_DIV(batch_size_ * OH_, local_nh) * local_nh); - global->push_back(UP_DIV(OW_, local_w) * local_w); - global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c); - local->clear(); - local->push_back(local_nh); - local->push_back(local_w); - local->push_back(local_c); return RET_OK; } @@ -349,37 +383,43 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vectorstride_w_ == 1 && conv_param->pad_u_ == 0 && conv_param->pad_d_ == 0 && conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1; + + OpParameter *real_param; if (is_hw1 && is_pad_stride_ok) { - auto param = 
static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-    if (param == nullptr) {
+    auto fc_param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
+    if (fc_param == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel param failed!";
       return nullptr;
     }
-    param->op_parameter_.type_ = PrimitiveType_FullConnection;
-    param->a_transpose_ = false;
-    param->b_transpose_ = true;
-    param->act_type_ = conv_param->act_type_;
-    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+    fc_param->op_parameter_.type_ = PrimitiveType_FullConnection;
+    fc_param->a_transpose_ = false;
+    fc_param->b_transpose_ = true;
+    fc_param->act_type_ = conv_param->act_type_;
+    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(fc_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(fc_param);
     if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel failed!";
-      free(param);
-      free(opParameter);
+      free(fc_param);
+      free(conv_param);
       return nullptr;
     } else {
-      free(opParameter);
+      free(conv_param);
     }
   } else {
-    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(conv_param);
     if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!";
-      free(opParameter);
+      free(conv_param);
       return nullptr;
     }
   }
+
   auto ret = kernel->Init();
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
     delete kernel;
+    free(real_param);
     return nullptr;
   }
   return kernel;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index bd989f3cd2..d273e5ec2b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -31,7 +31,7 
@@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  public:
   ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                           const std::vector<lite::Tensor *> &outputs)
-      : OpenCLKernel(parameter, inputs, outputs) {}
+      : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
   ~ConvolutionOpenCLKernel() override = default;
 
   int Init() override;
@@ -39,26 +39,32 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int InitBuffer() override;
 
  private:
+  void SetBlockSize();
+  void SetGlobalLocal();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
-  int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
-
-  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
 
   bool UseWinograd4x4To6x6() {
-    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
-                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
-                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
-                            IW_ == OW_ && batch_size_ == 1;
+    const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
+                            param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 &&
+                            param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 &&
+                            param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1;
     const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
 
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+  std::vector<size_t> global_;
+  std::vector<size_t> local_;
+  bool use_fp16_{false};
+  size_t sizeof_FLT_{4};
+  ConvParameter *param_{nullptr};
   int batch_size_{};
   int CI_{};
   int IH_{};
@@ -81,9 +87,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   void *winograd_mem0_{nullptr};
   void 
*winograd_mem1_{nullptr}; - cl::Kernel kernel_4x4to36_; - cl::Kernel kernel_conv_; - cl::Kernel kernel_36to4x4_; + struct { + int H{1}; + int W{1}; + int C{1}; + } block_size_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index abe32e558d..f29e2965a1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -38,30 +38,29 @@ struct OpenCLToFormatParameter { struct Image2DInfo { explicit Image2DInfo(const lite::Tensor *tensor) { - if (tensor != nullptr) { - auto shape = tensor->shape(); - if (shape.size() == 1) { - N = shape[0]; - } else if (shape.size() == 2) { - N = shape[0]; - C = shape[1]; - } else if (shape.size() == 3) { - N = shape[0]; - W = shape[1]; - C = shape[2]; - } else if (shape.size() == 4) { - N = shape[0]; - H = shape[1]; - W = shape[2]; - C = shape[3]; - } else if (shape.size() >= 5) { - MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size(); - } - FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float); - } else { - FLT_size = sizeof(cl_float); + if (tensor == nullptr) { + return; } + auto shape = tensor->shape(); + if (shape.size() == 1) { + N = shape[0]; + } else if (shape.size() == 2) { + N = shape[0]; + C = shape[1]; + } else if (shape.size() == 3) { + N = shape[0]; + W = shape[1]; + C = shape[2]; + } else if (shape.size() == 4) { + N = shape[0]; + H = shape[1]; + W = shape[2]; + C = shape[3]; + } else if (shape.size() >= 5) { + MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size(); + } + FLT_size = tensor->data_type() == kNumberTypeFloat16 ? 
sizeof(cl_half) : sizeof(cl_float); FLT4_size = FLT_size * 4; Slice = UP_DIV(C, C4NUM); if (W * Slice <= MAX_IMAGE2D_SIZE) { @@ -72,16 +71,19 @@ struct Image2DInfo { width = N * H * Slice; } - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); - row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size; - ElementsNum = N * H * W * C; ElementsC4Num = N * H * W * Slice * C4NUM; OriginSize = ElementsNum * FLT_size; Image2DSize = height * width * FLT4_size; } + size_t RowPitch() const { + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); + size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size; + return row_pitch; + } + size_t N{1}; size_t H{1}; size_t W{1}; @@ -89,9 +91,8 @@ struct Image2DInfo { size_t Slice{}; size_t width{}; size_t height{}; - size_t FLT_size{}; - size_t FLT4_size{}; - size_t row_pitch{}; + size_t FLT_size{4}; + size_t FLT4_size{16}; size_t ElementsNum{}; size_t ElementsC4Num{}; size_t OriginSize{}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index 641d6b7031..8c3957307b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -262,7 +262,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons auto row_size = img_info.width * img_info.FLT4_size; for (int i = 0; i < img_info.height; ++i) { memcpy(reinterpret_cast(data.data()) + i * row_size, - static_cast(tensor->data_c()) + i * img_info.row_pitch, row_size); + static_cast(tensor->data_c()) + i * img_info.RowPitch(), row_size); } } allocator->UnmapBuffer(tensor->data_c());