diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
index 43a5a0306b..47cf881505 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
@@ -1,150 +1,282 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
 #define CI_TILE 4
 #define CO_TILE 4
+#define MAX_IMAGE2D_SIZE 65535
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-// #define __global
-// #pragma OPENCL EXTENSION cl_arm_printf : enable
-__kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias,
-                                    __global float *output,
-                                    const int4 input_shape,    // NHWC
-                                    const int4 output_shape,   // NHWC
-                                    const int4 kernel_stride,  // kernelHW_strideHW
-                                    const int4 pad) {
-  int ow = get_global_id(0);
-  int oh = get_global_id(1);
-  int co_slice = get_global_id(2);
-
-  int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
-  int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
-  int KH = kernel_stride.x, KW = kernel_stride.y;
-  int strideH = kernel_stride.z, strideW = kernel_stride.w;
-  int padTop = pad.x, padLeft = pad.z;
-  int CI_SLICES = UP_DIV(CI, CI_TILE);
-  int CO_SLICES = UP_DIV(CO, CO_TILE);
-
-  if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) return;
-
-  float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+#define ActType_No 0
+#define ActType_Relu 1
+#define ActType_Sigmod 2
+#define ActType_Relu6 3
+
+__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                          __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                          const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+  const int N = input_shape.x;
+  const int IH = input_shape.y;
+  const int IW = input_shape.z;
+  const int CI_SLICES = input_shape.w;
+
+  const int OH = output_shape.y;
+  const int OW = output_shape.z;
+  const int CO_SLICES = output_shape.w;
+
+  const int KH = kernel_stride.x;
+  const int KW = kernel_stride.y;
+  const int strideH = kernel_stride.z;
+  const int strideW = kernel_stride.w;
+
+  const int padTop = pad.x;
+  const int padBottom = pad.y;
+  const int padLeft = pad.z;
+  const int padRight = pad.w;
+
+  const int dilationH = dilation.x;
+  const int dilationW = dilation.y;
+
+  int n_oh = get_global_id(0);      // [0, N*OH)
+  int ow = get_global_id(1);        // [0, OW)
+  int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )
+  int n;
+  int oh;
+  if (N == 1) {
+    n = 0;
+    oh = n_oh;
+  } else {
+    n = n_oh / OH;
+    oh = n_oh % OH;
+  }
+  if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
+    return;
+  }
+
+  FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  __global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;
   for (int kh = 0; kh < KH; ++kh) {
-    int ih = kh + oh * strideH - padTop;
+    int ih = kh * dilationH + oh * strideH - padTop;
     for (int kw = 0; kw < KW; ++kw) {
-      int iw = kw + ow * strideW - padLeft;
-      for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) {
-        for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) {
-          int ci = ci_slice * CI_TILE + ci_inner;
-          if (ci >= CI) break;
-
-          int input_idx = ih * IW * CI + iw * CI + ci;
-          float value = 0;
-          if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
-            value = 0;
-          else
-            value = input[input_idx];
-
-          int CO_OFFSET = KH * KW * CI;
-          int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci;
-          acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
-          acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
-          acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
-          acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
+      int iw = kw * dilationW + ow * strideW - padLeft;
+      if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
+        for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+          FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih));
+          out_c4 += w_ic1_oc4[0] * in_c4.x;
+          out_c4 += w_ic1_oc4[1] * in_c4.y;
+          out_c4 += w_ic1_oc4[2] * in_c4.z;
+          out_c4 += w_ic1_oc4[3] * in_c4.w;
+          w_ic1_oc4 += 4;
         }
+      } else {
+        w_ic1_oc4 += 4 * CI_SLICES;
       }
     }
   }
-  int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
-  if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) {
-    output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
-    output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
-    output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
-    output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
-  } else if (CO % CO_TILE == 1) {
-    output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
-  } else if (CO % CO_TILE == 2) {
-    output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
-    output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
-  } else if (CO % CO_TILE == 3) {
-    output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
-    output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
-    output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
+
+  if (bias) {
+    out_c4 = out_c4 + bias[co_slice];
+  }
+
+  // activation
+  if (act_type == ActType_Relu) {
+    out_c4 = max(out_c4, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+
+  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
+    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4);
+
+  } else {
+    WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4);
   }
 }
 
-// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-// #define FLT4 half4
-#define FLT4 float4
-__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias,
-                                              __global FLT4 *output,
-                                              const int4 input_shape,    // NHWC
-                                              const int4 output_shape,   // NHWC
-                                              const int4 kernel_stride,  // kernelHW_strideHW
-                                              const int4 pad) {
-  int oh = get_global_id(0);        // [0, OH)
-  int ow = get_global_id(1);        // [0, OW)
-  int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )
+constant FLT Bt[36] = {
+  1.0000000000f, 0.0000000000f,  -2.5000004768f, -0.0000001192f, 1.0000001192f,  0.0000000000f,
+  0.0000000000f, 0.9428091049f,  1.3333333731f,  -0.4714044929f, -0.6666667461f, 0.0000000000f,
+  0.0000000000f, -0.9428089857f, 1.3333334923f,  0.4714045525f,  -0.6666667461f, 0.0000000000f,
+  0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f,  0.1666666865f,  0.0000000000f,
+  0.0000000000f, 0.1178511307f,  -0.0833333507f, -0.2357022911f, 0.1666666865f,  0.0000000000f,
+  0.0000000000f, 0.9999998808f,  -0.0000000596f, -2.5000000000f, 0.0000000000f,  1.0000000000f,
+};
 
-  int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
-  int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
-  int CI_SLICES = UP_DIV(CI, CI_TILE);
-  int CO_SLICES = UP_DIV(CO, CO_TILE);
-  int KH = kernel_stride.x, KW = kernel_stride.y;
-  int strideH = kernel_stride.z, strideW = kernel_stride.w;
-  int padTop = pad.x, padLeft = pad.z;
-
-  if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) return;
-  if (2 * co_slice + 1 >= CO_SLICES) {
-    FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
-    for (int kh = 0; kh < KH; ++kh) {
-      int ih = kh + oh * strideH - padTop;
-      for (int kw = 0; kw < KW; ++kw) {
-        int iw = kw + ow * strideW - padLeft;
-        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
-          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
-            FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
-            out0_c4 += w0_ic1_oc4[0] * in_c4.x;
-            out0_c4 += w0_ic1_oc4[1] * in_c4.y;
-            out0_c4 += w0_ic1_oc4[2] * in_c4.z;
-            out0_c4 += w0_ic1_oc4[3] * in_c4.w;
-            w0_ic1_oc4 += 4;
-          }
-        } else {
-          w0_ic1_oc4 += 4 * CI_SLICES;
-        }
+__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
+                              const int4 input_shape,     // N H W CI_SLICES
+                              const int4 output_shape) {  // N 36 H/4*W/4 CI_SLICES
+#define PAD 1
+  int tile_xy = get_global_id(0);
+  int row = get_global_id(1);
+  int slice = get_global_id(2);
+
+  int TILE_XY = output_shape.z;
+  int SLICES = input_shape.w;
+  if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) {
+    return;
+  }
+
+  int IH = input_shape.y, IW = input_shape.z;
+  int TILE_X = UP_DIV(IW, 4);
+  int tile_x = tile_xy % TILE_X;
+  int tile_y = tile_xy / TILE_X;
+
+  constant FLT *Bt_row = Bt + row * 6;
+  FLT4 BtD_row[6] = {0};
+  for (int y = 0; y < 6; y++) {
+    int ih = tile_y * 4 - PAD + y;
+
+    // Format_NHWC4
+    int y_idx = ih;
+    // Format_NC4HW4
+    // if (ih < 0 || ih >= IH) { continue;}
+    // int y_idx = slice * IH + ih;
+
+    for (int x = 0; x < 6; x++) {
+      int iw = tile_x * 4 - PAD + x;
+
+      // Format_NHWC4
+      if (iw < 0 || iw >= IW) {
+        continue;
       }
+      int x_idx = iw * SLICES + slice;
+      // Format_NC4HW4
+      // int x_idx = iw;
+
+      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, y_idx));
     }
-    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
-  } else {
-    FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
-    __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
-    for (int kh = 0; kh < KH; ++kh) {
-      int ih = kh + oh * strideH - padTop;
-      for (int kw = 0; kw < KW; ++kw) {
-        int iw = kw + ow * strideW - padLeft;
-        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
-          int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
-          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
-            FLT4 in_c4 = input[idx + ci_slice];
-
-            out0_c4 += w0_ic1_oc4[0] * in_c4.x;
-            out0_c4 += w0_ic1_oc4[1] * in_c4.y;
-            out0_c4 += w0_ic1_oc4[2] * in_c4.z;
-            out0_c4 += w0_ic1_oc4[3] * in_c4.w;
-            w0_ic1_oc4 += 4;
-
-            out1_c4 += w1_ic1_oc4[0] * in_c4.x;
-            out1_c4 += w1_ic1_oc4[1] * in_c4.y;
-            out1_c4 += w1_ic1_oc4[2] * in_c4.z;
-            out1_c4 += w1_ic1_oc4[3] * in_c4.w;
-            w1_ic1_oc4 += 4;
-          }
-        } else {
-          w0_ic1_oc4 += 4 * CI_SLICES;
-          w1_ic1_oc4 += 4 * CI_SLICES;
-        }
-      }
+  }
+
+  for (int y = 0; y < 6; y++) {
+    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+    for (int x = 0; x < 6; x++) {
+      acc += BtD_row[x] * Bt[y * 6 + x];
+    }
+    WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc);  // CH W  H=36
+  }
+#undef PAD
+}
+
+__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
+                                  const int4 input_shape,     // N 36 H/4*W/4 CI_SLICES
+                                  const int4 output_shape) {  // N 36 H/4*W/4 CO_SLICES
+#define H 36
+  int w = get_global_id(0) * 2;
+  int h = get_global_id(1);
+  int co_slice = get_global_id(2) * 2;
+
+  int CI_SLICES = input_shape.w;
+  int W = input_shape.z;
+  int CO_SLICES = output_shape.w;
+
+  if (h >= H || w >= W || co_slice >= CO_SLICES) {
+    return;
+  }
+
+  FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+  int y_idx = h;
+  __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx));
+    FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx));
+    y_idx += 36;
+
+    FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
+    weight_ptr += 2;
+
+    out00 += in0.x * weight0.s0123;
+    out00 += in0.y * weight0.s4567;
+    out00 += in0.z * weight0.s89ab;
+    out00 += in0.w * weight0.scdef;
+
+    out01 += in1.x * weight0.s0123;
+    out01 += in1.y * weight0.s4567;
+    out01 += in1.z * weight0.s89ab;
+    out01 += in1.w * weight0.scdef;
+
+    out10 += in0.x * weight1.s0123;
+    out10 += in0.y * weight1.s4567;
+    out10 += in0.z * weight1.s89ab;
+    out10 += in0.w * weight1.scdef;
+
+    out11 += in1.x * weight1.s0123;
+    out11 += in1.y * weight1.s4567;
+    out11 += in1.z * weight1.s89ab;
+    out11 += in1.w * weight1.scdef;
+  }
+
+  WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
+  if (w + 1 < W) {
+    WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
+  }
+
+  if (co_slice + 1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
+    if (w + 1 < W) {
+      WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
+    }
+  }
+#undef H
+}
+
+constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f,  1.0000000000f, 1.0000000000f,  0.0000000000f,
+                       0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
+                       0.0000000000f, 0.4999999702f, 0.4999999702f,  1.9999998808f, 1.9999998808f,  0.0000000000f,
+                       0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
+
+__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
+                              const int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
+                              const int4 output_shape,  // N H W CO_SLICES
+                              const int act_type) {
+  int tile_xy = get_global_id(0);
+  int row = get_global_id(1);
+  int slice = get_global_id(2);
+
+  int TILE_XY = input_shape.z;
+  int SLICES = input_shape.w;
+  int OH = output_shape.y;
+  int OW = output_shape.z;
+
+  if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) {
+    return;
+  }
+
+  constant FLT *At_row = At + row * 6;
+  FLT4 AtM_row[6] = {0};
+  for (int y = 0; y < 6; y++) {
+    for (int x = 0; x < 6; x++) {
+      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, slice * 36 + y * 6 + x));
+    }
+  }
+
+  int TILE_X = UP_DIV(OW, 4);
+  for (int x = 0; x < 4; x++) {
+    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+    for (int y = 0; y < 6; y++) {
+      acc += AtM_row[y] * At[x * 6 + y];
+    }
+    if (bias) {
+      acc += bias[slice];
+    }
+    if (act_type == ActType_Relu) {
+      acc = max(acc, (FLT4)(0.0f));
+    } else if (act_type == ActType_Relu6) {
+      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
+    }
+    int tile_x = tile_xy % TILE_X;
+    int tile_y = tile_xy / TILE_X;
+    int ow = tile_x * 4 + x;
+    int oh = tile_y * 4 + row;
+
+    // Format_NHWC4
+    if (ow < OW) {
+      WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);
     }
-    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
-    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
+    // Format_NC4HW4
+    //    if (oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 6a8c552333..bfe799f970 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -22,14 +22,13 @@
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#include "src/runtime/kernel/opencl/cl/convolution.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
-using mindspore::schema::Format::Format_NC4HW4;
-using mindspore::schema::Format::Format_NHWC4;
 
 namespace mindspore::kernel {
 
@@ -65,25 +64,14 @@ int ConvolutionOpenCLKernel::Init() {
   use_winograd_ = UseWinograd4x4To6x6();
 
   // build kernel
-  auto code_id = get_code_id();
-  std::string program_name;
+  std::string program_name = "Convolution";
+  ocl_runtime_->LoadSource(program_name, convolution_source);
   if (use_winograd_) {
     MS_LOG(DEBUG) << "use winograd";
-    program_name = "Winograd4x4To36" + code_id;
-    ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36());
     ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
-
-    program_name = "WinogradConvolution" + code_id;
-    ocl_runtime_->LoadSource(program_name, CodeGenWinogradConvolution());
     ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
-
-    program_name = "Winograd36To4x4" + code_id;
-    ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4());
     ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
   } else {
-    program_name = "Convolution" + code_id;
-    std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
-    ocl_runtime_->LoadSource(program_name, source);
     ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
   }
 
@@ -243,15 +231,23 @@ int ConvolutionOpenCLKernel::InitBuffer() {
 
 int ConvolutionOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
+  cl_int act_type = 0;
+  if (param->act_type_ == ActType_Relu) {
+    act_type = 1;
+  } else if (param->act_type_ == ActType_Relu6) {
+    act_type = 3;
+  }
+  cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
+  cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
 
-  int arg_cn = 0;
+  int arg_cn;
   if (use_winograd_) {
     arg_cn = 0;
-    cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
     cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
     ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
+    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
     ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
 
     arg_cn = 0;
@@ -265,28 +261,27 @@ int ConvolutionOpenCLKernel::Run() {
 
     arg_cn = 0;
     cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
-    cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    if (has_bias_) {
-      ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    }
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, act_type);
   } else {
     arg_cn = 0;
+    cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_};
+    cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
+    cl_int2 dilation = {param->dilation_h_, param->dilation_w_};
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    if (has_bias_) {
-      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    }
-    if (op_format_ == Format_NC4HW4) {
-      cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
-      cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
-      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
-      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
-    }
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, act_type);
   }
 
   if (use_winograd_) {
@@ -303,549 +298,6 @@ int ConvolutionOpenCLKernel::Run() {
   return RET_OK;
 }
 
-std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t CI_ALIGN = CI_SLICES_ * C4NUM;
-  const size_t CO_ALIGN = CO_SLICES_ * C4NUM;
-  const size_t strideH = param->stride_h_;
-  const size_t strideW = param->stride_w_;
-  const size_t padTop = param->pad_u_;
-  const size_t padBottom = param->pad_d_;
-  const size_t padLeft = param->pad_l_;
-  const size_t padRight = param->pad_r_;
-
-  std::string code;
-  code += "#define CI_TILE 4\n";
-  code += "#define CO_TILE 4\n\n";
-  code += "#define N " + std::to_string(batch_size_) + "\n";
-  code += "#define N_OH " + std::to_string(batch_size_ * OH_) + "\n";
-  code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
-  code += "#define IH " + std::to_string(IH_) + "\n";
-  code += "#define IW " + std::to_string(IW_) + "\n";
-  code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
-  code += "#define OH " + std::to_string(OH_) + "\n";
-  code += "#define OW " + std::to_string(OW_) + "\n";
-  code += "#define KH " + std::to_string(KH_) + "\n";
-  code += "#define KW " + std::to_string(KW_) + "\n";
-  code += "#define strideH " + std::to_string(strideH) + "\n";
-  code += "#define strideW " + std::to_string(strideW) + "\n";
-  code += "#define padTop " + std::to_string(padTop) + "\n";
-  code += "#define padBottom " + std::to_string(padBottom) + "\n";
-  code += "#define padLeft " + std::to_string(padLeft) + "\n";
-  code += "#define padRight " + std::to_string(padRight) + "\n";
-  code += "#define dilationH " + std::to_string(param->dilation_h_) + "\n";
-  code += "#define dilationW " + std::to_string(param->dilation_w_) + "\n";
-  code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n";
-  code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n";
-
-  if (use_fp16_) {
-    code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
-  }
-
-  code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n";
-
-  code +=
-    "__kernel void Convolution(__read_only image2d_t input,\n"
-    "                          __write_only image2d_t output,\n";
-  if (has_bias_) {
-    code +=
-      "                          __global FLT4 *weight,\n"
-      "                          __global FLT4 *bias) {\n";
-  } else {
-    code += "                          __global FLT4 *weight) {\n";
-  }
-
-  code += "    int n_oh = get_global_id(0);  // [0, N*OH)\n";
-  if (batch_size_ == 1) {
-    code += "    #define n 0\n";
-    code += "    int oh = n_oh;\n";
-  } else {
-    code += "    int n = n_oh / " + std::to_string(OH_) + ";\n";
-    code += "    int oh = n_oh % " + std::to_string(OH_) + ";\n";
-  }
-
-  code +=
-    "    int ow = get_global_id(1);  // [0, OW)\n"
-    "    int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n"
-    "\n"
-    "    if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n"
-    "        return;\n"
-    "    }\n"
-    "\n"
-    "    FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-    "    __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n";
-
-  code +=
-    "    for (int kh = 0; kh < KH; ++kh)\n"
-    "    {\n"
-    "        int ih = kh * dilationH + oh * strideH - padTop;\n"
-    "        for (int kw = 0; kw < KW; ++kw)\n"
-    "        {\n"
-    "            int iw = kw * dilationW + ow * strideW - padLeft;\n"
-    "            if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)\n"
-    "            {\n"
-    "                for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
-    "                {\n";
-
-  code +=
-    "FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); // NHWC4: NH WC\n\n";
-
-  code +=
-    "                    out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n"
-    "                    out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n"
-    "                    out0_c4 += w0_ic1_oc4[2] * in_c4.z;\n"
-    "                    out0_c4 += w0_ic1_oc4[3] * in_c4.w;\n"
-    "                    w0_ic1_oc4 += 4;\n"
-    "                }\n"
-    "            }\n"
-    "            else\n"
-    "            {\n"
-    "                w0_ic1_oc4 += 4 * CI_SLICES;\n"
-    "            }\n"
-    "        }\n"
-    "    }\n\n";
-
-  if (has_bias_) {
-    code += "    out0_c4 = out0_c4 + bias[co_slice];\n";
-  }
-
-  if (param->act_type_ == ActType_Relu) {
-    code += "    out0_c4 = max(out0_c4, (FLT4)(0.0f));\n";
-  } else if (param->act_type_ == ActType_Relu6) {
-    code += "    out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n";
-  }
-  if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
-    code += "    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}";
-  } else {
-    code += "    WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}";
-  }
-  return code;
-}
-
-std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t strideH = param->stride_h_;
-  const size_t strideW = param->stride_w_;
-  const size_t padTop = param->pad_u_;
-  const size_t padBottom = param->pad_d_;
-  const size_t padLeft = param->pad_l_;
-
-  std::string code;
-
-  if (use_fp16_) {
-    code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
-  }
-
-  code +=
-    "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
-    "\n"
-    "__kernel void Convolution(__read_only image2d_t input,\n"
-    "                          __write_only image2d_t output,\n"
-    "                          __global FLT4 *weight,\n";
-  if (has_bias_) {
-    code += "                          __global FLT4 *bias,\n";
-  }
-  code +=
-    "                          const int4 input_shape,\n"
-    "                          const int4 output_shape)\n"
-    "{\n";
-
-  code += "    int n_oh = get_global_id(0);  // [0, N*OH)\n";
-  if (batch_size_ == 1) {
-    code += "    #define n 0\n";
-    code += "    int oh = n_oh;\n";
-  } else {
-    code += "    int n = n_oh / " + std::to_string(OH_) + ";\n";
-    code += "    int oh = n_oh % " + std::to_string(OH_) + ";\n";
-  }
-
-  code +=
-    "    int ow = get_global_id(1) * 2;\n"
-    "    int co_slice = get_global_id(2);\n"
-    "\n"
-    "    int CI_SLICES = input_shape.w;\n"
-    "    int CO_SLICES = output_shape.w;\n\n";
-
-  code += "    #define N " + std::to_string(batch_size_) + "\n";
-  code += "    #define N_OH " + std::to_string(batch_size_ * OH_) + "\n";
-  code += "    #define IH " + std::to_string(IH_) + "\n";
-  code += "    #define IW " + std::to_string(IW_) + "\n";
-  code += "    #define OH " + std::to_string(OH_) + "\n";
-  code += "    #define OW " + std::to_string(OW_) + "\n";
-  code += "    #define KH " + std::to_string(KH_) + "\n";
-  code += "    #define KW " + std::to_string(KW_) + "\n";
-  code += "    #define strideH " + std::to_string(strideH) + "\n";
-  code += "    #define strideW " + std::to_string(strideW) + "\n";
-  code += "    #define padTop " + std::to_string(padTop) + "\n";
-  code += "    #define padLeft " + std::to_string(padLeft) + "\n";
-  code += "    #define dilationH " + std::to_string(param->dilation_h_) + "\n";
-  code += "    #define dilationW " + std::to_string(param->dilation_w_) + "\n";
-
-  code +=
-    "    if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n"
-    "        return;\n"
-    "    }\n";
-
-  bool check_ow = (OW_ % 2) == 1;
-  if (check_ow) {
-    code +=
-      "    int last_is_double = 1;\n"
-      "    if (ow + 1 >= OW)\n"
-      "        last_is_double = 0;\n\n";
-  }
-
-  code +=
-    "    FLT4 out0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-    "    FLT4 out1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-    "    __global FLT4 *w = weight + co_slice * KH * KW * CI_SLICES * 4;\n"
-    "\n"
-    "    for (int kh = 0; kh < KH; ++kh)\n"
-    "    {\n"
-    "        int ih = kh * dilationH + oh * strideH - padTop;\n"
-    "        for (int kw = 0; kw < KW; ++kw)\n"
-    "        {\n";
-
-  if (padTop || padBottom) {
-    code +=
-      "if (ih >= 0 && ih < IH)\n"
-      "{\n";
-  }
-
-  code += "            int iw0 = kw * dilationW + (ow + 0) * strideW - padLeft;\n";
-  if (check_ow) {
-    code +=
-      "            if (last_is_double)\n"
-      "            {\n";
-  }
-
-  code +=
-    "                int iw1 = kw * dilationW + (ow + 1) * strideW - padLeft;\n"
-    "                for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
-    "                {\n"
-    "                    FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n"
-    "                    out0 += w[0] * in0.x;\n"
-    "                    out0 += w[1] * in0.y;\n"
-    "                    out0 += w[2] * in0.z;\n"
-    "                    out0 += w[3] * in0.w;\n"
-    "                    FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(iw1, (n * CI_SLICES + ci_slice) * IH + ih));\n"
-    "                    out1 += w[0] * in1.x;\n"
-    "                    out1 += w[1] * in1.y;\n"
-    "                    out1 += w[2] * in1.z;\n"
-    "                    out1 += w[3] * in1.w;\n"
-    "                    w += 4;\n"
-    "                }\n";
-  if (check_ow) {
-    code +=
-      "            }\n"
-      "            else\n"
-      "            {\n"
-      "                for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
-      "                {\n"
-      "                    FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n"
-      "                    out0 += w[0] * in0.x;\n"
-      "                    out0 += w[1] * in0.y;\n"
-      "                    out0 += w[2] * in0.z;\n"
-      "                    out0 += w[3] * in0.w;\n"
-      "                    w += 4;\n"
-      "                }\n"
-      "            }\n";
-  }
-  if (padTop || padBottom) {
-    code +=
-      "}\n"
-      "else\n"
-      "{\n"
-      "    w += CI_SLICES * 4;\n"
-      "}\n";
-  }
-  code +=
-    "        }\n"
-    "    }\n\n";
-
-  if (has_bias_) {
-    code += "    out0 = out0 + bias[co_slice];\n";
-  }
-
-  if (param->act_type_ == ActType_Relu) {
-    code += "    out0 = max(out0, (FLT4)(0.0f));\n";
-  } else if (param->act_type_ == ActType_Relu6) {
-    code += "    out0 = clamp(out0, (FLT4)(0.0f), (FLT4)(6.0f));\n";
-  }
-  code += "    WRITE_IMAGE(output, (int2)(ow + 0, (n * CO_SLICES + co_slice) * OH + oh), out0);\n";
-
-  if (check_ow) {
-    code +=
-      "    if (last_is_double)"
-      "    {\n";
-  }
-  if (has_bias_) {
-    code += "    out1 = out1 + bias[co_slice];\n";
-  }
-  if (param->act_type_ == ActType_Relu) {
-    code += "    out1 = max(out1, (FLT4)(0.0f));\n";
-  } else if (param->act_type_ == ActType_Relu6) {
-    code += "    out1 = clamp(out1, (FLT4)(0.0f), (FLT4)(6.0f));\n";
-  }
-  code += "    WRITE_IMAGE(output, (int2)(ow + 1, (n * CO_SLICES + co_slice) * OH + oh), out1);\n";
-  if (check_ow) {
-    code += "}\n";
-  }
-  code += "}\n";
-
-  return code;
-}
-
-std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
-  std::string code;
-  code +=
-    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-    "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
-    "#define PAD 1\n"
-    "\n"
-    "__constant sampler_t\n"
-    "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
-    "\n"
-    "constant FLT Bt[36] = {\n"
-    "        1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n"
-    "        0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n"
-    "        0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n"
-    "};\n"
-    "\n"
-    "__kernel void Winograd4x4To36(__read_only image2d_t input,\n"
-    "                              __write_only image2d_t output,\n"
-    "                              int4 input_shape,     // N H W CI_SLICES\n"
-    "                              int4 output_shape)    // N 36 H/4*W/4 CI_SLICES\n"
-    "{\n"
-    "    int tile_xy = get_global_id(0);\n"
-    "    int row = get_global_id(1);\n"
-    "    int slice = get_global_id(2);\n"
-    "\n"
-    "    int TILE_XY = output_shape.z;\n"
-    "    int SLICES = input_shape.w;\n"
-    "    if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n"
-    "    {\n"
-    "        return;\n"
-    "    }\n"
-    "\n"
-    "    int IH = input_shape.y, IW = input_shape.z;\n"
-    "    int TILE_X = UP_DIV(IW, 4);\n"
-    "    int tile_x = tile_xy % TILE_X;\n"
-    "    int tile_y = tile_xy / TILE_X;\n"
-    "\n"
-    "    constant FLT *Bt_row = Bt + row * 6;\n"
-    "    FLT4 BtD_row[6] = {0};\n"
-    "    for (int y = 0; y < 6; y++)\n"
-    "    {\n"
-    "        int ih = tile_y * 4 - PAD + y;\n";
-
-  if (op_format_ == Format_NHWC4) {
-    code += "        int y_idx = ih;\n";
-  } else if (op_format_ == Format_NC4HW4) {
-    code +=
-      "        if(ih < 0 || ih >= IH) {continue;}\n"
-      "        int y_idx = slice * IH + ih;\n";
-  }
-
-  code +=
-    "        for (int x = 0; x < 6; x++)\n"
-    "        {\n"
-    "            int iw = tile_x * 4 - PAD + x;\n";
-
-  if (op_format_ == Format_NHWC4) {
-    code +=
-      "            if(iw < 0 || iw >= IW) {continue;}\n"
-      "            int x_idx = iw * SLICES + slice;\n";
-  } else if (op_format_ == Format_NC4HW4) {
-    code += "            int x_idx = iw;\n";
-  }
-
-  code +=
-    "            BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n"
-    "        }\n"
-    "    }\n"
-    "\n"
-    "    for (int y = 0; y < 6; y++)\n"
-    "    {\n"
-    "        FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-    "        for (int x = 0; x < 6; x++)\n"
-    "        {\n"
-    "            acc += BtD_row[x] * Bt[y * 6 + x];\n"
-    "        }\n"
-    "        WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W  H=36\n"
-    "    }\n"
-    "}";
-  return code;
-}
-
-std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
-  return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-         "#define CI_TILE 4\n"
-         "#define H 36\n"
-         "__constant sampler_t\n"
-         "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
-         "\n"
-         "__kernel void WinogradConvolution(__read_only image2d_t input,\n"
-         "                                  __write_only image2d_t output,\n"
-         "                                  __global FLT16 *weight,\n"
-         "                                  int4 input_shape,         // N 36 H/4*W/4 CI_SLICES\n"
-         "                                  int4 output_shape)        // N 36 H/4*W/4 CO_SLICES\n"
-         "{\n"
-         "    int w = get_global_id(0) * 2;\n"
-         "    int h = get_global_id(1);\n"
-         "    int co_slice = get_global_id(2) * 2;\n"
-         "\n"
-         "    int CI_SLICES = input_shape.w;\n"
-         "    int W = input_shape.z;\n"
-         "    int CO_SLICES = output_shape.w;\n"
-         "\n"
-         "    if (h >= H || w >= W || co_slice >= CO_SLICES)\n"
-         "    {\n"
-         "        return;\n"
-         "    }\n"
-         "\n"
-         "    FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-         "    FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-         "    FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-         "    FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-         "\n"
-         "    int y_idx = h;\n"
-         "    __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n"
-         "    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
-         "    {\n"
-         "        FLT4 in0 = READ_IMAGE(input, smp_none, (int2)(w + 0, y_idx));\n"
-         "        FLT4 in1 = READ_IMAGE(input, smp_none, (int2)(w + 1, y_idx));\n"
-         "        y_idx += 36;\n"
-         "\n"
-         "        FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n"
-         "        weight_ptr += 2;\n"
-         "\n"
-         "\n"
-         "        out00 += in0.x * weight0.s0123;\n"
-         "        out00 += in0.y * weight0.s4567;\n"
-         "        out00 += in0.z * weight0.s89ab;\n"
-         "        out00 += in0.w * weight0.scdef;\n"
-         "\n"
-         "        out01 += in1.x * weight0.s0123;\n"
-         "        out01 += in1.y * weight0.s4567;\n"
-         "        out01 += in1.z * weight0.s89ab;\n"
-         "        out01 += in1.w * weight0.scdef;\n"
-         "\n"
-         "        out10 += in0.x * weight1.s0123;\n"
-         "        out10 += in0.y * weight1.s4567;\n"
-         "        out10 += in0.z * weight1.s89ab;\n"
-         "        out10 += in0.w * weight1.scdef;\n"
-         "\n"
-         "        out11 += in1.x * weight1.s0123;\n"
-         "        out11 += in1.y * weight1.s4567;\n"
-         "        out11 += in1.z * weight1.s89ab;\n"
-         "        out11 += in1.w * weight1.scdef;\n"
-         "    }\n"
-         "\n"
-         "    WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n"
-         "    if (w + 1 < W)\n"
-         "    {\n"
-         "        WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n"
-         "    }\n"
-         "\n"
-         "    if (co_slice + 1 < CO_SLICES)\n"
-         "    {\n"
-         "        WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n"
-         "        if (w + 1 < W)\n"
-         "        {\n"
-         "            WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n"
-         "        }\n"
-         "    }\n"
-         "}";
-}
-
-std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
-  std::string code =
-    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-    "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
-    "\n"
-    "__constant sampler_t\n"
-    "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
-    "\n"
-    "constant FLT At[24] = {\n"
-    "        1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n"
-    "        0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n"
-    "};\n"
-    "\n"
-    "__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
-    "                              __write_only image2d_t output,\n";
-  if (has_bias_) {
-    code += "                              __global FLT4 *bias,\n";
-  }
-  code +=
-    "                              int4 input_shape,      // N 36 H/4*W/4 CO_SLICES\n"
-    "                              int4 output_shape)     // N H W CO_SLICES\n"
-    "{\n"
-    "    int tile_xy = get_global_id(0);\n"
-    "    int row = get_global_id(1);\n"
-    "    int slice = get_global_id(2);\n"
-    "\n"
-    "    int TILE_XY = input_shape.z;\n"
-    "    int SLICES = input_shape.w;\n"
-    "    int OH = output_shape.y;\n"
-    "    int OW = output_shape.z;\n"
-    "\n"
-    "    if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n"
-    "    {\n"
-    "        return;\n"
-    "    }\n"
-    "\n"
-    "    constant FLT *At_row = At + row * 6;\n"
-    "    FLT4 AtM_row[6] = {0};\n"
-    "    for (int y = 0; y < 6; y++)\n"
-    "    {\n"
-    "        for (int x = 0; x < 6; x++)\n"
-    "        {\n"
-    "            AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + x));\n"
-    "        }\n"
-    "    }\n"
-    "\n"
-    "    int TILE_X = UP_DIV(OW, 4);\n"
-    "    for (int x = 0; x < 4; x++)\n"
-    "    {\n"
-    "        FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
-    "        for (int y = 0; y < 6; y++)\n"
-    "        {\n"
-    "            acc += AtM_row[y] * At[x * 6 + y];\n"
-    "        }\n";
-  if (has_bias_) {
-    code += "        acc += bias[slice];\n";
-  }
-
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  if (param->act_type_ == ActType_Relu) {
-    code += "        acc = max(acc, (FLT4)(0.0f));\n\n";
-  } else if (param->act_type_ == ActType_Relu6) {
-    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n\n";
-  }
-
-  code +=
-    "        int tile_x = tile_xy % TILE_X;\n"
-    "        int tile_y = tile_xy / TILE_X;\n"
-    "        int ow = tile_x * 4 + x;\n"
-    "        int oh = tile_y * 4 + row;\n";
-
-  if (op_format_ == Format_NHWC4) {
-    code += "        if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n";
-  } else if (op_format_ == Format_NC4HW4) {
-    code += "        if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n";
-  }
-
-  code +=
-    "    }\n"
-    "}";
-  return code;
-}
-
 int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
   constexpr size_t work_group_size[] = {4, 4, 1};
   auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize();
@@ -868,10 +320,8 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
     local_nh = global_nh / 2;
   }
 
-  if (op_format_ == Format_NHWC4) {
-    if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
-      local_w = 4;
-    }
+  if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
+    local_w = 4;
   }
 
   global->clear();
@@ -882,14 +332,6 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   local->push_back(local_nh);
   local->push_back(local_w);
   local->push_back(local_c);
-
-  if (op_format_ == Format_NC4HW4) {
-    // calculate 2 FLT4 along width per work-item
-    global->at(1) = UP_DIV(global->at(1), 2);
-    if (local->at(1) > global->at(1)) {
-      local->at(1) = global->at(1);
-    }
-  }
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 6191872d64..75c1094193 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -42,11 +42,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
-  std::string CodeGenConvolutionNHWC4();
-  std::string CodeGenConvolutionNC4HW4();
-  std::string CodeGenWinograd4x4To36();
-  std::string CodeGenWinogradConvolution();
-  std::string CodeGenWinograd36To4x4();
   int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
 
   size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
@@ -62,38 +57,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     return attr_valid && channel_good && hw_good;
   }
 
-  std::string get_code_id() {
-    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    std::vector<int> vpara{batch_size_,
-                           CI_,
-                           IH_,
-                           IW_,
-                           CO_,
-                           OH_,
-                           OW_,
-                           KH_,
-                           KW_,
-                           param->stride_h_,
-                           param->stride_w_,
-                           param->pad_u_,
-                           param->pad_l_,
-                           param->pad_d_,
-                           param->pad_r_,
-                           param->dilation_h_,
-                           param->dilation_w_,
-                           has_bias_,
-                           use_fp16_,
-                           op_format_,
-                           param->act_type_};
-    std::string code_id;
-    for (auto &iv : vpara) {
-      code_id += "_" + std::to_string(iv);
-    }
-    return code_id;
-  }
-
   bool use_fp16_{false};
-  const schema::Format op_format_{schema::Format_NHWC4};
 
   int batch_size_{};
   int CI_{};
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index 67185d8718..da72fe4b21 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -16,7 +16,6 @@
 #include <memory>
 #include "src/common/log_adapter.h"
 #include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
@@ -29,10 +28,7 @@ using mindspore::lite::Tensor;
 using mindspore::schema::Format;
 using mindspore::schema::NodeType_ValueNode;
 using mindspore::schema::Format::Format_KHWC;
-using mindspore::schema::Format::Format_NC4HW4;
-using mindspore::schema::Format::Format_NCHW;
 using mindspore::schema::Format::Format_NHWC;
-using mindspore::schema::Format::Format_NHWC4;
 
 namespace mindspore {
 
@@ -67,70 +63,38 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
   }
   printf("\n");
 
-  float max_err = -1.0f;
-  std::array<int, 5> idx_5d{};
-  int max_err_idx = -1, first_err_idx = -1;
-  auto SLICES = UP_DIV(output->Channel(), 4);
-  int I = 1, J = 1, K = 1, L = 1, M = 1;
-  switch (output->GetFormat()) {
-    case Format_NHWC:
-      I = output->Batch(), J = output->Height(), K = output->Width(), L = output->Channel();
-      break;
-    case Format_NCHW:
-      I = output->Batch(), J = output->Channel(), K = output->Height(), L = output->Width();
-      break;
-    case Format_NHWC4:
-      I = output->Batch(), J = output->Height(), K = output->Width(), L = SLICES, M = 4;
-      break;
-    case Format_NC4HW4:
-      I = output->Batch(), J = SLICES, K = output->Height(), L = output->Width(), M = 4;
-      break;
-    default:
-      break;
-  }
-
-  int cn = 0;
-  for (int i = 0; i < I; ++i) {
-    for (int j = 0; j < J; ++j) {
-      for (int k = 0; k < K; ++k) {
-        for (int l = 0; l < L; ++l) {
-          for (int m = 0; m < M; ++m) {
-            auto err = std::fabs(output_data[cn] - expect_data[cn]);
-            if (first_err_idx == -1 && err > atol) {
-              first_err_idx = cn;
-            }
-            if (err > max_err) {
-              max_err = err;
-              idx_5d = {i, j, k, l, m};
-              max_err_idx = cn;
-            }
-            cn++;
+  bool not_equal = false;
+  int idx = 0;
+  std::array<int, 4> idx_4d{};
+  auto N = output->Batch(), H = output->Height(), W = output->Width(), C = output->Channel();
+  for (int i = 0, cn = 0; i < N; ++i) {
+    for (int j = 0; j < H; ++j) {
+      for (int k = 0; k < W; ++k) {
+        for (int l = 0; l < C; ++l) {
+          auto err = std::fabs(output_data[cn] - expect_data[cn]);
+          if (err > atol) {
+            not_equal = true;
+            idx_4d = {i, j, k, l};
+            goto End;
           }
+          cn++;
         }
       }
     }
   }
 
-  if (max_err > atol) {
-    printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
-           output_data[first_err_idx]);
+End:
+  if (not_equal) {
+    printf("first error at [%d %d %d %d] expect=%.3f output=%.3f\n", idx_4d[0], idx_4d[1], idx_4d[2], idx_4d[3],
+           expect_data[idx], output_data[idx]);
     FAIL();
   } else {
-    float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
-    if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
-      printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
-    } else {
-      printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
-    }
-    printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
-           output_data[max_err_idx], max_err, relative_err * 100);
     printf("COMPARE SUCCESS!\n\n");
   }
 }
 
-void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format,
-               const TypeId data_type, const float atol, const float *input_data, const float *weight_data,
-               const float *bias_data, const float *expect_data) {
+void TEST_MAIN(const std::string &attr, const TypeId data_type, const float atol, const float *input_data,
+               const float *weight_data, const float *bias_data, const float *expect_data) {
   auto param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
   if (param == nullptr) {
     MS_LOG(ERROR) << "ConvParameter create error.";
@@ -145,7 +109,8 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
          &param->dilation_h_, &param->dilation_w_);
 
   MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
+  auto ocl_runtime = runtime_wrapper.GetInstance();
   ocl_runtime->Init();
   ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16);
   auto allocator = ocl_runtime->GetAllocator();
@@ -155,19 +120,24 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
   std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
   std::vector<int> bias_shape = {param->output_channel_};
   std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
-  auto input = Tensor(data_type, input_shape, input_format, lite::TensorCategory(NodeType_ValueNode));
+  auto input = Tensor(data_type, input_shape, Format_NHWC, lite::TensorCategory(NodeType_ValueNode));
   auto weight = Tensor(data_type, weight_shape, Format_KHWC, lite::TensorCategory(NodeType_ValueNode));
   auto bias = Tensor(data_type, bias_shape, Format_KHWC, lite::TensorCategory(NodeType_ValueNode));
-  auto output = Tensor(data_type, output_shape, output_format, lite::TensorCategory(NodeType_ValueNode));
+  auto output = Tensor(data_type, output_shape, Format_NHWC, lite::TensorCategory(NodeType_ValueNode));
 
   MS_LOG(DEBUG) << "allocate memory and initialize weight/bias";
   weight.MallocData();
-  bias.MallocData();
   LoadData(&weight, weight_data);
-  LoadData(&bias, bias_data);
+  if (bias_data) {
+    bias.MallocData();
+    LoadData(&bias, bias_data);
+  }
 
   MS_LOG(DEBUG) << "create OpenCL Kernel";
-  std::vector<lite::Tensor *> inputs{&input, &weight, &bias};
+  std::vector<lite::Tensor *> inputs{&input, &weight};
+  if (bias_data) {
+    inputs.push_back(&bias);
+  }
   std::vector<lite::Tensor *> outputs{&output};
   auto kernel = std::make_unique<ConvolutionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
   kernel->Init();
@@ -186,132 +156,56 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
 
   MS_LOG(DEBUG) << "release resources";
   weight.FreeData();
-  bias.FreeData();
-  input.SetData(nullptr);
-  output.SetData(nullptr);
+  if (bias_data) {
+    bias.FreeData();
+  }
   delete sub_graph;
 }
 
-void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format,
-               const TypeId data_type, const float atol, const std::string &data_path) {
-  auto testcase_path = data_path + "/" + attr + "/";
-  std::map<Format, std::string> format_str{
-    {Format_NCHW, "NCHW"}, {Format_NHWC, "NHWC"}, {Format_NHWC4, "NHWC4"}, {Format_NC4HW4, "NC4HW4"}};
-  auto input_file = testcase_path + "input_" + format_str[input_format] + ".bin";
-  auto weight_file = testcase_path + "weight_OHWI.bin";
-  auto bias_file = testcase_path + "bias_C.bin";
-  auto expect_file = testcase_path + "expect_" + format_str[output_format] + ".bin";
-  MS_LOG(DEBUG) << "input_file  :" << input_file;
-  MS_LOG(DEBUG) << "weight_file :" << weight_file;
-  MS_LOG(DEBUG) << "bias_file   :" << bias_file;
-  MS_LOG(DEBUG) << "expect_file :" << expect_file;
-
-  size_t dst_size;
-  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_file.c_str(), &dst_size));
-  auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_file.c_str(), &dst_size));
-  auto bias_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(bias_file.c_str(), &dst_size));
-  auto expect_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(expect_file.c_str(), &dst_size));
-  printf("input [0-3]: %7.3f  %7.3f  %7.3f\n", input_data[0], input_data[1], input_data[2]);
-  printf("weight[0-3]: %7.3f  %7.3f  %7.3f\n", weight_data[0], weight_data[1], weight_data[2]);
-  printf("bias  [0-3]: %7.3f  %7.3f  %7.3f\n", bias_data[0], bias_data[1], bias_data[2]);
-  printf("expect[0-3]: %7.3f  %7.3f  %7.3f\n", expect_data[0], expect_data[1], expect_data[2]);
-
-  TEST_MAIN(attr, input_format, output_format, op_format, data_type, atol, input_data, weight_data, bias_data,
-            expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
-  std::string attr =
-    "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
-    "1x1";
-  //  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 2e-6f,
-  //  "testcases/mobilenetv2_fp32/"); TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16,
-  //  2e-2f, "testcases/mobilenetv2_fp32/");
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/");
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/");
-}
-
-TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
-  std::string attr =
-    "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_"
-    "1x1";
-  //  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/");
-  //  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/");
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
-}
-
-TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
+TEST_F(TestConvolutionOpenCL, test0) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
   float bias_data[] = {0.0f, 0.0f};
   float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f};
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
-  float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f};
-  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  float bias_data[] = {0.0f, 0.0f};
-  float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
-  TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
+TEST_F(TestConvolutionOpenCL, test0_no_bias) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f};
+  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  float bias_data[] = {0.0f, 0.0f};
-  float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f,  5.0f,  0.0f, 0.0f,
-                         9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
-            bias_data, expect_data);
-  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
-            bias_data, expect_data);
-  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
-            bias_data, expect_data);
-  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
-            bias_data, expect_data);
+  float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f};
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, nullptr, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, nullptr, expect_data);
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test1) {
+TEST_F(TestConvolutionOpenCL, test1) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   float bias_data[] = {0.5f, -0.5f};
   float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f};
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test2) {
+TEST_F(TestConvolutionOpenCL, test2) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
   float bias_data[] = {0.0f};
   float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f};
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test3) {
+TEST_F(TestConvolutionOpenCL, test3) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@@ -319,13 +213,11 @@ TEST_F(TestConvolutionOpenCL, simple_test3) {
                          9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
   float bias_data[] = {0.5f, -0.5f};
   float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test3_batch2) {
+TEST_F(TestConvolutionOpenCL, test3_batch2) {
   std::string attr =
     "inputNHWC_2x2x2x2_outputNHWC_2x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@@ -334,14 +226,8 @@ TEST_F(TestConvolutionOpenCL, simple_test3_batch2) {
   float bias_data[] = {0.5f, -0.5f};
   float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f,
                          168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
-            expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
-            bias_data, expect_data);
-  TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
-            bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
 
 }  // namespace mindspore