diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
index 6266d14d3c..2a688df136 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
@@ -91,3 +91,16 @@ __kernel void HSwish(__read_only image2d_t input, __write_only image2d_t output,
   result.w = temp.w * (temp.w <= -3 ? 0 : (temp.w >= 3 ? 1 : temp.w / 6 + 0.5f));
   WRITE_IMAGE(output, (int2)(X, Y), result);
 }
+
+__kernel void HSigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
+  const int col = get_global_id(0);  // w*c
+  const int row = get_global_id(1);  // n*h
+  if (col >= img_shape.x || row >= img_shape.y) return;
+  const FLT4 src = READ_IMAGE(input, smp_zero, (int2)(col, row));
+  FLT4 dst = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);  // per lane: 1 if v >= 3, 0 if v <= -3, else v / 6 + 0.5
+  dst.x = src.x >= 3 ? 1 : (src.x <= -3 ? 0 : src.x / 6 + 0.5f);
+  dst.y = src.y >= 3 ? 1 : (src.y <= -3 ? 0 : src.y / 6 + 0.5f);
+  dst.z = src.z >= 3 ? 1 : (src.z <= -3 ? 0 : src.z / 6 + 0.5f);
+  dst.w = src.w >= 3 ? 1 : (src.w <= -3 ? 0 : src.w / 6 + 0.5f);
+  WRITE_IMAGE(output, (int2)(col, row), dst);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
similarity index 92%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
index cbed06393a..cdf8e06205 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@@ -28,9 +28,9 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
     return;                                                                       \
   }
 
-__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
-                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
-                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+__kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                            __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                            const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
   const int BlockH = 1;
   const int BlockW = 1;
   const int BlockC = 1;
@@ -84,9 +84,9 @@ __kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image
   }
 }
 
-__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
-                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
-                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+__kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                            __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                            const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
   const int BlockH = 2;
   const int BlockW = 1;
   const int BlockC = 1;
@@ -161,9 +161,9 @@ __kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image
   }
 }
 
-__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
-                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
-                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+__kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                            __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                            const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
   const int BlockH = 2;
   const int BlockW = 1;
   const int BlockC = 2;
@@ -268,9 +268,9 @@ __kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image
   }
 }
 
-__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
-                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
-                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+__kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                            __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                            const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
   const int BlockH = 2;
   const int BlockW = 2;
   const int BlockC = 2;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
index cfd4c7d1ca..3ac91e2df0 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
@@ -1,7 +1,9 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
-                           int4 src_size, int4 dst_size, int indices_num, int axis) {
+
+__kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
+                     int4 src_size, int4 dst_size, int indices_num, int axis) {
   int X = get_global_id(0);  // w
   int Y = get_global_id(1);  // n*h
   int Z = get_global_id(2);  // c
@@ -40,48 +42,3 @@ __kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_
   }
   WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);
 }
-
-__kernel void gather_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
-                            int4 src_size, int4 dst_size, int indices_num, int axis) {
-  int X = get_global_id(0);  // w
-  int Y = get_global_id(1);  // n*h
-  int Z = get_global_id(2);  // c
-  if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) {
-    return;
-  }
-  FLT4 res_data = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int batch = Y / dst_size.y;
-  int height = Y % dst_size.y;
-  if (axis == 0) {
-    int index_y = indices[batch] * src_size.y * src_size.z + Z * src_size.y + height;
-    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
-  } else if (axis == 1) {
-    int index_y = batch * src_size.y * src_size.z + Z * src_size.y + indices[height];
-    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
-  } else if (axis == 2) {
-    int index_y = batch * src_size.y * src_size.z + Z * src_size.y + height;
-    res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X], index_y));
-  } else if (axis == 3) {
-    int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4};
-    FLT tmp[4];
-    FLT res_tmp[4];
-    for (int i = 0; i < indices_num; ++i) {
-      FLT4 rd_data = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-      int index_y = batch * src_size.y * src_size.z + offset[i] * src_size.y + height;
-      rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
-      if (i >= 1 && offset[i] != offset[i - 1]) {
-        rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
-      }
-      tmp[0] = rd_data.x;
-      tmp[1] = rd_data.y;
-      tmp[2] = rd_data.z;
-      tmp[3] = rd_data.w;
-      res_tmp[i] = tmp[indices[Z * 4 + i] % 4];
-    }
-    res_data.x = res_tmp[0];
-    res_data.y = res_tmp[1];
-    res_data.z = res_tmp[2];
-    res_data.w = res_tmp[3];
-  }
-  WRITE_IMAGE(dst_data, (int2)(X, (batch * dst_size.y * dst_size.z + Z * dst_size.y + height)), res_data);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl
deleted file mode 100644
index fa7486bb63..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void hswish(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 tensor_shape) {
-  int X = get_global_id(0);  // n*h n: default =1
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c
-  if (X >= tensor_shape.x * tensor_shape.y || Y >= tensor_shape.z || Z >= tensor_shape.w || tensor_shape.y == 0) {
-    return;
-  }
-  int n = X / tensor_shape.y;
-  int h = X % tensor_shape.y;
-  FLT4 temp = READ_IMAGE(src_data, smp_none, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h)));
-  FLT4 result = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  result.x = temp.x <= -3 ? 0 : (temp.x >= 3 ? 1 : temp.x / 6 + 0.5f);
-  result.y = temp.y <= -3 ? 0 : (temp.y >= 3 ? 1 : temp.y / 6 + 0.5f);
-  result.z = temp.z <= -3 ? 0 : (temp.z >= 3 ? 1 : temp.z / 6 + 0.5f);
-  result.w = temp.w <= -3 ? 0 : (temp.w >= 3 ? 1 : temp.w / 6 + 0.5f);
-  WRITE_IMAGE(dst_data, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h)), result);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/pad.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/pad.cl
index e979b0ba45..7eba6bf70d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/pad.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/pad.cl
@@ -2,35 +2,56 @@
 
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-#define Pad(dataformat, in_x, in_y, out_x, out_y)                                                              \
-  __kernel void Pad_##dataformat(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape, \
-                                 int4 output_shape, int2 pad, float constant_value) {                          \
-    int oh = get_global_id(0);                                                                                 \
-    int ow = get_global_id(1);                                                                                 \
-    int co_slice = get_global_id(2);                                                                           \
-    int OH = output_shape.y;                                                                                   \
-    int OW = output_shape.z;                                                                                   \
-    int CO_SLICES = output_shape.w;                                                                            \
-                                                                                                               \
-    if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) {                                                       \
-      return;                                                                                                  \
-    }                                                                                                          \
-                                                                                                               \
-    int IH = input_shape.y;                                                                                    \
-    int IW = input_shape.z;                                                                                    \
-    int CI_SLICES = input_shape.w;                                                                             \
-                                                                                                               \
-    int pad_top = pad.x;                                                                                       \
-    int pad_left = pad.y;                                                                                      \
-    int ih = oh - pad_top;                                                                                     \
-    int iw = ow - pad_left;                                                                                    \
-                                                                                                               \
-    FLT4 result = (FLT4)(constant_value);                                                                      \
-    if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {                                                            \
-      result = READ_IMAGE(input, smp_zero, (int2)(in_x, in_y));                                                \
-    }                                                                                                          \
-    WRITE_IMAGE(output, (int2)(out_x, out_y), result);                                                         \
+__kernel void Pad(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape, int4 output_shape,
+                  int2 io_slices, int4 pad_before, float constant_value) {  // constant pad on N/H/W/C
+  int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
+  int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
+  int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
+  int on_oh = get_global_id(0);  // fused index: on * OH + oh
+  int ow = get_global_id(1);
+  int co_slice = get_global_id(2);
+  int on = on_oh / OH;
+  int oh = on_oh % OH;
+  if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
+    return;
   }
 
-Pad(NHWC4, iw *CI_SLICES + co_slice, ih, ow *CO_SLICES + co_slice, oh);
-Pad(NC4HW4, iw, co_slice *IH + ih, ow, co_slice *OH + oh);
+  int in = on - pad_before.x;
+  int ih = oh - pad_before.y;
+  int iw = ow - pad_before.z;
+  int ci = co_slice * 4 - pad_before.w;  // input channel feeding lane 0 of this slice; may be < 0
+  if (in < 0 || in >= IN || ih < 0 || ih >= IH || iw < 0 || iw >= IW || ci + 3 < 0 || ci >= CI) {
+    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), (FLT4)(constant_value));
+    return;  // all four lanes are padding
+  }
+
+  int offset = ci % 4;  // lane of ci within its slice; % may be negative, fixed below
+  if (offset < 0) {
+    offset += 4;
+  }
+  FLT4 src0 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
+  if (offset == 0 && ci >= 0 && ci + 3 < CI) {
+    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), src0);
+    return;  // aligned slice fully inside the input: plain copy
+  }
+  FLT4 src1 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + (ci + 4) / 4, in * IH + ih));  // next slice
+  FLT4 src_f4;
+  if (offset == 0) {
+    src_f4 = (FLT4)(src0.x, src0.y, src0.z, src0.w);
+  } else if (offset == 1) {
+    src_f4 = (FLT4)(src0.y, src0.z, src0.w, src1.x);
+  } else if (offset == 2) {
+    src_f4 = (FLT4)(src0.z, src0.w, src1.x, src1.y);
+  } else {  // if (offset==3)
+    src_f4 = (FLT4)(src0.w, src1.x, src1.y, src1.z);
+  }
+  FLT src[4] = {src_f4.x, src_f4.y, src_f4.z, src_f4.w};
+  FLT out[4] = {constant_value, constant_value, constant_value, constant_value};
+  for (int i = 0; i < 4; ++i) {
+    if (ci + i >= 0 && ci + i < CI) {  // lanes outside [0, CI) keep constant_value
+      out[i] = src[i];
+    }
+  }
+  FLT4 out_f4 = (FLT4)(out[0], out[1], out[2], out[3]);
+  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out_f4);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index e3fa1cc1c1..49d267ae81 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -31,6 +31,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationType_HSIGMOID;
 using mindspore::schema::ActivationType_HSWISH;
 using mindspore::schema::ActivationType_LEAKY_RELU;
 using mindspore::schema::ActivationType_RELU;
@@ -44,9 +45,9 @@ namespace mindspore::kernel {
 
 std::string ActivationOpenCLKernel::GetActTypeString(int act_type) {
   static std::map<int, std::string> supported_act_type = {
-    {ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"},
-    {ActivationType_RELU6, "Relu6"},          {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"},
-    {ActivationType_HSWISH, "HSwish"}};
+    {ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"},        {ActivationType_SIGMOID, "Sigmoid"},
+    {ActivationType_RELU6, "Relu6"},          {ActivationType_TANH, "Tanh"},        {ActivationType_SWISH, "Swish"},
+    {ActivationType_HSWISH, "HSwish"},        {ActivationType_HSIGMOID, "HSigmoid"}};
   auto result_iter = supported_act_type.find(act_type);
   if (result_iter != supported_act_type.end()) {
     return result_iter->second;
@@ -63,13 +64,12 @@ int ActivationOpenCLKernel::CheckSpecs() {
 }
 
 int ActivationOpenCLKernel::Prepare() {
-  outShape = Image2DInfo(out_tensors_[0]);
+  outShape = GpuTensorInfo(out_tensors_[0]);
   std::string source = activation_source;
-  std::set<std::string> build_options;
   std::string program_name = "Activation";
   ocl_runtime_->LoadSource(program_name, source);
   std::string kernel_name = GetActTypeString(type_);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
   SetConstArgs();
   SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " init Done!";
@@ -101,7 +101,7 @@ int ActivationOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
index 6c0b4635fc..5abac64eba 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@@ -45,7 +45,7 @@ class ActivationOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_;
   int type_;
   float alpha_;
-  Image2DInfo outShape = Image2DInfo(nullptr);
+  GpuTensorInfo outShape = GpuTensorInfo(nullptr);
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
index ab53aeb196..8939f9955d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
@@ -76,7 +76,7 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
   auto in_shape = in_tensors_[0]->shape();
   auto in_shape_align = in_shape;
   in_shape_align[3] = UP_ROUND(in_shape[3], C4NUM);
-  im_in_ = Image2DInfo(in_tensors_[0]);
+  im_in_ = GpuTensorInfo(in_tensors_[0]);
   auto out_shape_align = in_shape_align;
   out_shape_align.at(param->axis_) = param->axis_ == 3 ? UP_ROUND(param->topk_, C4NUM) : param->topk_;
   int reduce_len = GetUpPow2(in_shape.at(param->axis_));
@@ -152,8 +152,7 @@ int ArgMinMaxOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
   ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
-
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h
index 80910d3c12..bf6aa1428e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h
@@ -43,7 +43,7 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_;
   void *buff_{nullptr};
   void *ids_{nullptr};
-  Image2DInfo im_in_{Image2DInfo(nullptr)};
+  GpuTensorInfo im_in_{GpuTensorInfo(nullptr)};
   cl_int4 src_size_;
   cl_int4 cus_size_;
   cl_int4 strides_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index 859aafa2b2..fff321f789 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -245,10 +245,9 @@ int ArithmeticOpenCLKernel::Prepare() {
     kernel_name_ += "_BUF";
   }
   std::string program_name = "Arithmetic";
-  std::set<std::string> build_options;
   std::string source = arithmetic_source;
   ocl_runtime_->LoadSource(program_name, source);
-  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options);
+  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_);
 #endif
   if (error_code != RET_OK) {
     return error_code;
@@ -270,7 +269,7 @@ int ArithmeticOpenCLKernel::Run() {
   auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
index 30d13d9a2f..2a711722d1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@@ -157,8 +157,7 @@ int ArithmeticSelfOpenCLKernel::Run() {
   int arg_cn = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
-
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
index a8720ad60f..f79c4b08c9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
@@ -87,11 +87,10 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
 
-  std::set<std::string> build_options;
   std::string source = batch_to_space_nd_source;
   std::string program_name = "batch_to_space_nd";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
   SetGlobalLocal();
@@ -102,9 +101,9 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
 
 int BatchToSpaceNDOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
 
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
index dd1d4d8b58..7231431e54 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@@ -91,8 +91,7 @@ int BatchNormOpenCLKernel::Run() {
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());   // mean
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c());   // variance
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
-
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
index 48ec0d9c94..a48949780b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
@@ -71,12 +71,11 @@ int BiasAddOpenCLKernel::Init() {
     return mindspore::lite::RET_ERROR;
   }
   InitWeights();
-  std::set<std::string> build_options;
   std::string source = biasadd_source;
   std::string program_name = "BiasAdd";
   std::string kernel_name = "BiasAdd";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 
   MS_LOG(DEBUG) << program_name << " Init Done!";
   return mindspore::lite::RET_OK;
@@ -95,7 +94,7 @@ int BiasAddOpenCLKernel::Run() {
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
   std::vector<size_t> local = {1, 1};
   std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
-  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
     return mindspore::lite::RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
index 9b5a0b2fa8..65f94248b9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
@@ -100,8 +100,7 @@ int CastOpenCLKernel::Run() {
   int arg_cn = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
-
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index 267fabfed9..9c8d3a12b5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -114,11 +114,10 @@ int ConcatOpenCLKernel::Prepare() {
   }
   kernel_name += "_NHWC4";
   MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
-  std::set<std::string> build_options;
   std::string source = concat_source;
   std::string program_name = "Concat";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   SetGlobalLocal();
   SetConstArgs();
@@ -146,7 +145,7 @@ int ConcatOpenCLKernel::Run() {
     MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
     return RET_ERROR;
   }
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
similarity index 63%
rename from mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
rename to mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 3909b57589..a4eca9d3cd 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -18,12 +18,12 @@
 #include <set>
 #include <algorithm>
 #include "src/common/utils.h"
-#include "src/runtime/kernel/opencl/kernel/convolution.h"
+#include "src/runtime/kernel/opencl/kernel/conv2d.h"
 #include "src/runtime/kernel/opencl/kernel/fullconnection.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
-#include "src/runtime/kernel/opencl/cl/convolution.cl.inc"
+#include "src/runtime/kernel/opencl/cl/conv2d.cl.inc"
 #include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -38,19 +38,43 @@ namespace mindspore::kernel {
 constexpr size_t CI_TILE = C4NUM;
 constexpr size_t CO_TILE = C4NUM;
 
-int ConvolutionOpenCLKernel::Init() {
+int Conv2DOpenCLKernel::CheckSpecs() {
+  if (in_tensors_.size() < 2 || in_tensors_.size() > 3) {  // counts are checked before tensors are indexed
+    MS_LOG(ERROR) << "Conv2D only supports 2 or 3 input Tensor but get " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (!(out_tensors_.size() == 1)) {
+    MS_LOG(ERROR) << "Conv2D only supports 1 output Tensor but get " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  if (in_tensors_[0]->shape().size() != 4) {  // input tensor rank
+    MS_LOG(ERROR) << "Conv2D only supports 4D input Tensor but get " << in_tensors_[0]->shape().size() << "D.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.at(1)->shape().size() != 4) {  // filter tensor rank; index 1 exists per the count check
+    MS_LOG(ERROR) << "Conv2D only supports 4D filter Tensor but get " << in_tensors_.at(1)->shape().size() << "D.";
+    return RET_ERROR;
+  }
+  if (out_tensors_[0]->shape().size() != 4) {  // output tensor rank
+    MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_[0]->shape().size() << "D.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int Conv2DOpenCLKernel::Prepare() {
   use_fp16_ = ocl_runtime_->GetFp16Enable();
   sizeof_FLT_ = use_fp16_ ? sizeof(float16_t) : sizeof(float);
 
-  auto input_tensor = in_tensors_[0];
-  auto output_tensor = out_tensors_[0];
-  batch_size_ = input_tensor->Batch();
-  CI_ = input_tensor->Channel();
-  IH_ = input_tensor->Height();
-  IW_ = input_tensor->Width();
-  CO_ = output_tensor->Channel();
-  OH_ = output_tensor->Height();
-  OW_ = output_tensor->Width();
+  auto input_shape = in_tensors_.front()->shape();
+  auto output_shape = out_tensors_.front()->shape();
+  batch_size_ = input_shape[0];
+  IH_ = input_shape[1];
+  IW_ = input_shape[2];
+  CI_ = input_shape[3];
+  OH_ = output_shape[1];
+  OW_ = output_shape[2];
+  CO_ = output_shape[3];
   CI_SLICES_ = UP_DIV(CI_, C4NUM);
   CO_SLICES_ = UP_DIV(CO_, C4NUM);
   KH_ = param_->kernel_h_;
@@ -63,26 +87,21 @@ int ConvolutionOpenCLKernel::Init() {
   TILES_XY_ = TILES_X_ * TILES_Y_;
   use_winograd_ = UseWinograd4x4To6x6();
 
-  if (!use_winograd_) {
-    SetBlockSize();
-    SetGlobalLocal();
-  }
-
   // build kernel
-  std::set<std::string> build_options;
   if (use_winograd_) {
     MS_LOG(DEBUG) << "use winograd";
-    std::string program_name = "Winograd";
+    std::string program_name = "winograd";
     ocl_runtime_->LoadSource(program_name, winograd_source);
-    ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
-    ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
-    ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
+    ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
+    ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution");
+    ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
   } else {
-    std::string program_name = "Convolution";
-    std::string kernel_name = "Convolution_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) +
-                              "C" + std::to_string(block_size_.C);
-    ocl_runtime_->LoadSource("Convolution", convolution_source);
-    ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name, build_options);
+    SetBlockSize();
+    std::string program_name = "conv2d";
+    std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" +
+                              std::to_string(block_size_.C);
+    ocl_runtime_->LoadSource(program_name, conv2d_source);
+    ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name);
   }
 
   // allocate winograd memory
@@ -102,12 +121,12 @@ int ConvolutionOpenCLKernel::Init() {
   }
 
   InitWeights();
-
-  MS_LOG(DEBUG) << "Convolution Init Done!";
+  SetGlobalLocal();
+  SetConstArgs();
   return RET_OK;
 }
 
-int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
+int Conv2DOpenCLKernel::GenerateWinogradFilter() {
   constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
                           0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
                           0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
@@ -159,7 +178,7 @@ int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
   return RET_OK;
 }
 
-int ConvolutionOpenCLKernel::InitWeight() {
+int Conv2DOpenCLKernel::InitFilter() {
   auto allocator = ocl_runtime_->GetAllocator();
 
   // allocate memory
@@ -175,7 +194,7 @@ int ConvolutionOpenCLKernel::InitWeight() {
 
   // rearrange weight
   if (use_winograd_) {
-    GenerateWinogradWeight();
+    GenerateWinogradFilter();
   } else {
     auto weight_tensor = in_tensors_[1];
     if (weight_tensor->data_type() == kNumberTypeFloat16) {
@@ -201,7 +220,7 @@ int ConvolutionOpenCLKernel::InitWeight() {
   return RET_OK;
 }
 
-int ConvolutionOpenCLKernel::InitBias() {
+int Conv2DOpenCLKernel::InitBias() {
   auto allocator = ocl_runtime_->GetAllocator();
 
   // align bias from C to C4
@@ -236,15 +255,15 @@ int ConvolutionOpenCLKernel::InitBias() {
   return RET_OK;
 }
 
-int ConvolutionOpenCLKernel::InitWeights() {
-  InitWeight();
+int Conv2DOpenCLKernel::InitWeights() {  // runs InitFilter(), then InitBias() only when has_bias_ is set
+  InitFilter();  // NOTE(review): the int result is discarded, so this function always returns RET_OK
   if (has_bias_) {
     InitBias();
   }
   return RET_OK;
 }
 
-void ConvolutionOpenCLKernel::SetBlockSize() {
+void Conv2DOpenCLKernel::SetBlockSize() {
   auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
   auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
   int block_size;
@@ -277,35 +296,44 @@ void ConvolutionOpenCLKernel::SetBlockSize() {
   }
 }
 
-void ConvolutionOpenCLKernel::SetGlobalLocal() {
-  size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H);
-  size_t global_w = UP_DIV(OW_, block_size_.W);
-  size_t global_c = UP_DIV(CO_SLICES_, block_size_.C);
-
-  constexpr int local_c_max = 16;
-  constexpr int local_hw_max = 256;
-  constexpr int OH_threshold = 100;
-  constexpr int OW_threshold = 100;
-  constexpr int OC_threshold = 64;
-  size_t local_c = GetMaxDivisor(global_c, local_c_max);
-  local_c = std::max<size_t>(local_c, 1);
-  size_t local_hw = local_hw_max / local_c;
-  size_t local_h;
-  size_t local_w;
-  if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) {  // c -> w -> h
-    local_w = std::min(global_w, local_hw);
-    local_h = std::min(local_hw / local_w, global_h);
-  } else {  // c -> h -> w
-    local_h = std::min(global_h, local_hw);
-    local_w = std::min(local_hw / local_h, global_w);
-  }
+void AlignWinogradGlobalLocal(const std::vector<int> &global, const std::vector<int> &local, cl::NDRange *global_range,
+                              cl::NDRange *local_range) {  // reads entries [0..2] of both vectors, no size check
+  *local_range = cl::NDRange(local[0], local[1], local[2]);  // local size is stored exactly as given
+  *global_range =  // each global dim goes through UP_ROUND with the matching local dim (presumably next multiple)
+    cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
+}
 
-  global_ = {global_h, global_w, global_c};
-  local_ = {local_h, local_w, local_c};
+void Conv2DOpenCLKernel::SetGlobalLocal() {  // fills the work sizes later consumed by Run()
+  if (use_winograd_) {  // three kernels, each with its own global/local pair
+    AlignWinogradGlobalLocal({TILES_XY_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
+    AlignWinogradGlobalLocal({UP_DIV(TILES_XY_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_conv_, &local_conv_);
+    AlignWinogradGlobalLocal({TILES_XY_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
+  } else {  // single conv kernel, work divided by block_size_
+    size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H);  // batch is folded into the H dimension
+    size_t global_w = UP_DIV(OW_, block_size_.W);
+    size_t global_c = UP_DIV(CO_SLICES_, block_size_.C);
+    constexpr int local_c_max = 16;  // upper bound handed to GetMaxDivisor for local_c
+    constexpr int local_hw_max = 256;  // local_h * local_w * local_c stays within this value
+    constexpr int OH_threshold = 100;
+    constexpr int OW_threshold = 100;
+    constexpr int OC_threshold = 64;
+    size_t local_c = GetMaxDivisor(global_c, local_c_max);
+    local_c = std::max<size_t>(local_c, 1);  // guards the division below against a zero divisor
+    size_t local_hw = local_hw_max / local_c;  // remaining budget shared by local_h and local_w
+    size_t local_h;
+    size_t local_w;
+    if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) {  // c -> w -> h
+      local_w = std::min(global_w, local_hw);
+      local_h = std::min(local_hw / local_w, global_h);
+    } else {  // c -> h -> w
+      local_h = std::min(global_h, local_hw);
+      local_w = std::min(local_hw / local_h, global_w);
+    }
+    AlignGlobalLocal({global_h, global_w, global_c}, {local_h, local_w, local_c});
+  }
 }
 
-int ConvolutionOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running!";
+void Conv2DOpenCLKernel::SetConstArgs() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   cl_int act_type = 0;
   if (param->act_type_ == ActType_Relu) {
@@ -318,37 +346,33 @@ int ConvolutionOpenCLKernel::Run() {
 
   int arg_cn;
   if (use_winograd_) {
-    arg_cn = 0;
+    arg_cn = 1;
     cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
-    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_);
     ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape);
-    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
+    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, _4x4to36_out_shape);
 
     arg_cn = 0;
     cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
     cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, conv_out_shape);
 
-    arg_cn = 0;
+    arg_cn = 2;
     cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, act_type);
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, act_type);
   } else {
-    arg_cn = 0;
+    arg_cn = 2;
     cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_};
     cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
     cl_int2 dilation = {param->dilation_h_, param->dilation_w_};
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
@@ -356,71 +380,86 @@ int ConvolutionOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, act_type);
+    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, act_type);
   }
+}
 
+int Conv2DOpenCLKernel::Run() {  // binds the per-run tensor buffers, then launches; always returns RET_OK
   if (use_winograd_) {
-    ocl_runtime_->RunKernel(kernel_4x4to36_, std::vector<size_t>({size_t(TILES_XY_), 6, size_t(CI_SLICES_)}),
-                            std::vector<size_t>({8, 6, 4}), nullptr);
-    ocl_runtime_->RunKernel(kernel_conv_,
-                            std::vector<size_t>({size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}),
-                            std::vector<size_t>({8, 6, 2}), nullptr);
-    ocl_runtime_->RunKernel(kernel_36to4x4_, std::vector<size_t>({size_t(TILES_XY_), 4, size_t(CO_SLICES_)}),
-                            std::vector<size_t>({32, 4, 2}), nullptr);
+    ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c());  // arg 0: network input
+    ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_);
+
+    ocl_runtime_->RunKernel(kernel_conv_, global_conv_, local_conv_);  // no per-run args are bound for this kernel
+
+    ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());  // arg 1: network output
+    ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_);
   } else {
-    ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr);
+    ocl_runtime_->SetKernelArg(kernel_conv_, 0, in_tensors_.front()->data_c());  // arg 0: input tensor data
+    ocl_runtime_->SetKernelArg(kernel_conv_, 1, out_tensors_.front()->data_c());  // arg 1: output tensor data
+    ocl_runtime_->RunKernel(kernel_conv_, global_range_, local_range_);
   }
-
   return RET_OK;
 }
 
+bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                      ConvParameter *param) {  // true when shapes and attributes allow the FullConnection kernel
+  auto input_shape = inputs.front()->shape();
+  auto output_shape = outputs.front()->shape();  // must come from outputs so OH/OW are really checked
+  // both tensors must be 4D with dims [1] and [2] equal to 1, i.e. IH=1 IW=1 OH=1 OW=1
+  bool hw_is_1 = input_shape.size() == 4 && input_shape[1] == 1 && input_shape[2] == 1 && output_shape.size() == 4 &&
+                 output_shape[1] == 1 && output_shape[2] == 1;
+  bool attr_valid = param->kernel_h_ == 1 && param->kernel_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1 &&
+                    param->pad_u_ == 0 && param->pad_d_ == 0 && param->pad_l_ == 0 && param->pad_r_ == 0 &&
+                    param->dilation_h_ == 1 && param->dilation_w_ == 1;  // 1x1 kernel, unit stride/dilation, no pad
+  return hw_is_1 && attr_valid;
+}
+
+OpParameter *CreateFcParam(const ConvParameter *conv_param) {  // returns a heap MatMulParameter, nullptr on failure
+  auto fc_param = static_cast<MatMulParameter *>(calloc(1, sizeof(MatMulParameter)));  // zero the unset fields
+  if (fc_param == nullptr) {
+    MS_LOG(ERROR) << "Create FullConnection kernel param failed.";
+    return nullptr;
+  }
+  fc_param->op_parameter_.type_ = PrimitiveType_FullConnection;
+  fc_param->a_transpose_ = false;
+  fc_param->b_transpose_ = true;
+  fc_param->act_type_ = conv_param->act_type_;  // the only value carried over from the conv parameter
+  return reinterpret_cast<OpParameter *>(fc_param);  // allocated with calloc, so release it with free()
+}
+
 kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                    const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                    const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                                    const mindspore::lite::PrimitiveC *primitive) {
-  kernel::LiteKernel *kernel;
-  bool is_hw1 = inputs[0]->shape().size() == 4 && inputs[0]->shape()[1] == 1 && inputs[0]->shape()[2] == 1 &&
-                outputs[0]->shape().size() == 4 && outputs[0]->shape()[1] == 1 && outputs[0]->shape()[2] == 1;
-  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
-  bool is_pad_stride_ok = conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1 && conv_param->stride_h_ == 1 &&
-                          conv_param->stride_w_ == 1 && conv_param->pad_u_ == 0 && conv_param->pad_d_ == 0 &&
-                          conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->dilation_h_ == 1 &&
-                          conv_param->dilation_w_ == 1;
-
+  kernel::OpenCLKernel *kernel;
   OpParameter *real_param;
-  if (is_hw1 && is_pad_stride_ok) {
-    auto fc_param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-    if (fc_param == nullptr) {
-      MS_LOG(ERROR) << "Create OpenCL FullConnection kernel param failed!";
-      return nullptr;
-    }
-    fc_param->op_parameter_.type_ = PrimitiveType_FullConnection;
-    fc_param->a_transpose_ = false;
-    fc_param->b_transpose_ = true;
-    fc_param->act_type_ = conv_param->act_type_;
-    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(fc_param), inputs, outputs);
-    real_param = reinterpret_cast<OpParameter *>(fc_param);
+  auto *conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  if (UseFcReplaceConv(inputs, outputs, conv_param)) {
+    auto *fc_param = CreateFcParam(conv_param);
+    kernel = new (std::nothrow) FullConnectionOpenCLKernel(fc_param, inputs, outputs);
+    real_param = fc_param;
     if (kernel == nullptr) {
-      MS_LOG(ERROR) << "Create OpenCL FullConnection kernel failed!";
+      MS_LOG(ERROR) << "Create FullConnection kernel failed.";
       free(fc_param);
       free(conv_param);
       return nullptr;
     } else {
       free(conv_param);
+      MS_LOG(INFO) << "use FullConnection to replace Convolution.";
     }
   } else {
-    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
+    kernel = new (std::nothrow) Conv2DOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
     real_param = reinterpret_cast<OpParameter *>(conv_param);
     if (kernel == nullptr) {
-      MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!";
+      MS_LOG(ERROR) << "Create Convolution kernel failed.";
       free(conv_param);
       return nullptr;
     }
   }
 
-  auto ret = kernel->Init();
+  int ret = kernel->CheckSpecs();
   if (ret != mindspore::lite::RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
+    MS_LOG(ERROR) << "Init Convolution kernel failed.";
     delete kernel;
     free(real_param);
     return nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
similarity index 76%
rename from mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
rename to mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
index a76386d05c..769bab3d92 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
 
 #include <vector>
 #include <string>
@@ -27,23 +27,27 @@
 
 namespace mindspore::kernel {
 
-class ConvolutionOpenCLKernel : public OpenCLKernel {
+class Conv2DOpenCLKernel : public OpenCLKernel {
  public:
-  ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                          const std::vector<lite::Tensor *> &outputs)
+  Conv2DOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                     const std::vector<lite::Tensor *> &outputs)
       : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
-  ~ConvolutionOpenCLKernel() override = default;
+  ~Conv2DOpenCLKernel() override = default;
 
-  int Init() override;
-  int Run() override;
-  int InitWeights() override;
+  int CheckSpecs() override;
+
+  int Prepare() override;
   void SetGlobalLocal() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+
+  int Run() override;
 
  private:
   void SetBlockSize();
-  int InitWeight();
+  int InitFilter();
   int InitBias();
-  int GenerateWinogradWeight();
+  int GenerateWinogradFilter();
 
   bool UseWinograd4x4To6x6() {
     const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
@@ -58,8 +62,9 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_4x4to36_;
   cl::Kernel kernel_conv_;
   cl::Kernel kernel_36to4x4_;
-  std::vector<size_t> global_;
-  std::vector<size_t> local_;
+  cl::NDRange global_4x4to36_, local_4x4to36_;
+  cl::NDRange global_conv_, local_conv_;
+  cl::NDRange global_36to4x4_, local_36to4x4_;
 
   bool use_fp16_{false};
   size_t sizeof_FLT_{4};
@@ -95,4 +100,4 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
 };
 }  // namespace mindspore::kernel
 
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index efc5e71ea4..8879fdea41 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -48,10 +48,9 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
   std::string source = conv2d_transpose_source;
-  std::set<std::string> build_options;
   std::string program_name = "conv2d_transpose";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   InitWeights();
   SetGlobalLocal();
@@ -194,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
   int arg_cnt = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 82289cabb0..fdc08ecb62 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -69,10 +69,9 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
   std::string program_name = "DepthwiseConv2d";
-  std::set<std::string> build_options;
   std::string source = depthwise_conv2d_source;
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   InitWeights();
   SetGlobalLocal();
@@ -193,7 +192,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index b2edf58f90..bb6d4432ec 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -71,16 +71,15 @@ int FullConnectionOpenCLKernel::CheckSpecs() {
 
 int FullConnectionOpenCLKernel::Prepare() {
   std::string kernel_name = "FullConnection_NHWC4";
-  inShape = Image2DInfo(in_tensors_[0]);
-  outShape = Image2DInfo(out_tensors_[0]);
+  inShape = GpuTensorInfo(in_tensors_[0]);
+  outShape = GpuTensorInfo(out_tensors_[0]);
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = fullconnection_source;
   std::string program_name = "FullConnection";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   InitWeights();
   SetConstArgs();
@@ -203,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() {
   int arg_count = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
index 1455ff70db..1b2b8556b8 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -48,8 +48,8 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   bool transposeB{true};
   float activation_min_{-FLT_MAX};
   float activation_max_{FLT_MAX};
-  Image2DInfo inShape = Image2DInfo(nullptr);
-  Image2DInfo outShape = Image2DInfo(nullptr);
+  GpuTensorInfo inShape = GpuTensorInfo(nullptr);
+  GpuTensorInfo outShape = GpuTensorInfo(nullptr);
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
index 0de2158999..3145e93f0e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@@ -30,47 +30,88 @@ using mindspore::schema::PrimitiveType_Gather;
 
 namespace mindspore::kernel {
 
-int GatherOpenCLKernel::CheckSpecs() { return RET_OK; }
+int GatherOpenCLKernel::CheckSpecs() {  // validates the tensors and stores the normalized axis in axis_
+  if (in_tensors_.size() != 2) {
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports 2 input Tensor but get " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1 output Tensor but get " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  if (in_tensors_.at(1)->category() == lite::Tensor::VAR) {  // indices are converted once in InitWeights()
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports indices Tensor is weight.";
+    return RET_ERROR;
+  }
+
+  int input_ndim = in_tensors_.front()->shape().size();
+  if (input_ndim < 1 || input_ndim > 4) {  // lower bound is 1: a 0-D input has no axis to gather along
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1-4D input Tensor but get " << input_ndim << "D.";
+    return RET_ERROR;
+  }
+  int indices_ndim = in_tensors_.at(1)->shape().size();
+  if (indices_ndim != 1) {
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1D indices Tensor but get " << indices_ndim << "D.";
+    return RET_ERROR;
+  }
+
+  TypeId data_type = in_tensors_.at(1)->data_type();  // must be one of the types InitWeights() can convert
+  if (data_type != kNumberTypeInt32 && data_type != kNumberTypeInt64 && data_type != kNumberTypeFloat32 &&
+      data_type != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "GatherOpenCLKernel only supports Int32/Int64/Float32/Float16 indices Tensor.";
+    return RET_ERROR;
+  }
+
+  auto *param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
+  axis_ = param->axis_;
+  if (axis_ < 0) {  // a negative axis counts from the last dimension
+    axis_ += input_ndim;
+  }
+  if (axis_ < 0 || axis_ >= input_ndim) {
+    MS_LOG(ERROR) << "axis is invalid: axis=" << axis_ << ".";
+    return RET_ERROR;
+  } else {
+    return RET_OK;
+  }
+}
 
 void GatherOpenCLKernel::SetConstArgs() {
-  auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
-  param->axis_ = (param->axis_ + in_tensors_[0]->shape().size()) % in_tensors_[0]->shape().size();
-  auto input_shape = in_tensors_[0]->shape();
-  auto output_shape = out_tensors_[0]->shape();
-  int indices_num = in_tensors_[1]->ElementsNum();
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
-  cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
-  cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
-                      (cl_int)out_tensors_[0]->Batch()};
+  auto input = GpuTensorInfo(in_tensors_.front());  // supplies the W/H/Slice/N values used below
+  auto output = GpuTensorInfo(out_tensors_.front());
+  int indices_num = in_tensors_.at(1)->ElementsNum();  // element count of the indices tensor
+  cl_int4 src_size = {static_cast<cl_int>(input.W), static_cast<cl_int>(input.H), static_cast<cl_int>(input.Slice),
+                      static_cast<cl_int>(input.N)};  // packed in the order W, H, Slice, N
+  cl_int4 dst_size = {static_cast<cl_int>(output.W), static_cast<cl_int>(output.H), static_cast<cl_int>(output.Slice),
+                      static_cast<cl_int>(output.N)};  // same W, H, Slice, N order as src_size
   int arg_cnt = 3;
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, param->axis_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_);  // last constant arg; axis_ is normalized in CheckSpecs()
 }
 
 void GatherOpenCLKernel::SetGlobalLocal() {
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  auto output = GpuTensorInfo(out_tensors_.front());  // the global size is derived from the output tensor only
   std::vector<size_t> local = {1, 1, 1};
-  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
-                                (size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
+  std::vector<size_t> global = {output.W, output.N * output.H, output.Slice};  // dims: W, N*H, Slice
   OpenCLKernel::AlignGlobalLocal(global, local);
 }
 
 int GatherOpenCLKernel::Prepare() {
-  std::string kernel_name = "gather_NHWC4";
+  std::string kernel_name = "gather";
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
-  std::string source = gather_source;
   std::string program_name = "gather";
-  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->LoadSource(program_name, gather_source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
-  InitWeights();
+  int ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -79,58 +120,42 @@ int GatherOpenCLKernel::Prepare() {
 
 int GatherOpenCLKernel::InitWeights() {
   auto indices_tensor = in_tensors_.at(1);
-  int indices_num = indices_tensor->ElementsNum();
-  bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
+  auto indices_num = indices_tensor->ElementsNum();  // number of indices to convert
   auto allocator = ocl_runtime_->GetAllocator();
-  if (!isIndicesInt32) {
-    indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
-    if (indices_data_ == nullptr) {
-      MS_LOG(ERROR) << "Memory allocation failed";
-      return RET_ERROR;
-    }
+  indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));  // one int32 each
+  if (indices_data_ == nullptr) {  // allocation now happens for every indices type, int32 included
+    MS_LOG(ERROR) << "Memory allocation failed";
+    return RET_ERROR;
   }
-  return RET_OK;
-}
 
-int GatherOpenCLKernel::UpdateWeights() {
-  auto indices_tensor = in_tensors_.at(1);
-  int indices_num = indices_tensor->ElementsNum();
-  bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
-  if (!isIndicesInt32) {
-    if (indices_tensor->data_type() == kNumberTypeInt64) {
-      for (int i = 0; i < indices_num; i++) {
-        indices_data_[i] = reinterpret_cast<int64_t *>(indices_tensor->data_c())[i];
-      }
-    } else if (indices_tensor->data_type() == kNumberTypeFloat32) {
-      for (int i = 0; i < indices_num; i++) {
-        indices_data_[i] = reinterpret_cast<float *>(indices_tensor->data_c())[i];
-      }
-    } else if (indices_tensor->data_type() == kNumberTypeFloat16) {
-      for (int i = 0; i < indices_num; i++) {
-        indices_data_[i] = reinterpret_cast<float16_t *>(indices_tensor->data_c())[i];
-      }
-    } else {
-      MS_LOG(ERROR) << "Unsupported data type: " << indices_tensor->data_type();
-      return RET_ERROR;
+  auto data_type = indices_tensor->data_type();  // selects which element type `data` is read as
+  auto data = indices_tensor->data_c();  // NOTE(review): not checked for nullptr before the loops below
+  if (data_type == kNumberTypeInt32) {
+    for (int i = 0; i < indices_num; i++) {
+      indices_data_[i] = reinterpret_cast<int32_t *>(data)[i];  // plain element-wise copy
+    }
+  } else if (data_type == kNumberTypeInt64) {
+    for (int i = 0; i < indices_num; i++) {
+      indices_data_[i] = reinterpret_cast<int64_t *>(data)[i];  // implicit int64 -> int32 narrowing
+    }
+  } else if (data_type == kNumberTypeFloat32) {
+    for (int i = 0; i < indices_num; i++) {
+      indices_data_[i] = reinterpret_cast<float *>(data)[i];  // implicit float -> int32 conversion
+    }
+  } else if (data_type == kNumberTypeFloat16) {  // no trailing else: CheckSpecs() rejects every other type
+    for (int i = 0; i < indices_num; i++) {
+      indices_data_[i] = reinterpret_cast<float16_t *>(data)[i];  // implicit float16 -> int32 conversion
     }
-  } else {
-    indices_data_ = reinterpret_cast<int32_t *>(indices_tensor->data_c());
   }
   return RET_OK;
 }
 
 int GatherOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-
-  if (UpdateWeights() != RET_OK) {
-    return RET_ERROR;
-  }
-
-  ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c());  // arg 0 is the output, arg 1 the input
+  ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c());  // args 3 and up are set in SetConstArgs()
   ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
-
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);  // result of the launch is not inspected
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
index b1a3f5a04f..4fb5c0151d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
@@ -45,6 +45,7 @@ class GatherOpenCLKernel : public OpenCLKernel {
  private:
   cl::Kernel kernel_;
   int32_t *indices_data_{nullptr};
+  int axis_ = {0};
 };
 }  // namespace mindspore::kernel
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc
deleted file mode 100644
index 099548aca1..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/opencl/kernel/hswish.h"
-#include <cstring>
-#include <string>
-#include <algorithm>
-#include <set>
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "src/runtime/kernel/opencl/cl/hswish.cl.inc"
-
-using mindspore::kernel::KERNEL_ARCH::kGPU;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_Activation;
-
-namespace mindspore::kernel {
-
-int HswishOpenCLKernel::Init() {
-  if (out_tensors_[0]->shape().size() > 4) {
-    MS_LOG(ERROR) << " only support dim <= 4";
-    return RET_ERROR;
-  }
-
-  std::string kernel_name = "hswish";
-  std::set<std::string> build_options;
-  std::string source = hswish_source;
-  std::string program_name = "hswish";
-  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
-  return RET_OK;
-}
-
-void HswishGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
-  const int max_divider = 8;
-  const int max_x = 2, max_y = 8;
-  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
-  int yz = max_size / x;
-  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
-  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
-
-  local->clear();
-  local->push_back(x);
-  local->push_back(y);
-  local->push_back(z);
-}
-
-int HswishOpenCLKernel::InferShapeTo4D() {
-  if (in_tensors_[0]->shape().size() <= 4) {
-    if (in_tensors_[0]->shape().size() == 1) {
-      N_ = in_tensors_[0]->shape()[0];
-    } else if (in_tensors_[0]->shape().size() == 2) {
-      N_ = in_tensors_[0]->shape()[0];
-      C_ = in_tensors_[0]->shape()[1];
-    } else if (in_tensors_[0]->shape().size() == 3) {
-      N_ = in_tensors_[0]->shape()[0];
-      W_ = in_tensors_[0]->shape()[1];
-      C_ = in_tensors_[0]->shape()[2];
-    } else {
-      N_ = in_tensors_[0]->shape()[0];
-      H_ = in_tensors_[0]->shape()[1];
-      W_ = in_tensors_[0]->shape()[2];
-      C_ = in_tensors_[0]->shape()[3];
-    }
-  } else {
-    MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size();
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int HswishOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running! ";
-  auto output_shape = out_tensors_[0]->shape();
-  InferShapeTo4D();
-  cl_int4 output_shape_ = {static_cast<cl_int>(N_), static_cast<cl_int>(H_), static_cast<cl_int>(W_),
-                           static_cast<cl_int>(UP_DIV(C_, C4NUM))};
-  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
-  std::vector<size_t> local = {1, 1, 1};
-  uint32_t OH = N_ * H_;
-  uint32_t OW = W_;
-  uint32_t OC = UP_DIV(C_, C4NUM);
-  std::vector<size_t> global = {OH, OW, OC};
-  HswishGetWorkGroup(global, &local, max_global[0]);
-  int arg_cn = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
-  return RET_OK;
-}
-
-kernel::LiteKernel *HswishOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                              const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) HswishOpenCLKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << " new HswishOpenCLKernel failed ";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << " Init kernel failed, name: hswish ";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 3100b45be0..bdee117db1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -59,11 +59,10 @@ int MatMulOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = matmul_source;
   std::string program_name = "MatMul";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   InitWeights();
   SetConstArgs();
@@ -159,7 +158,7 @@ int MatMulOpenCLKernel::Run() {
   int arg_count = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
index 236f3981c3..7dea6d5203 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
@@ -34,8 +34,8 @@ int OneHotOpenCLKernel::CheckSpecs() { return RET_OK; }
 int OneHotOpenCLKernel::Prepare() {
   std::string kernel_name = "OneHot";
   auto param = reinterpret_cast<OneHotParameter *>(op_parameter_);
-  in_shape_ = Image2DInfo(in_tensors_[0]);
-  out_shape_ = Image2DInfo(out_tensors_[0]);
+  in_shape_ = GpuTensorInfo(in_tensors_[0]);
+  out_shape_ = GpuTensorInfo(out_tensors_[0]);
   axis_ = out_shape_.AlignAxis(param->axis_);
   if (in_tensors_[0]->shape().size() == 1 && axis_ == 0) {
     kernel_name += "2DAxis0";
@@ -82,7 +82,7 @@ void OneHotOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, depth_);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, on_value_);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, off_value_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<int>(out_shape_.C));
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C));
 }
 void OneHotOpenCLKernel::SetGlobalLocal() {
   global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
@@ -90,10 +90,9 @@ void OneHotOpenCLKernel::SetGlobalLocal() {
 
 int OneHotOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  int arg_idx = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());   // arg 0: data of the first input tensor
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());  // arg 1: data of the first output tensor
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);  // NOTE(review): return value is not checked
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h
index c24e21ba96..865a713a9d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h
@@ -44,8 +44,8 @@ class OneHotOpenCLKernel : public OpenCLKernel {
   float on_value_{1.0f};
   float off_value_{0.0f};
   int axis_{0};
-  Image2DInfo in_shape_ = Image2DInfo(nullptr);
-  Image2DInfo out_shape_ = Image2DInfo(nullptr);
+  GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr);
+  GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr);
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
index 77d5dfd966..fb9eee7987 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
@@ -33,91 +33,81 @@ using mindspore::schema::PrimitiveType_Pad;
 
 namespace mindspore::kernel {
 
-int PadOpenCLKernel::Init() {
+int PadOpenCLKernel::CheckSpecs() {  // validates tensor counts, ranks (1D-4D, in == out) and the padding mode
   auto param = reinterpret_cast<PadParameter *>(op_parameter_);
-  std::set<std::string> build_options;
-
-  if (in_tensors_.empty()) {
-    MS_LOG(ERROR) << "PadOpenCLKernel in_tensors is empty";
+  if (in_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "Pad only support 1 input Tensor.";
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "Pad only support 1 output Tensor.";
+    return RET_ERROR;
+  }
+  auto in_ndim = in_tensors_.front()->shape().size();
+  if (in_ndim < 1 || in_ndim > 4) {
+    MS_LOG(ERROR) << "Pad only supports 1D-4D input Tensor but get " << in_ndim << "D.";
     return RET_ERROR;
   }
-  if (out_tensors_.empty()) {
-    MS_LOG(ERROR) << "PadOpenCLKernel out_tensors is empty";
+  auto out_ndim = out_tensors_.front()->shape().size();  // output rank; must come from out_tensors_, not in_tensors_
+  if (out_ndim < 1 || out_ndim > 4) {
+    MS_LOG(ERROR) << "Pad only supports 1D-4D output Tensor but get " << out_ndim << "D.";
     return RET_ERROR;
   }
-  if (param->paddings_[0] || param->paddings_[1] || param->paddings_[6] || param->paddings_[7]) {
-    MS_LOG(ERROR) << "PadOpenCLKernel not support pad at Batch/Channel axis";
+  if (in_ndim != out_ndim) {  // only meaningful because the two ranks are read from different tensors
+    MS_LOG(ERROR) << "Pad: input ndim != output ndim.";
     return RET_ERROR;
   }
   if (param->pad_mode_ != PaddingMode_CONSTANT) {
-    MS_LOG(ERROR) << "PadOpenCLKernel only support CONSTANT MODE";
+    MS_LOG(ERROR) << "Pad only support CONSTANT MODE.";
     return RET_ERROR;
   }
+  return RET_OK;
+}
 
-  auto input_tensor = in_tensors_[0];
-  auto output_tensor = out_tensors_[0];
-
-  CI_ = input_tensor->Channel();
-  IH_ = input_tensor->Height();
-  IW_ = input_tensor->Width();
-  CO_ = output_tensor->Channel();
-  OH_ = output_tensor->Height();
-  OW_ = output_tensor->Width();
-  CI_SLICES_ = UP_DIV(CI_, C4NUM);
-  CO_SLICES_ = UP_DIV(CO_, C4NUM);
-
+int PadOpenCLKernel::Prepare() {  // loads the Pad program, builds the kernel and binds the constant arguments
   const std::string source = pad_source;
   const std::string program_name = "Pad";
-  const std::string kernel_name = "Pad_NHWC4";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
-
-  MS_LOG(DEBUG) << "Pad Init Done!";
+  ocl_runtime_->BuildKernel(kernel_, program_name, "Pad");  // kernel entry point is "Pad" (was "Pad_NHWC4")
+  SetConstArgs();  // binds the shape/padding arguments once so Run() only has to bind the tensor data
   return RET_OK;
 }
 
-int PadOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running!";
-
-  auto param = reinterpret_cast<PadParameter *>(op_parameter_);
-  cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
-  cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
-  cl_int2 pad_top_left = {param->paddings_[2], param->paddings_[4]};
+void PadOpenCLKernel::SetConstArgs() {  // binds kernel args 2..6 (shapes, slices, pads, fill value) + launch ranges
+  auto input = GpuTensorInfo(in_tensors_.front());
+  auto output = GpuTensorInfo(out_tensors_.front());
+  cl_int4 input_shape = {static_cast<cl_int>(input.N), static_cast<cl_int>(input.H), static_cast<cl_int>(input.W),
+                         static_cast<cl_int>(input.C)};
+  cl_int4 output_shape = {static_cast<cl_int>(output.N), static_cast<cl_int>(output.H), static_cast<cl_int>(output.W),
+                          static_cast<cl_int>(output.C)};
+  cl_int2 io_slices = {static_cast<cl_int>(input.Slice), static_cast<cl_int>(output.Slice)};
+
+  int ndim = static_cast<int>(in_tensors_.front()->shape().size());  // explicit narrowing; index math below is int
+  std::vector<int> pad_before_ori;
+  pad_before_ori.reserve(ndim);
+  for (int i = 0; i < ndim; i++) {  // int index avoids the signed/unsigned comparison against ndim
+    pad_before_ori.push_back(param_->paddings_[MAX_PAD_SIZE - 2 * ndim + 2 * i]);  // every 2nd entry of the last 2*ndim
+  }
+  cl_int4 pad_before;
+  Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);  // presumably pads missing dims with 0 - confirm
 
-  int arg_cn = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  int arg_cn = 2;  // args 0 and 1 (input/output tensor data) are bound in Run()
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_top_left);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast<cl_float>(param->constant_value_));
-
-  std::vector<size_t> global = {static_cast<size_t>(OH_), static_cast<size_t>(OW_), static_cast<size_t>(CO_SLICES_)};
-  std::vector<size_t> local = {8, 4, 1};
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_));
 
-  return RET_OK;
+  AlignGlobalLocal({output.N * output.H, output.W, output.Slice}, {8, 4, 1});
 }
 
-kernel::LiteKernel *OpenCLPadKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                           const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                           const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                           const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) PadOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Create OpenCL Pad kernel failed!";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: Pad";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
+int PadOpenCLKernel::Run() {  // binds the per-run tensor data (args 0 and 1) and launches the kernel
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());   // arg 0: data of the first input tensor
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());  // arg 1: data of the first output tensor
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);  // NOTE(review): return value is not checked
+  return RET_OK;
 }
 
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pad, OpenCLPadKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pad, OpenCLPadKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pad, OpenCLKernelCreator<PadOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pad, OpenCLKernelCreator<PadOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h
index e274689d3b..604d08640f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h
@@ -30,22 +30,19 @@ class PadOpenCLKernel : public OpenCLKernel {
  public:
   PadOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                   const std::vector<lite::Tensor *> &outputs)
-      : OpenCLKernel(parameter, inputs, outputs) {}
+      : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<PadParameter *>(op_parameter_)) {}
   ~PadOpenCLKernel() override = default;
 
-  int Init() override;
+  int CheckSpecs() override;
+
+  int Prepare() override;
+  void SetConstArgs() override;
+
   int Run() override;
 
  private:
   cl::Kernel kernel_;
-  int CI_{};
-  int IH_{};
-  int IW_{};
-  int CO_{};
-  int OH_{};
-  int OW_{};
-  int CI_SLICES_{};
-  int CO_SLICES_{};
+  PadParameter *param_;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index 1075c68bfb..52d30465c9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -70,11 +70,10 @@ int PoolingOpenCLKernel::Prepare() {
 #else
   kernel_name += "_NHWC4";
   kernel_name += "_IMG";
-  std::set<std::string> build_options;
   std::string source = pooling2d_source;
   std::string program_name = "Pooling2d";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -112,7 +111,7 @@ int PoolingOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
index 7e0fe62409..80eb852b11 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
@@ -35,7 +35,6 @@ int PowerOpenCLKernel::Init() {
   use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
   auto param = reinterpret_cast<PowerParameter *>(this->op_parameter_);
   std::string kernel_name = "power";
-  std::set<std::string> build_options;
   std::string source = power_source;
   std::string program_name = "power";
   broadcast_ = param->broadcast_;
@@ -55,7 +54,7 @@ int PowerOpenCLKernel::Init() {
   scale_ = param->scale_;
   shift_ = param->shift_;
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
@@ -133,7 +132,7 @@ int PowerOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
   }
 
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global, local);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
index 58b5a1a674..2864267312 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@@ -103,12 +103,11 @@ int PReluOpenCLKernel::Init() {
   }
   enable_fp16_ = ocl_runtime_->GetFp16Enable();
 
-  std::set<std::string> build_options;
   std::string source = prelu_source;
   std::string program_name = "PRelu";
   std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector");
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 
   InitWeights();
   MS_LOG(DEBUG) << program_name << " init Done!";
@@ -133,7 +132,7 @@ int PReluOpenCLKernel::Run() {
 
   std::vector<size_t> local = {4, 4, 1};
   std::vector<size_t> global = {static_cast<size_t>(H_), static_cast<size_t>(W_), static_cast<size_t>(CO_SLICES_)};
-  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
     return mindspore::lite::RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
index 174bab76f8..a3861c77a2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
@@ -96,7 +96,7 @@ int ReduceOpenCLKernel::CheckSpecs() {
 }
 
 int ReduceOpenCLKernel::Prepare() {
-  outShape = Image2DInfo(out_tensors_[0]);
+  outShape = GpuTensorInfo(out_tensors_[0]);
   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
   if (reduce_param == nullptr) {
     return RET_NULL_PTR;
@@ -120,11 +120,10 @@ int ReduceOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = reduce_source;
   std::string program_name = "Reduce";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -165,7 +164,7 @@ int ReduceOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
index 8d38682365..f316b93b65 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
@@ -41,7 +41,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
   cl_float4 GenC4Mask();
   static std::string GetReduceTypeStr(int type);
   cl::Kernel kernel_;
-  Image2DInfo outShape = Image2DInfo(nullptr);
+  GpuTensorInfo outShape = GpuTensorInfo(nullptr);
   bool use_local_{false};
   bool wc_reduce_{false};
   static const size_t LOCAL_CACHE_THREAD{16};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index 5ef6b4162e..8d436e9820 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -43,8 +43,8 @@ int ReshapeOpenCLKernel::CheckSpecs() {
 }
 
 void ReshapeOpenCLKernel::SetConstArgs() {
-  auto in = Image2DInfo(in_tensors_.front());
-  auto out = Image2DInfo(out_tensors_.front());
+  auto in = GpuTensorInfo(in_tensors_.front());
+  auto out = GpuTensorInfo(out_tensors_.front());
   cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
   cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};
 
@@ -54,7 +54,7 @@ void ReshapeOpenCLKernel::SetConstArgs() {
 }
 
 void ReshapeOpenCLKernel::SetGlobalLocal() {
-  auto out = Image2DInfo(out_tensors_.front());
+  auto out = GpuTensorInfo(out_tensors_.front());
   std::vector<size_t> local = {};
   std::vector<size_t> global{out.width, out.height};
   OpenCLKernel::AlignGlobalLocal(global, local);
@@ -65,11 +65,10 @@ int ReshapeOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = reshape_source;
   std::string program_name = "reshape";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
   SetGlobalLocal();
@@ -82,7 +81,7 @@ int ReshapeOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
index b50738ce1d..252897696f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
@@ -63,11 +63,10 @@ int ResizeOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = resize_source;
   std::string program_name = "Resize";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -102,7 +101,7 @@ void ResizeOpenCLKernel::SetConstArgs() {
 
 void ResizeOpenCLKernel::SetGlobalLocal() {
   local_range_ = {};
-  auto out_shape = Image2DInfo(out_tensors_[0]);
+  auto out_shape = GpuTensorInfo(out_tensors_[0]);
   global_range_ = {out_shape.Slice, out_shape.W, out_shape.H};
 }
 
@@ -111,7 +110,7 @@ int ResizeOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index 15e1d32789..a5c6e0b340 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -49,7 +49,7 @@ ScaleOpenCLKernel::~ScaleOpenCLKernel() {
 
 void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
   local_size_ = {16, 16};
-  auto image2d_info = Image2DInfo(out_tensors_[0]);
+  auto image2d_info = GpuTensorInfo(out_tensors_[0]);
   global_size_ = {image2d_info.width, image2d_info.height};
 }
 
@@ -69,7 +69,7 @@ int ScaleOpenCLKernel::InitWeights() {
       offset_ptr_ = allocator->Malloc(in_tensors_[2]->ElementsNum(), img_size, in_tensors_[2]->data_c());
       return RET_OK;
     }
-    auto image2d_info = Image2DInfo(in_tensors_[1]);
+    auto image2d_info = GpuTensorInfo(in_tensors_[1]);
     int pack_weight_size = image2d_info.ElementsC4Num;
     int plane = image2d_info.H * image2d_info.W;
     int channel = image2d_info.C;
@@ -185,10 +185,9 @@ int ScaleOpenCLKernel::Init() {
     kernel_name += "_BUF";
   }
   std::string program_name = "Scale";
-  std::set<std::string> build_options;
   std::string source = scale_source;
   ocl_runtime_->LoadSource(program_name, source);
-  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   if (error_code != RET_OK) {
     return error_code;
@@ -244,7 +243,7 @@ int ScaleOpenCLKernel::Run() {
     }
   }
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
-  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
index ada387fcc1..ae7929ca4a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -67,7 +67,7 @@ int SoftmaxOpenCLKernel::CheckSpecs() {
 int SoftmaxOpenCLKernel::Prepare() {
   std::string kernel_name = "SoftMax";
 
-  out_shape = Image2DInfo(out_tensors_[0]);
+  out_shape = GpuTensorInfo(out_tensors_[0]);
   std::string source = softmax_source;
   if (out_shape.H == 1 && out_shape.W == 1 && axis_ == 3) {
     // support 4d tensor
@@ -81,10 +81,9 @@ int SoftmaxOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string program_name = "SoftMax";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -135,7 +134,7 @@ int SoftmaxOpenCLKernel::Run() {
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
   // run opengl kernel
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
index a68cf41105..9ba280b6be 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
@@ -51,7 +51,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
   std::vector<size_t> local_size_;
   std::vector<size_t> global_size_;
   int axis_{0};
-  Image2DInfo out_shape = Image2DInfo(nullptr);
+  GpuTensorInfo out_shape = GpuTensorInfo(nullptr);
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
index 9fffb80b43..9776508fc6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
@@ -90,11 +90,10 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
 
-  std::set<std::string> build_options;
   std::string source = space_to_batch_nd_source;
   std::string program_name = "space_to_batch_nd";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
   SetGlobalLocal();
@@ -106,9 +105,9 @@ int SpaceToBatchNDOpenCLKernel::Prepare() {
 int SpaceToBatchNDOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
 
-  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());   // arg 0: data of the first input tensor
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());  // arg 1: data of the first output tensor
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);  // NOTE(review): return value is not checked
 
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
index 77acb688f1..035755a66c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
@@ -35,8 +35,8 @@ int SpaceToDepthOpenCLKernel::CheckSpecs() { return RET_OK; }
 
 int SpaceToDepthOpenCLKernel::Prepare() {
   std::string kernel_name;
-  in_shape_ = Image2DInfo(in_tensors_[0]);
-  out_shape_ = Image2DInfo(out_tensors_[0]);
+  in_shape_ = GpuTensorInfo(in_tensors_[0]);
+  out_shape_ = GpuTensorInfo(out_tensors_[0]);
   if (in_shape_.C % C4NUM != 0) {
     kernel_name = "SpaceToDepth";
   } else {
@@ -45,11 +45,10 @@ int SpaceToDepthOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = space_to_depth_source;
   std::string program_name = "SpaceToDepth";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -78,7 +77,7 @@ int SpaceToDepthOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h
index 55b1d48792..b7df317841 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h
@@ -39,8 +39,8 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel {
 
  private:
   cl::Kernel kernel_;
-  Image2DInfo in_shape_ = Image2DInfo(nullptr);
-  Image2DInfo out_shape_ = Image2DInfo(nullptr);
+  GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr);
+  GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr);
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
index b4382e6958..a1d203250c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
@@ -96,7 +96,7 @@ int SparseToDenseOpenCLKernel::CheckSpecs() {
                   << out_tensors_[0]->shape().size();
     return RET_ERROR;
   }
-  if (out_tensors_[0]->shape().size() > 2 || in_tensors_.size() < 3) {
-    MS_LOG(ERROR) << " only support dim <= 2 and in_tensors_.size >= 3";
+  if (out_tensors_[0]->shape().size() > 3 || in_tensors_.size() < 3) {
+    MS_LOG(ERROR) << " only support dim <= 3 and in_tensors_.size >= 3";
     return RET_ERROR;
   }
@@ -121,7 +121,7 @@ int SparseToDenseOpenCLKernel::CheckSpecs() {
 
 void SparseToDenseOpenCLKernel::SetConstArgs() {
   auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  Image2DInfo img_info(out_tensors_[0]);
+  GpuTensorInfo img_info(out_tensors_[0]);
   size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
   stride_w = img_info.RowPitch() / dtype;
   cl_int2 input_shape = {n_ * h_, w_ * UP_DIV(c_, C4NUM)};
@@ -148,11 +148,10 @@ int SparseToDenseOpenCLKernel::Prepare() {
   inshapeindex1_dim = in_tensors_[0]->shape()[1];
   weight_scalar_ = in_tensors_[2]->IsScalar();
   std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
-  std::set<std::string> build_options;
   std::string source = sparse_to_dense_source;
   std::string program_name = "SparseToDense";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 
   if (in_tensors_.size() > 3) {
     auto input_tensor3 = in_tensors_[3];
@@ -210,7 +209,7 @@ int SparseToDenseOpenCLKernel::Run() {
   } else {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
   }
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
index 5ce9b34c5a..b5639a34f0 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@@ -73,11 +73,10 @@ int StackOpenCLKernel::Init() {
     return RET_ERROR;
   }
   MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
-  std::set<std::string> build_options;
   std::string source = stack_source;
   std::string program_name = "stack";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 
   return RET_OK;
 }
@@ -184,7 +183,7 @@ int StackOpenCLKernel::Run() {
   std::vector<size_t> global = {OH_, OW_, OC_};
   StackGetWorkGroup(global, &local, max_global[0]);
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global, local);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
index 2c0f1e3efb..a39edd1b6d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
@@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_StridedSlice;
 
 namespace mindspore::kernel {
 
-int SliceOpenCLKernel::CheckSpecs() {
+int StridedSliceOpenCLKernel::CheckSpecs() {
   const std::string kernel_name = op_parameter_->type_ == PrimitiveType_Slice ? "Slice" : "StridedSlice";
   if (in_tensors_.size() != 1) {
     MS_LOG(ERROR) << kernel_name + " only supports 1 input Tensor.";
@@ -54,25 +54,24 @@ int SliceOpenCLKernel::CheckSpecs() {
     return RET_ERROR;
   }
   if (InitConstArgs() != RET_OK) {
-    MS_LOG(ERROR) << "call SliceOpenCLKernel::InitConstArgs() failed";
+    MS_LOG(ERROR) << "call InitConstArgs() failed";
     return RET_ERROR;
   }
   return RET_OK;
 }
 
-int SliceOpenCLKernel::Prepare() {
-  std::set<std::string> build_options;
+int StridedSliceOpenCLKernel::Prepare() {
   std::string program_name = "strided_slice";
   ocl_runtime_->LoadSource(program_name, strided_slice_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice");
   SetConstArgs();
   SetGlobalLocal();
   return RET_OK;
 }
 
-int SliceOpenCLKernel::InitConstArgs() {
-  auto input_info = Image2DInfo(in_tensors_.front());
-  auto output_info = Image2DInfo(out_tensors_.front());
+int StridedSliceOpenCLKernel::InitConstArgs() {
+  auto input_info = GpuTensorInfo(in_tensors_.front());
+  auto output_info = GpuTensorInfo(out_tensors_.front());
   input_shape_ = {static_cast<cl_int>(input_info.N), static_cast<cl_int>(input_info.H),
                   static_cast<cl_int>(input_info.W), static_cast<cl_int>(input_info.C)};
   output_shape_ = {static_cast<cl_int>(output_info.N), static_cast<cl_int>(output_info.H),
@@ -81,19 +80,19 @@ int SliceOpenCLKernel::InitConstArgs() {
 
   if (op_parameter_->type_ == PrimitiveType_Slice) {
     auto param = reinterpret_cast<SliceParameter *>(op_parameter_);
-    Broadcast2GpuShape(param->begin_, begin_.s, param->param_length_, 0);
-    Broadcast2GpuShape(param->size_, size_.s, param->param_length_, -1);
+    Broadcast2GpuShape(begin_.s, param->begin_, param->param_length_, 0);
+    Broadcast2GpuShape(size_.s, param->size_, param->param_length_, -1);
     for (int i = 0; i < 4; ++i) {
       if (begin_.s[i] < 0) {
         begin_.s[i] += input_shape_.s[i];
       }
       if (begin_.s[i] < 0 || begin_.s[i] >= input_shape_.s[i]) {
-        MS_LOG(ERROR) << "Slice kernel only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i]
+        MS_LOG(ERROR) << "Slice only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i]
                       << " input_shape[i]=" << input_shape_.s[i];
         return RET_ERROR;
       }
       if (size_.s[i] < -1 || size_.s[i] == 0) {
-        MS_LOG(ERROR) << "Slice kernel only supports size=-1 or size>0 but size[i]=" << size_.s[i];
+        MS_LOG(ERROR) << "Slice only supports size=-1 or size>0 but size[i]=" << size_.s[i];
         return RET_ERROR;
       }
       if (size_.s[i] == -1 || begin_.s[i] + size_.s[i] > input_shape_.s[i]) {
@@ -103,9 +102,9 @@ int SliceOpenCLKernel::InitConstArgs() {
   } else {
     auto param = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
     cl_int4 end = input_shape_;
-    Broadcast2GpuShape(param->begins_, begin_.s, param->num_axes_, 0);
-    Broadcast2GpuShape(param->strides_, stride_.s, param->num_axes_, 1);
-    Broadcast2GpuShape(param->ends_, end.s, param->num_axes_);
+    Broadcast2GpuShape(begin_.s, param->begins_, param->num_axes_, 0);
+    Broadcast2GpuShape(stride_.s, param->strides_, param->num_axes_, 1);
+    Broadcast2GpuShape(end.s, param->ends_, param->num_axes_);
 
     for (int i = 0; i < 4; ++i) {
       // begin is negative
@@ -143,9 +142,9 @@ int SliceOpenCLKernel::InitConstArgs() {
   // check size
   std::vector<int> shape_not_1;
   std::vector<int> size_not_1;
-  std::copy_if(out_tensors_.front()->shape().begin(), out_tensors_.front()->shape().end(), shape_not_1.begin(),
-               [](int x) { return x > 1; });
-  std::copy_if(size_.s, size_.s + 4, size_not_1.begin(), [](int x) { return x > 1; });
+  auto output_shape = out_tensors_.front()->shape();
+  std::copy_if(output_shape.begin(), output_shape.end(), std::back_inserter(shape_not_1), [](int x) { return x > 1; });
+  std::copy_if(size_.s, size_.s + 4, std::back_inserter(size_not_1), [](int x) { return x > 1; });
   if (shape_not_1 != size_not_1) {
     MS_LOG(ERROR) << "Slice/StridedSlice kernel output shape infer error";
     return RET_ERROR;
@@ -153,7 +152,7 @@ int SliceOpenCLKernel::InitConstArgs() {
   return RET_OK;
 }
 
-void SliceOpenCLKernel::SetConstArgs() {
+void StridedSliceOpenCLKernel::SetConstArgs() {
   int arg_cn = 2;
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
@@ -163,8 +162,8 @@ void SliceOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
 }
 
-void SliceOpenCLKernel::SetGlobalLocal() {
-  auto output_info = Image2DInfo(out_tensors_.front());
+void StridedSliceOpenCLKernel::SetGlobalLocal() {
+  auto output_info = GpuTensorInfo(out_tensors_.front());
   std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
 
   const int max_divider = 8;
@@ -177,16 +176,16 @@ void SliceOpenCLKernel::SetGlobalLocal() {
   AlignGlobalLocal(global, local);
 }
 
-int SliceOpenCLKernel::Run() {
+int StridedSliceOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<StridedSliceOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<StridedSliceOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<StridedSliceOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<StridedSliceOpenCLKernel>);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h
index 7edbf437c1..58fb7d3d8b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h
@@ -23,21 +23,22 @@
 
 namespace mindspore::kernel {
 
-class SliceOpenCLKernel : public OpenCLKernel {
+class StridedSliceOpenCLKernel : public OpenCLKernel {
  public:
-  SliceOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                    const std::vector<lite::Tensor *> &outputs)
+  StridedSliceOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                           const std::vector<lite::Tensor *> &outputs)
       : OpenCLKernel(parameter, inputs, outputs) {}
 
-  ~SliceOpenCLKernel() override = default;
-
-  int Prepare() override;
-  int Run() override;
+  ~StridedSliceOpenCLKernel() override = default;
 
   int CheckSpecs() override;
+
+  int Prepare() override;
   void SetConstArgs() override;
   void SetGlobalLocal() override;
 
+  int Run() override;
+
  private:
   int InitConstArgs();
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 640a147cd6..3e469cf496 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -33,24 +33,27 @@ using mindspore::schema::PrimitiveType_ToFormat;
 namespace mindspore::kernel {
 
 int ToFormatOpenCLKernel::CheckSpecs() {
-  if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
-    MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
+  auto data_type = in_tensors_.front()->data_type();
+  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Unsupported data type " << data_type;
     return RET_ERROR;
   }
   auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
   out_mem_type_ = parameter->out_mem_type;
   return RET_OK;
 }
+
 void ToFormatOpenCLKernel::SetConstArgs() {
   cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
   cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1};
   ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
   ocl_runtime_->SetKernelArg(kernel_, 3, shape);
 }
+
 void ToFormatOpenCLKernel::SetGlobalLocal() {
   std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
   std::vector<size_t> local = {8, 16, 3};
-  size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
+  size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
   if (max_work_group_size < 384) {
     local[2] = 1;
   }
@@ -61,9 +64,9 @@ int ToFormatOpenCLKernel::Prepare() {
   std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}};
   std::string kernel_name;
   if (out_mem_type_ == MemType::IMG) {
-    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()];
+    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()];
   } else {
-    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()];
+    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()];
   }
   this->set_name(kernel_name);
 
@@ -71,52 +74,30 @@ int ToFormatOpenCLKernel::Prepare() {
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
   std::string program_name = "to_format";
-  std::set<std::string> build_options;
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
-  InitNHWC();
+  auto output = GpuTensorInfo(out_tensors_.front());
+  N_ = output.N;
+  H_ = output.H;
+  W_ = output.W;
+  C_ = output.C;
+
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-int ToFormatOpenCLKernel::InitNHWC() {
-  std::vector<int> out_shape = out_tensors_[0]->shape();
-  if (out_shape.size() == 1) {
-    N_ = out_shape[0];
-    H_ = 1;
-    W_ = 1;
-    C_ = 1;
-  } else if (out_shape.size() == 2) {
-    N_ = out_shape[0];
-    H_ = 1;
-    W_ = 1;
-    C_ = out_shape[1];
-  } else if (out_shape.size() == 3) {
-    N_ = out_shape[0];
-    H_ = 1;
-    W_ = out_shape[1];
-    C_ = out_shape[2];
-  } else if (out_shape.size() == 4) {
-    N_ = out_shape[0];
-    H_ = out_shape[1];
-    W_ = out_shape[2];
-    C_ = out_shape[3];
-  }
-  return RET_OK;
-}
-
 int ToFormatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
   auto dst_mem_type = out_mem_type_;
-  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
-  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type);
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
index b18b3c7a80..ee4801ea58 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
@@ -38,8 +38,6 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
 
  private:
-  int InitNHWC();
-
   cl::Kernel kernel_;
   size_t N_{1};
   size_t H_{1};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
index ce34c0b42a..d6b53ffc95 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@@ -68,11 +68,10 @@ int TransposeOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::set<std::string> build_options;
   std::string source = transpose_source;
   std::string program_name = "transpose";
   ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   SetConstArgs();
   SetGlobalLocal();
@@ -109,7 +108,7 @@ int TransposeOpenCLKernel::Run() {
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 4ebecd360f..beae7c39f9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -35,7 +35,7 @@ struct OpenCLToFormatParameter {
 };
 
 template <typename SrcT, typename DstT>
-void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num) {
+void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
   auto *N = dst;
   auto *H = dst + 1;
   auto *W = dst + 2;
@@ -60,37 +60,26 @@ void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num) {
 }
 
 template <typename SrcT, typename DstT>
-void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num, DstT default_value) {
+void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
   for (int i = 0; i < 4; ++i) {
     dst[i] = default_value;
   }
-  Broadcast2GpuShape(src, dst, src_num);
+  Broadcast2GpuShape(dst, src, src_num);
 }
 
-struct Image2DInfo {
-  explicit Image2DInfo(const lite::Tensor *tensor) {
+struct GpuTensorInfo {
+  explicit GpuTensorInfo(const lite::Tensor *tensor) {
     if (tensor == nullptr) {
       return;
     }
-    auto shape = tensor->shape();
-    OriDim = shape.size();
-    if (OriDim == 1) {
-      N = shape[0];
-    } else if (OriDim == 2) {
-      N = shape[0];
-      C = shape[1];
-    } else if (OriDim == 3) {
-      N = shape[0];
-      W = shape[1];
-      C = shape[2];
-    } else if (OriDim == 4) {
-      N = shape[0];
-      H = shape[1];
-      W = shape[2];
-      C = shape[3];
-    } else if (OriDim >= 5) {
-      MS_LOG(ERROR) << "GPU doesn't support Tensor with ndim>=" << OriDim;
-    }
+    auto shape_ori = tensor->shape();
+    NDim = shape_ori.size();
+    cl_int4 shape;
+    Broadcast2GpuShape(shape.s, shape_ori.data(), shape_ori.size(), 1);
+    N = shape.s[0];
+    H = shape.s[1];
+    W = shape.s[2];
+    C = shape.s[3];
     Slice = UP_DIV(C, C4NUM);
 
     FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
@@ -117,14 +106,14 @@ struct Image2DInfo {
   }
 
   int AlignAxis(int oriAxis) const {
-    if (OriDim == 0) {
+    if (NDim == 0) {
       return 0;
     }
-    int no_neg_axis = (oriAxis + OriDim) % OriDim;
+    int no_neg_axis = static_cast<int>((oriAxis + NDim) % NDim);
     if (no_neg_axis == 0) {
       return 0;
     }
-    return no_neg_axis + 4 - OriDim;
+    return static_cast<int>(no_neg_axis + 4 - NDim);
   }
 
   size_t N{1};
@@ -140,7 +129,7 @@ struct Image2DInfo {
   size_t ElementsC4Num{};
   size_t OriginSize{};
   size_t Image2DSize{};
-  size_t OriDim{};
+  size_t NDim{};
 };
 
 class OpenCLKernel : public LiteKernel {
@@ -205,7 +194,7 @@ class OpenCLKernel : public LiteKernel {
     if (idx >= out_tensors_.size()) {
       return RET_ERROR;
     }
-    auto img_info = Image2DInfo(out_tensors_[idx]);
+    auto img_info = GpuTensorInfo(out_tensors_[idx]);
     size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
     *img_size = {img_info.width, img_info.height, img_dtype};
     return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index bef9e33664..fa3e8fc5c5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -260,7 +260,7 @@ void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std:
     return;
   }
 
-  Image2DInfo img_info(tensor);
+  GpuTensorInfo img_info(tensor);
   auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
   std::vector<char> data(size);
   auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index 72a622c14f..e82f09072e 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -112,11 +112,11 @@ class OpenCLRuntime {
   std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program);
   bool LoadSource(const std::string &program_name, const std::string &source);
   int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
-                  const std::set<std::string> &build_options);
+                  const std::set<std::string> &build_options = {});
   int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
-                cl::CommandQueue *command_queue);  // !!!To be deleted
+                cl::CommandQueue *command_queue = nullptr);  // !!!To be deleted
   int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
-                cl::CommandQueue *command_queue);
+                cl::CommandQueue *command_queue = nullptr);
   bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
                            bool sync = false) const;
   bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
index 2f175d091e..bb53b44e43 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
@@ -13,110 +13,83 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/fp32/activation_fp32.h"
 
-namespace mindspore {
-class TestActivationOpenCL : public mindspore::CommonTest {
- public:
-  TestActivationOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseActivation(void *input_data0, const std::vector<int> &input_shape, void *output_data,
-                           const std::vector<int> &out_shape, bool enable_fp16, int act_type) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_Activation;
-  param->type_ = act_type;
-  auto tensor_x_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel = kernel::OpenCLKernelCreator<kernel::ActivationOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  inputs[0]->MallocData(allocator);
+class TestOpenCL_Activation : public CommonTest {};
 
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
+namespace {
+// PrimitiveType_Activation: src/ops/populate/activation_populate.cc
+OpParameter *CreateParameter(schema::ActivationType act_type) {
+  auto *param = test::CreateParameter<ActivationParameter>(schema::PrimitiveType_Activation);
+  param->type_ = act_type;
+  param->alpha_ = 0.0f;
+  param->min_val_ = 0.0f;
+  param->max_val_ = 0.0f;
+  return reinterpret_cast<OpParameter *>(param);
+}
+}  // namespace
 
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
+TEST_F(TestOpenCL_Activation, RELU) {
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {-1, 1, 2, 3, -1, -2, 3, -4, 5, -6, 7, 9};
+  float output_data[] = {0, 1, 2, 3, 0, 0, 3, 0, 5, 0, 7, 9};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(schema::ActivationType_RELU);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
+}
 
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
+TEST_F(TestOpenCL_Activation, RELU6) {
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {-1, 1, 2, 3, -1, -2, 3, -4, 5, -6, 7, 9};
+  float output_data[] = {0, 1, 2, 3, 0, 0, 3, 0, 5, 0, 6, 6};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(schema::ActivationType_RELU6);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  MS_LOG(INFO) << "TestActivation passed";
 }
 
-TEST_F(TestActivationOpenCL, ActivationReLUFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {-1.0f, 1.0f, 2.0f, 3.0f, -1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 9.0f};
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 3.0f, 0.0f, 5.0f, 0.0f, 7.0f, 9.0f};
-  RunTestCaseActivation(input_data.data(), in_shape0, output_data.data(), out_shape, false,
-                        schema::ActivationType_RELU);
+TEST_F(TestOpenCL_Activation, HSIGMOID) {
+  std::vector<int> input_shape = {2, 10, 1, 4};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {2.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5};
+  float output_data[] = {0.9166667, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
+                         1,         1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+                         0,         1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(schema::ActivationType_HSIGMOID);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
+             fp16_enable ? 1e-3 : 1e-4);
+  }
 }
 
-TEST_F(TestActivationOpenCL, ActivationReLUFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {-1.0f, 1.0f, 2.0f, 3.0f, -1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 9.0f};
-  std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 3.0f, 0.0f, 5.0f, 0.0f, 7.0f, 9.0f};
-  RunTestCaseActivation(input_data.data(), in_shape0, output_data.data(), out_shape, true, schema::ActivationType_RELU);
+TEST_F(TestOpenCL_Activation, HSWISH) {
+  std::vector<int> input_shape = {2, 10, 1, 4};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {2.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5,
+                        7.5, 6, -7.4, -3.5, 5.9, 6.5, -8, 7.4, 5.9, 6.5, -8, 7.4, 7.5, 6, -7.4, -3.5};
+  float output_data[] = {2.29166667, 6, 0, 0, 5.9, 6.5, 0, 7.4, 5.9, 6.5, 0, 7.4, 7.5, 6, 0, 0,
+                         7.5,        6, 0, 0, 5.9, 6.5, 0, 7.4, 5.9, 6.5, 0, 7.4, 7.5, 6, 0, 0,
+                         7.5,        6, 0, 0, 5.9, 6.5, 0, 7.4, 5.9, 6.5, 0, 7.4, 7.5, 6, 0, 0,
+                         7.5,        6, 0, 0, 5.9, 6.5, 0, 7.4, 5.9, 6.5, 0, 7.4, 7.5, 6, 0, 0,
+                         7.5,        6, 0, 0, 5.9, 6.5, 0, 7.4, 5.9, 6.5, 0, 7.4, 7.5, 6, 0, 0};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(schema::ActivationType_HSWISH);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
+             fp16_enable ? 1e-2 : 1e-4);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc
index b2cf515ed5..070eff9a08 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc
@@ -13,271 +13,176 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h"
-
-namespace mindspore {
-class TestArgMinMaxOpenCL : public mindspore::CommonTest {
- public:
-  TestArgMinMaxOpenCL() {}
-};
-template <typename T>
-void test_main_argminmax(void *input_data, void *correct_data, const std::vector<int> &input_shape,
-                         const std::vector<int> &output_shape, ArgMinMaxParameter *param, TypeId data_type,
-                         schema::Format format) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime_wrap = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = ocl_runtime_wrap.GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  auto tensor_a = lite::Tensor(TypeId(data_type), input_shape, format);
-  auto tensor_c = lite::Tensor(TypeId(data_type), output_shape, format);
-  std::vector<lite::Tensor *> inputs{&tensor_a};
-  std::vector<lite::Tensor *> outputs{&tensor_c};
-  size_t input_size = tensor_a.Size();
-
-  auto *pkernel =
-    new (std::nothrow) kernel::ArgMinMaxOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pkernel == nullptr) {
-    MS_LOG(INFO) << "new SpaceToBatchNDOpenCLKernel failed ";
-    return;
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/arg_min_max_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_ArgMinMax : public CommonTest {};  // fixture used by the TEST_F ArgMinMax cases below
+
+namespace {
+// Test-local factory. Reference populate code: PrimitiveType_ArgMin: src/ops/populate/argmin_populate.cc,
+// PrimitiveType_ArgMax: src/ops/populate/argmax_populate.cc
+OpParameter *CreateParameter(schema::PrimitiveType type, int axis, int topk, bool out_value, bool keep_dims = false,
+                             int axis_type = 0) {
+  auto *param = test::CreateParameter<ArgMinMaxParameter>(type);  // presumably provided by common.h - TODO confirm
+  param->axis_ = axis;
+  param->topk_ = topk;
+  param->axis_type_ = axis_type;  // the tests below all leave this at the default 0
+  param->out_value_ = out_value;  // the *value tests pass true, the *index tests pass false
+  param->keep_dims_ = keep_dims;  // the tests below all leave this at the default false
+  return reinterpret_cast<OpParameter *>(param);  // handed back as the generic OpParameter pointer
+}
+}  // namespace
+
+TEST_F(TestOpenCL_ArgMinMax, axis0topk2index) {  // ArgMax over axis 0, top 2, index output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 0;
+  int topk = 2;
+  bool out_value = false;  // expected data below holds indices, not values
+  std::vector<int> input_shape = {3, 2, 2, 2};
+  std::vector<int> output_shape = {2, 2, 2, 2};  // axis 0 goes from 3 to topk = 2
+  float input_data[] = {100, 2, 4, 50, 11, 12, 34, 35, 10, 20, 40, 5, 7, 80, 10, 11, 55, 25, 5, 15, 18, 8, 15, 16};
+  float output_data[] = {0, 2, 1, 0, 2, 1, 0, 0, 2, 1, 2, 2, 0, 0, 2, 2};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  pkernel->Init();
+}
 
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
+TEST_F(TestOpenCL_ArgMinMax, axis0topk2value) {  // ArgMax over axis 0, top 2, value output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 0;
+  int topk = 2;
+  bool out_value = true;  // expected data below holds the selected values, not their indices
+  std::vector<int> input_shape = {3, 2, 2, 2};
+  std::vector<int> output_shape = {2, 2, 2, 2};  // axis 0 goes from 3 to topk = 2
+  float input_data[] = {100, 2, 4, 50, 11, 12, 34, 35, 10, 20, 40, 5, 7, 80, 10, 11, 55, 25, 5, 15, 18, 8, 15, 16};
+  float output_data[] = {100, 25, 40, 50, 18, 80, 34, 35, 55, 20, 5, 15, 11, 12, 15, 16};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
+}
 
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{pkernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    delete pkernel;
-    MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
-    return;
+TEST_F(TestOpenCL_ArgMinMax, axis1topk2index) {  // ArgMax over axis 1, top 2, index output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 1;
+  int topk = 2;
+  bool out_value = false;  // expected data below holds indices, not values
+  std::vector<int> input_shape = {2, 3, 2, 3};
+  std::vector<int> output_shape = {2, 2, 2, 3};  // axis 1 goes from 3 to topk = 2
+  float input_data[] = {100, 2,  200, 4,  50, 6,  11, 12, 13, 34, 35, 36,  9,  6, 17, 10, 20, 30,
+                        10,  20, 30,  40, 5,  60, 7,  80, 90, 10, 11, 120, 18, 5, 16, 9,  22, 23};
+  float output_data[] = {0, 1, 0, 1, 0, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 0, 2, 1, 0, 0, 0, 1, 1, 0};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  sub_graph->Init();
+}
 
-  MS_LOG(INFO) << " init tensors ";
-  T *input_ptr = reinterpret_cast<T *>(inputs[0]->MutableData());
-  memcpy(input_ptr, input_data, input_size);
-  std::cout << "==================input data================" << std::endl;
-  for (auto i = 0; i < inputs[0]->ElementsNum(); ++i) {
-    std::cout << input_ptr[i] << ", ";
+TEST_F(TestOpenCL_ArgMinMax, axis1topk2value) {  // ArgMax over axis 1, top 2, value output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 1;
+  int topk = 2;
+  bool out_value = true;  // expected data below holds the selected values, not their indices
+  std::vector<int> input_shape = {2, 3, 2, 3};
+  std::vector<int> output_shape = {2, 2, 2, 3};  // axis 1 goes from 3 to topk = 2
+  float input_data[] = {100, 2,  200, 4,  50, 6,  11, 12, 13, 34, 35, 36,  9,  6, 17, 10, 20, 30,
+                        10,  20, 30,  40, 5,  60, 7,  80, 90, 10, 11, 120, 18, 5, 16, 9,  22, 23};
+  float output_data[] = {100, 12, 200, 34, 50, 36,  11, 6,  17, 10, 35, 30,
+                         18,  80, 90,  40, 22, 120, 10, 20, 30, 10, 11, 60};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::cout << std::endl;
-
-  sub_graph->Run();
+}
 
-  auto *output_data = reinterpret_cast<T *>(outputs[0]->MutableData());
-  std::cout << "==================output data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << output_data[i] << ", ";
+TEST_F(TestOpenCL_ArgMinMax, axis2topk1index) {  // ArgMax over axis 2, top 1, index output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 2;
+  int topk = 1;
+  bool out_value = false;  // expected data below holds indices, not values
+  std::vector<int> input_shape = {2, 3, 3, 3};
+  std::vector<int> output_shape = {2, 3, 1, 3};  // axis 2 goes from 3 to topk = 1
+  float input_data[] = {10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12,
+                        10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12,
+                        10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12};
+  float output_data[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::cout << std::endl;
-  std::cout << "==================correct data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << static_cast<T *>(correct_data)[i] << ", ";
-  }
-  std::cout << std::endl;
-  CommonTest::CompareOutputData<T>(output_data, static_cast<T *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
-  delete sub_graph;
 }
-TEST_F(TestArgMinMaxOpenCL, axis0topk2index) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  std::vector<float> in_data = {100, 2,  4,  50, 11, 12, 34, 35, 10, 20, 40, 5,
-                                7,   80, 10, 11, 55, 25, 5,  15, 18, 8,  15, 16};
-  std::vector<float> except_out = {0, 2, 1, 0, 2, 1, 0, 0, 2, 1, 2, 2, 0, 0, 2, 2};
-  param->dims_size_ = 4;
-  param->axis_ = 0;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = false;
-  std::vector<int> in_shape = {3, 2, 2, 2};
-  std::vector<int> out_shape = {2, 2, 2, 2};
 
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
-}
-TEST_F(TestArgMinMaxOpenCL, axis0topk2value) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
+TEST_F(TestOpenCL_ArgMinMax, axis2topk2value) {  // ArgMax over axis 2, top 2, value output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 2;
+  int topk = 2;
+  bool out_value = true;  // expected data below holds the selected values, not their indices
+  std::vector<int> input_shape = {2, 2, 3, 5};
+  std::vector<int> output_shape = {1, 2, 2, 5};  // NOTE(review): dim 0 is 1 but the input has 2 - confirm intended
+  float input_data[] = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
+                        20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
+                        30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
+  float output_data[] = {30, 45, 30, 50, 90, 20, 20, 25, 40, 50, 30, 45, 30, 50, 90, 20, 20, 25, 40, 50};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::vector<float> in_data = {100, 2,  4,  50, 11, 12, 34, 35, 10, 20, 40, 5,
-                                7,   80, 10, 11, 55, 25, 5,  15, 18, 8,  15, 16};
-  std::vector<float> except_out = {100, 25, 40, 50, 18, 80, 34, 35, 55, 20, 5, 15, 11, 12, 15, 16};
-  param->dims_size_ = 4;
-  param->axis_ = 0;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = true;
-  std::vector<int> in_shape = {3, 2, 2, 2};
-  std::vector<int> out_shape = {2, 2, 2, 2};
-
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
 }
-TEST_F(TestArgMinMaxOpenCL, axis1topk2index) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  std::vector<float> in_data = {100, 2,  200, 4,  50, 6,  11, 12, 13, 34, 35, 36,  9,  6, 17, 10, 20, 30,
-                                10,  20, 30,  40, 5,  60, 7,  80, 90, 10, 11, 120, 18, 5, 16, 9,  22, 23};
-  std::vector<float> except_out = {0, 1, 0, 1, 0, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 0, 2, 1, 0, 0, 0, 1, 1, 0};
-  param->dims_size_ = 4;
-  param->axis_ = 1;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = false;
-  std::vector<int> in_shape = {2, 3, 2, 3};
-  std::vector<int> out_shape = {2, 2, 2, 3};
 
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
-}
-TEST_F(TestArgMinMaxOpenCL, axis1topk2value) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
+TEST_F(TestOpenCL_ArgMinMax, axis2topk2index) {  // ArgMax over axis 2, top 2, index output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 2;
+  int topk = 2;
+  bool out_value = false;  // expected data below holds indices, not values
+  std::vector<int> input_shape = {2, 2, 3, 5};
+  std::vector<int> output_shape = {2, 2, 2, 5};  // axis 2 goes from 3 to topk = 2
+  float input_data[] = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
+                        20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
+                        30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
+  float output_data[] = {2, 2, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 0, 1,
+                         2, 2, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 0, 1};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::vector<float> in_data = {100, 2,  200, 4,  50, 6,  11, 12, 13, 34, 35, 36,  9,  6, 17, 10, 20, 30,
-                                10,  20, 30,  40, 5,  60, 7,  80, 90, 10, 11, 120, 18, 5, 16, 9,  22, 23};
-  std::vector<float> except_out = {100, 12, 200, 34, 50, 36,  11, 6,  17, 10, 35, 30,
-                                   18,  80, 90,  40, 22, 120, 10, 20, 30, 10, 11, 60};
-  param->dims_size_ = 4;
-  param->axis_ = 1;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = true;
-  std::vector<int> in_shape = {2, 3, 2, 3};
-  std::vector<int> out_shape = {2, 2, 2, 3};
-
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
 }
-TEST_F(TestArgMinMaxOpenCL, axis2topk1index) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  param->dims_size_ = 4;
-  param->axis_ = 2;
-  param->topk_ = 1;
-  param->get_max_ = true;
-  param->out_value_ = false;
-  std::vector<float> in_data = {10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12,
-                                10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12,
-                                10, 20, 30, 11, 15, 10, 5, 10, 12, 10, 20, 30, 11, 15, 10, 5, 10, 12};
-  std::vector<float> except_out = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
-  std::vector<int> in_shape = {2, 3, 3, 3};
-  std::vector<int> out_shape = {2, 3, 1, 3};
 
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
-}
-TEST_F(TestArgMinMaxOpenCL, axis2topk2value) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
+TEST_F(TestOpenCL_ArgMinMax, axis3topk2index) {  // ArgMax over axis 3, top 2, index output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 3;
+  int topk = 2;
+  bool out_value = false;  // expected data below holds indices, not values
+  std::vector<int> input_shape = {2, 2, 3, 5};
+  std::vector<int> output_shape = {2, 2, 3, 2};  // axis 3 goes from 5 to topk = 2
+  float input_data[] = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
+                        20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
+                        30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
+  float output_data[] = {4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::vector<float> in_data = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
-                                20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
-                                30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
-  std::vector<float> except_out = {30, 45, 30, 50, 90, 20, 20, 25, 40, 50, 30, 45, 30, 50, 90, 20, 20, 25, 40, 50,
-                                   30, 45, 30, 50, 90, 20, 20, 25, 40, 50, 30, 45, 30, 50, 90, 20, 20, 25, 40, 50};
-  param->dims_size_ = 4;
-  param->axis_ = 2;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = true;
-  std::vector<int> in_shape = {2, 2, 3, 5};
-  std::vector<int> out_shape = {1, 2, 2, 5};
-
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
 }
-TEST_F(TestArgMinMaxOpenCL, axis2topk2index) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  std::vector<float> in_data = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
-                                20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
-                                30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
-  std::vector<float> except_out = {2, 2, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 0, 1,
-                                   2, 2, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 0, 1};
-  param->dims_size_ = 4;
-  param->axis_ = 2;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = false;
-  std::vector<int> in_shape = {2, 2, 3, 5};
-  std::vector<int> out_shape = {2, 2, 2, 5};
 
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
-}
-TEST_F(TestArgMinMaxOpenCL, axis3topk2index) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
+TEST_F(TestOpenCL_ArgMinMax, axis3topk2value) {  // ArgMax over axis 3, top 2, value output
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 3;
+  int topk = 2;
+  bool out_value = true;  // expected data below holds the selected values, not their indices
+  std::vector<int> input_shape = {2, 2, 3, 5};
+  std::vector<int> output_shape = {2, 2, 3, 2};  // axis 3 goes from 5 to topk = 2
+  float input_data[] = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
+                        20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
+                        30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
+  float output_data[] = {90, 40, 50, 20, 50, 45, 90, 40, 50, 20, 50, 45,
+                         90, 40, 50, 20, 50, 45, 90, 40, 50, 20, 50, 45};
+  for (auto fp16_enable : {false}) {  // only the fp16-disabled configuration is run
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  std::vector<float> in_data = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
-                                20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
-                                30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
-  std::vector<float> except_out = {4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1, 4, 3, 4, 0, 3, 1};
-  param->dims_size_ = 4;
-  param->axis_ = 3;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = false;
-  std::vector<int> in_shape = {2, 2, 3, 5};
-  std::vector<int> out_shape = {2, 2, 3, 2};
-
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
 }
-TEST_F(TestArgMinMaxOpenCL, axis3topk2value) {
-  ArgMinMaxParameter *param = std::make_unique<ArgMinMaxParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  std::vector<float> in_data = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90,
-                                20, 11, 15, 1,  50, 30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50,
-                                30, 45, 25, 50, 30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50, 30};
-  std::vector<float> except_out = {90, 40, 50, 20, 50, 45, 90, 40, 50, 20, 50, 45,
-                                   90, 40, 50, 20, 50, 45, 90, 40, 50, 20, 50, 45};
-  param->dims_size_ = 4;
-  param->axis_ = 3;
-  param->topk_ = 2;
-  param->get_max_ = true;
-  param->out_value_ = true;
-  std::vector<int> in_shape = {2, 2, 3, 5};
-  std::vector<int> out_shape = {2, 2, 3, 2};
 
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_argminmax<float>(in_data.data(), except_out.data(), in_shape, out_shape, param, data_type, format);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
index 2191fdfc39..99aaf1f626 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
@@ -13,307 +13,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h"
-
-namespace mindspore {
-class TestArithmeticSelfOpenCLfp16 : public mindspore::CommonTest {
- public:
-  TestArithmeticSelfOpenCLfp16() {}
-};
-
-class TestArithmeticSelfOpenCLCI : public mindspore::CommonTest {
- public:
-  TestArithmeticSelfOpenCLCI() {}
-};
-
-template <typename T>
-void CompareOutputData1(T *input_data1, T *output_data, T *correct_data, int size, float err_bound) {
-  for (size_t i = 0; i < size; i++) {
-    T abs = fabs(output_data[i] - correct_data[i]);
-    ASSERT_LE(abs, err_bound);
-  }
-}
-
-TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, output_size;
-  std::string input1Ppath = "./test_data/in_arithmetic_selffp16.bin";
-  std::string correctOutputPath = "./test_data/out_arithmetic_selffp16.bin";
-  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto correctOutput =
-    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-
-  MS_LOG(INFO) << " init tensors ";
-
-  std::vector<int> shape = {1, 2, 2, 144};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-  if (input_tensor == nullptr || output_tensor == nullptr) {
-    MS_LOG(INFO) << " new input_tensor or output_tensor failed ";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{input_tensor};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-
-  MS_LOG(INFO) << " initialize param ";
-  auto param = reinterpret_cast<ArithmeticSelfParameter *>(malloc(sizeof(ArithmeticSelfParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_Sin;
-  auto *arithmeticself_kernel =
-    new (std::nothrow) kernel::ArithmeticSelfOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (arithmeticself_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ArithmeticSelfOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  arithmeticself_kernel->Init();
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor_ : inputs) {
-    input_tensor_->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete arithmeticself_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, input1_size);
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  CompareOutputData1(input_data1, output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/arithmetic_self_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_ArithmeticSelf : public CommonTest {};  // fixture used by the TEST_F ArithmeticSelf case below
+
+namespace {
+// PrimitiveType_Abs
+// PrimitiveType_Cos
+// PrimitiveType_Sin
+// PrimitiveType_Log
+// PrimitiveType_Neg
+// PrimitiveType_NegGrad
+// PrimitiveType_LogGrad
+// PrimitiveType_Sqrt
+// PrimitiveType_Square
+// PrimitiveType_Rsqrt
+// PrimitiveType_LogicalNot
+// PrimitiveType_Floor
+// PrimitiveType_Ceil
+// PrimitiveType_Round: src/ops/populate/arithmetic_self_populate.cc
+OpParameter *CreateParameter(schema::PrimitiveType type) {  // test-local factory; only Round is used below
+  auto *param = test::CreateParameter<ArithmeticSelfParameter>(type);  // presumably from common.h - TODO confirm
+  return reinterpret_cast<OpParameter *>(param);  // handed back as the generic OpParameter pointer
 }
+}  // namespace
 
-TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-  float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f,
-                         0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f};
-  float correctOutput[] = {1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f,
-                           1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
-
-  MS_LOG(INFO) << " init tensors ";
+TEST_F(TestOpenCL_ArithmeticSelf, Round) {  // Round on 16 values in [0, 1), with fp16 disabled and enabled
   std::vector<int> shape = {1, 1, 4, 4};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-  if (input_tensor == nullptr || output_tensor == nullptr) {
-    MS_LOG(INFO) << " new input_tensor or output_tensor failed ";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{input_tensor};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-
-  MS_LOG(INFO) << " initialize param ";
-  auto param = reinterpret_cast<ArithmeticSelfParameter *>(malloc(sizeof(ArithmeticSelfParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_Round;
-  auto *arithmeticself_kernel =
-    new (std::nothrow) kernel::ArithmeticSelfOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (arithmeticself_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ArithmeticSelfOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  arithmeticself_kernel->Init();
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor_ : inputs) {
-    input_tensor_->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete arithmeticself_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  CompareOutputData1(input_data1, output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
-}
-
-TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfdim2Fp16) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, output_size;
-  std::string input1Ppath = "./test_data/in_arithmetic_selffp16.bin";
-  std::string correctOutputPath = "./test_data/out_arithmetic_selffp16.bin";
-  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto correctOutput =
-    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-
-  MS_LOG(INFO) << " init tensors ";
-
-  std::vector<int> shape = {1, 512};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NC, tensor_type);
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NC, tensor_type);
-  if (input_tensor == nullptr || output_tensor == nullptr) {
-    MS_LOG(INFO) << " new input_tensor or output_tensor failed ";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{input_tensor};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  MS_LOG(INFO) << " initialize param ";
-  auto param = reinterpret_cast<ArithmeticSelfParameter *>(malloc(sizeof(ArithmeticSelfParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_Sin;
-  auto *arithmeticself_kernel =
-    new (std::nothrow) kernel::ArithmeticSelfOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (arithmeticself_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ArithmeticSelfOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  arithmeticself_kernel->Init();
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor_ : inputs) {
-    input_tensor_->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete arithmeticself_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, input1_size);
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  CompareOutputData1(input_data1, output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
+  float input_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37};
+  float output_data[] = {1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0};  // each input rounded to 0 or 1
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled, then enabled
+    auto *param = CreateParameter(schema::PrimitiveType_Round);  // a new parameter object per run
+    TestMain({{shape, input_data, VAR}}, {shape, output_data}, param, fp16_enable);  // same shape in and out
   }
-  delete sub_graph;
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
index 2f751d42cf..b2990d4e05 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -13,176 +13,117 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/arithmetic_common.h"
 
-namespace mindspore {
-class TestArithmeticOpenCL : public mindspore::CommonTest {
- public:
-  TestArithmeticOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseArithmetic(void *input_data0, const std::vector<int> &input_shape, void *input_data1,
-                           const std::vector<int> &weight_shape, void *output_data, const std::vector<int> &out_shape,
-                           bool enable_fp16, int op_type, int act_type = schema::ActivationType_NO_ACTIVATION) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<ArithmeticParameter *>(malloc(sizeof(ArithmeticParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  int input0_size = std::accumulate(input_shape.begin(), input_shape.end(), 1LL, std::multiplies<int>());
-  int input1_size = std::accumulate(weight_shape.begin(), weight_shape.end(), 1LL, std::multiplies<int>());
+class TestOpenCL_Arithmetic : public CommonTest {};  // fixture type named by the TEST_F cases in this file
+
+namespace {
+// Test helper: builds an ArithmeticParameter for `type`; broadcasting_ is set when the element counts differ.
+// Primitive types noted by the author (populate source given where the author listed one):
+// PrimitiveType_RealDiv
+// PrimitiveType_LogicalAnd, PrimitiveType_LogicalOr
+// PrimitiveType_Equal
+// PrimitiveType_Less
+// PrimitiveType_Greater
+// PrimitiveType_GreaterEqual
+// PrimitiveType_NotEqual
+// PrimitiveType_LessEqual
+// PrimitiveType_Maximum, PrimitiveType_Minimum
+// PrimitiveType_FloorDiv
+// PrimitiveType_FloorMod
+// PrimitiveType_SquaredDifference: src/ops/populate/arithmetic_populate.cc
+// PrimitiveType_Add:               src/ops/populate/add_populate.cc
+// PrimitiveType_Sub:               src/ops/populate/sub_populate.cc
+// PrimitiveType_Mul:               src/ops/populate/mul_populate.cc
+// PrimitiveType_Div:               src/ops/populate/div_populate.cc
+// PrimitiveType_Eltwise:           src/ops/populate/eltwise_populate.cc
+// PrimitiveType_BiasAdd:           src/ops/populate/bias_add_populate.cc
+OpParameter *CreateParameter(schema::PrimitiveType type, const std::vector<int> &input0_shape,
+                             const std::vector<int> &input1_shape,
+                             schema::ActivationType act_type = schema::ActivationType_NO_ACTIVATION) {
+  auto *param = test::CreateParameter<ArithmeticParameter>(type);  // shared test helper, defined outside this file
+  int input0_size = std::accumulate(input0_shape.begin(), input0_shape.end(), 1, std::multiplies<>());  // dim product
+  int input1_size = std::accumulate(input1_shape.begin(), input1_shape.end(), 1, std::multiplies<>());  // dim product
   if (input0_size != input1_size) {
     param->broadcasting_ = true;
   }
-  param->op_parameter_.type_ = op_type;
   param->activation_type_ = act_type;
-  auto tensor_x_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-
-  auto tensor_w_ptr = std::make_unique<lite::Tensor>(
-    TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape, schema::Format_NHWC,
-    input1_size != 1 ? lite::Tensor::Category::CONST_TENSOR : lite::Tensor::Category::CONST_SCALAR);
-  auto tensor_w = tensor_w_ptr.get();
-  if (tensor_w == nullptr) {
-    MS_LOG(ERROR) << "tensor_w create error.";
-    return;
-  }
-  tensor_w->set_data(input_data1);
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel = kernel::OpenCLKernelCreator<kernel::ArithmeticOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  inputs[0]->MallocData(allocator);
+  return reinterpret_cast<OpParameter *>(param);  // handed back through the generic OpParameter pointer type
+}
+}  // namespace
 
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
+TEST_F(TestOpenCL_Arithmetic, ElementwiseAdd) {
+  std::vector<int> lhs_shape{1, 2, 2, 3};
+  std::vector<int> rhs_shape{1, 2, 2, 3};
+  std::vector<int> out_shape{1, 2, 2, 3};
+  float lhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float rhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float expected[] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24};  // lhs[i] + rhs[i]
 
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
+  for (bool use_fp16 : {false, true}) {
+    // Recreated on every pass; presumably TestMain takes over the parameter -- confirm in common.h.
+    OpParameter *add_param = CreateParameter(schema::PrimitiveType_Add, lhs_shape, rhs_shape);
+    TestMain({{lhs_shape, lhs, VAR}, {rhs_shape, rhs, CONST_TENSOR}}, {out_shape, expected}, add_param, use_fp16);
   }
-
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-  MS_LOG(INFO) << "TestArithmetic passed";
 }
 
-TEST_F(TestArithmeticOpenCL, ArithmeticElementwiseAddFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> in_shape1 = {n, h, w, c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> output_data = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f};
-  RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape,
-                        false, schema::PrimitiveType_Add);
+TEST_F(TestOpenCL_Arithmetic, ScalarMul) {
+  // NOTE(review): old harness passed 1-element weights as CONST_SCALAR; this uses CONST_TENSOR -- confirm.
+  std::vector<int> lhs_shape{1, 2, 2, 3};
+  std::vector<int> rhs_shape{1};  // one-element shape for the single multiplier
+  std::vector<int> out_shape{1, 2, 2, 3};
+  float lhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float rhs[] = {2};
+  float expected[] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24};  // lhs[i] * 2
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *mul_param = CreateParameter(schema::PrimitiveType_Mul, lhs_shape, rhs_shape);
+    TestMain({{lhs_shape, lhs, VAR}, {rhs_shape, rhs, CONST_TENSOR}}, {out_shape, expected}, mul_param, use_fp16);
+  }
 }
 
-TEST_F(TestArithmeticOpenCL, ArithmeticScalarMulFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> in_shape1 = {1};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> weight_data = {2.0f};
-  std::vector<float> output_data = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f};
-  RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape,
-                        false, schema::PrimitiveType_Mul);
+TEST_F(TestOpenCL_Arithmetic, BroadcastSubReLU6) {
+  // rhs holds one value per entry of the last lhs axis; expected is lhs[i] - rhs[i % 3] limited to [0, 6].
+  std::vector<int> lhs_shape{1, 2, 2, 3};
+  std::vector<int> rhs_shape{3};
+  std::vector<int> out_shape{1, 2, 2, 3};
+  float lhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float rhs[] = {1, 2, 3};
+  float expected[] = {0, 0, 0, 3, 3, 3, 6, 6, 6, 6, 6, 6};
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *sub_param = CreateParameter(schema::PrimitiveType_Sub, lhs_shape, rhs_shape, schema::ActivationType_RELU6);
+    TestMain({{lhs_shape, lhs, VAR}, {rhs_shape, rhs, CONST_TENSOR}}, {out_shape, expected}, sub_param, use_fp16);
+  }
 }
 
-TEST_F(TestArithmeticOpenCL, ArithmeticBroadcastSubReLU6Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> in_shape1 = {c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> weight_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {0.0f, 0.0f, 0.0f, 3.0f, 3.0f, 3.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f};
-  RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape,
-                        false, schema::PrimitiveType_Sub, schema::ActivationType_RELU6);
+TEST_F(TestOpenCL_Arithmetic, BroadcastSub2) {
+  // The smaller operand is the first input this time: expected[i] == lhs[i % 3] - rhs[i].
+  std::vector<int> lhs_shape{1, 3};
+  std::vector<int> rhs_shape{1, 2, 2, 3};
+  std::vector<int> out_shape{1, 2, 2, 3};
+  float lhs[] = {1, 2, 3};
+  float rhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float expected[] = {0, 0, 0, -3, -3, -3, -6, -6, -6, -9, -9, -9};
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *sub_param = CreateParameter(schema::PrimitiveType_Sub, lhs_shape, rhs_shape);
+    TestMain({{lhs_shape, lhs, VAR}, {rhs_shape, rhs, CONST_TENSOR}}, {out_shape, expected}, sub_param, use_fp16);
+  }
 }
 
-TEST_F(TestArithmeticOpenCL, ArithmeticBroadcastSub2Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, c};
-  std::vector<int> in_shape1 = {n, h, w, c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> output_data = {0.0f, 0.0f, 0.0f, -3.0f, -3.0f, -3.0f, -6.0f, -6.0f, -6.0f, -9.0f, -9.0f, -9.0f};
-  RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape,
-                        false, schema::PrimitiveType_Sub);
+TEST_F(TestOpenCL_Arithmetic, ElementwiseDiv) {
+  // Same-shape operands: expected[i] == lhs[i] / rhs[i], including the non-integer quotients 2.5 and 5.5.
+  std::vector<int> lhs_shape{1, 2, 2, 3};
+  std::vector<int> rhs_shape{1, 2, 2, 3};
+  std::vector<int> out_shape{1, 2, 2, 3};
+  float lhs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float rhs[] = {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2};
+  float expected[] = {1, 2, 3, 2, 2.5, 3, 7, 8, 9, 5, 5.5, 6};
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *div_param = CreateParameter(schema::PrimitiveType_Div, lhs_shape, rhs_shape);
+    TestMain({{lhs_shape, lhs, VAR}, {rhs_shape, rhs, CONST_TENSOR}}, {out_shape, expected}, div_param, use_fp16);
+  }
 }
 
-TEST_F(TestArithmeticOpenCL, ArithmeticElementwiseDivFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> in_shape1 = {n, h, w, c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f};
-  std::vector<float16_t> output_data = {1.0f, 2.0f, 3.0f, 2.0f, 2.5, 3.0f, 7.0f, 8.0f, 9.0f, 5.0f, 5.5, 6.0f};
-  RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape,
-                        true, schema::PrimitiveType_Div);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
index c8643c5176..984d24854f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
@@ -13,100 +13,33 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h"
+#include <array>
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/batch_to_space.h"
 
-namespace mindspore {
-class TestBatchToSpaceNDOpenCL : public mindspore::CommonTest {
- public:
-  TestBatchToSpaceNDOpenCL() {}
-};
-template <typename T>
-void test_main_batch_to_space_nd(void *input_data, void *correct_data, const std::vector<int> &input_shape,
-                                 BatchToSpaceParameter *param, TypeId data_type, schema::Format format) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime_wrap = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = ocl_runtime_wrap.GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
+namespace mindspore::lite::opencl::test {
 
-  std::vector<int> output_shape = input_shape;
-  output_shape[0] = input_shape[0] / param->block_shape_[0] / param->block_shape_[1];
-  output_shape[1] = input_shape[1] * param->block_shape_[0] - param->crops_[0] - param->crops_[1];
-  output_shape[2] = input_shape[2] * param->block_shape_[1] - param->crops_[2] - param->crops_[3];
+class TestOpenCL_BatchToSpaceND : public CommonTest {};  // fixture type named by the TEST_F cases in this file
 
-  auto tensor_a = lite::Tensor(TypeId(data_type), input_shape, format);
-  auto tensor_c = lite::Tensor(TypeId(data_type), output_shape, format);
-  std::vector<lite::Tensor *> inputs{&tensor_a};
-  std::vector<lite::Tensor *> outputs{&tensor_c};
-  size_t input_size = tensor_a.Size();
-
-  auto *pkernel =
-    new (std::nothrow) kernel::BatchToSpaceNDOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pkernel == nullptr) {
-    MS_LOG(INFO) << "new BatchToSpaceNDOpenCLKernel failed ";
-    return;
-  }
-  pkernel->Init();
-
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{pkernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    delete pkernel;
-    MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
-    return;
-  }
-  sub_graph->Init();
-
-  MS_LOG(INFO) << " init tensors ";
-  T *input_ptr = reinterpret_cast<T *>(inputs[0]->MutableData());
-  memcpy(input_ptr, input_data, input_size);
-  std::cout << "==================input data================" << std::endl;
-  for (auto i = 0; i < inputs[0]->ElementsNum(); ++i) {
-    std::cout << input_ptr[i] << ", ";
-  }
-  std::cout << std::endl;
-
-  sub_graph->Run();
-
-  auto *output_data = reinterpret_cast<T *>(outputs[0]->MutableData());
-  std::cout << "==================output data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << output_data[i] << ", ";
-  }
-  std::cout << std::endl;
-  std::cout << "==================correct data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << static_cast<T *>(correct_data)[i] << ", ";
-  }
-  std::cout << std::endl;
-  CommonTest::CompareOutputData<T>(output_data, static_cast<T *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
-  delete sub_graph;
+namespace {
+// PrimitiveType_BatchToSpaceND (src/ops/populate/batch_to_space_populate.cc): fills the param, derives output_shape.
+OpParameter *CreateParameter(int block_shape[], int crops[], const std::vector<int> &input_shape,
+                             std::vector<int> *output_shape) {
+  auto *bts = test::CreateParameter<BatchToSpaceParameter>(schema::PrimitiveType_BatchToSpaceND);
+  memcpy(bts->block_shape_, block_shape, sizeof(bts->block_shape_));  // byte count comes from the destination field
+  memcpy(bts->crops_, crops, sizeof(bts->crops_));
+  const auto out_h = input_shape[1] * bts->block_shape_[0] - bts->crops_[0] - bts->crops_[1];
+  const auto out_w = input_shape[2] * bts->block_shape_[1] - bts->crops_[2] - bts->crops_[3];
+  *output_shape = {input_shape[0] / bts->block_shape_[0] / bts->block_shape_[1], out_h, out_w, input_shape[3]};
+  return reinterpret_cast<OpParameter *>(bts);
 }
-TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H2W2Pad2020) {
-  std::vector<int> input_shape{4, 5, 5, 4};
-  BatchToSpaceParameter *param = std::make_unique<BatchToSpaceParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  param->block_shape_[0] = 2;
-  param->block_shape_[1] = 2;
-  param->crops_[0] = 2;
-  param->crops_[1] = 0;
-  param->crops_[2] = 2;
-  param->crops_[3] = 0;
+}  // namespace
+
+TEST_F(TestOpenCL_BatchToSpaceND, H2W2Pad2020) {
+  std::vector<int> input_shape = {4, 5, 5, 4};
+  int block_shape[] = {2, 2};
+  int crops[] = {2, 0, 2, 0};
+  std::vector<int> output_shape;
   float input_data[] = {
     172, 47,  117, 192, 67,  251, 195, 103, 9,   211, 21,  242, 36,  87,  70,  216, 88,  140, 58,  193, 230, 39,  87,
     174, 88,  81,  165, 25,  77,  72,  9,   148, 115, 208, 243, 197, 254, 79,  175, 192, 82,  99,  216, 177, 243, 29,
@@ -125,9 +58,8 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H2W2Pad2020) {
     131, 46,  218, 178, 108, 3,   31,  9,   138, 27,  173, 199, 167, 61,  85,  97,  44,  34,  162, 88,  33,  133, 232,
     36,  0,   203, 34,  197, 126, 181, 254, 80,  190, 136, 189, 129, 209, 112, 35,  120, 91,  168, 116, 36,  176, 25,
     67,  103, 252, 35,  114, 30,  29,  241, 33,  146, 17,  221, 84,  253, 2,   69,  101, 140, 44,  117, 253, 66,  111,
-    91,  85,  167, 39,  203, 150, 158, 145, 198,
-  };
-  float correct_data[] = {
+    91,  85,  167, 39,  203, 150, 158, 145, 198};
+  float output_data[] = {
     88,  81,  165, 25,  85,  48,  49,  69,  77,  72,  9,   148, 169, 163, 192, 95,  115, 208, 243, 197, 197, 94,
     0,   113, 254, 79,  175, 192, 178, 36,  162, 48,  237, 139, 252, 86,  218, 178, 108, 3,   205, 121, 109, 75,
     31,  9,   138, 27,  184, 16,  152, 157, 173, 199, 167, 61,  149, 110, 25,  208, 85,  97,  44,  34,  243, 29,
@@ -140,22 +72,18 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H2W2Pad2020) {
     32,  182, 35,  102, 119, 11,  128, 38,  19,  174, 174, 82,  91,  128, 42,  115, 184, 188, 142, 99,  53,  140,
     232, 77,  30,  24,  230, 35,  214, 254, 101, 140, 44,  117, 189, 197, 215, 43,  253, 66,  111, 91,  32,  11,
     104, 212, 85,  167, 39,  203, 138, 182, 235, 165, 150, 158, 145, 198};
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_batch_to_space_nd<float>(input_data, correct_data, input_shape, param, data_type, format);
-}
-TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H3W3Pad0101) {
-  std::vector<int> input_shape{9, 3, 3, 4};
-  BatchToSpaceParameter *param = std::make_unique<BatchToSpaceParameter>().release();
-  if (param == nullptr) {
-    return;
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_shape, crops, input_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  param->block_shape_[0] = 3;
-  param->block_shape_[1] = 3;
-  param->crops_[0] = 0;
-  param->crops_[1] = 1;
-  param->crops_[2] = 0;
-  param->crops_[3] = 1;
+}
+
+TEST_F(TestOpenCL_BatchToSpaceND, H3W3Pad0101) {
+  std::vector<int> input_shape = {9, 3, 3, 4};
+  int block_shape[] = {3, 3};
+  int crops[] = {0, 1, 0, 1};
+  std::vector<int> output_shape;
   float input_data[] = {
     172, 47,  117, 192, 67,  251, 195, 103, 9,   211, 21,  242, 36,  87,  70,  216, 88,  140, 58,  193, 230, 39,
     87,  174, 88,  81,  165, 25,  77,  72,  9,   148, 115, 208, 243, 197, 254, 79,  175, 192, 82,  99,  216, 177,
@@ -172,7 +100,7 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H3W3Pad0101) {
     182, 207, 11,  166, 111, 93,  249, 129, 223, 118, 44,  216, 125, 24,  67,  210, 239, 3,   234, 204, 230, 35,
     214, 254, 189, 197, 215, 43,  32,  11,  104, 212, 138, 182, 235, 165, 125, 156, 111, 232, 2,   27,  211, 217,
     151, 53,  51,  174, 148, 181, 29,  67,  35,  39,  137, 73,  41,  151, 131, 46};
-  float correct_data[] = {
+  float output_data[] = {
     172, 47,  117, 192, 254, 79,  175, 192, 38,  232, 244, 17,  67,  251, 195, 103, 82,  99,  216, 177, 79,  132,
     105, 42,  9,   211, 21,  242, 243, 29,  147, 147, 127, 244, 131, 204, 205, 112, 231, 149, 43,  104, 11,  2,
     100, 180, 232, 78,  201, 127, 0,   138, 51,  80,  32,  182, 143, 148, 227, 186, 114, 43,  186, 127, 180, 67,
@@ -185,40 +113,11 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H3W3Pad0101) {
     203, 114, 142, 99,  53,  140, 77,  72,  9,   148, 183, 28,  34,  128, 121, 170, 84,  203, 115, 208, 243, 197,
     128, 164, 53,  133, 197, 94,  0,   113, 227, 148, 209, 50,  226, 107, 13,  112, 178, 36,  162, 48,  155, 14,
     41,  58,  40,  72,  19,  95,  93,  131, 98,  42,  193, 36,  10,  86};
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_batch_to_space_nd<float>(input_data, correct_data, input_shape, param, data_type, format);
-}
-TEST_F(TestBatchToSpaceNDOpenCL, NC4HW4H2W2Pad2222) {
-  std::vector<int> input_shape{4, 5, 5, 4};
-  BatchToSpaceParameter *param = std::make_unique<BatchToSpaceParameter>().release();
-  if (param == nullptr) {
-    return;
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_shape, crops, input_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  param->block_shape_[0] = 2;
-  param->block_shape_[1] = 2;
-  param->crops_[0] = 2;
-  param->crops_[1] = 2;
-  param->crops_[2] = 2;
-  param->crops_[3] = 2;
-  float input_data[] = {172, 47,  117, 192, 67,  251, 195, 103, 9,   211, 21,  242, 36,  87,  70,  216, 88,  140,
-                        58,  193, 230, 39,  87,  174, 88,  81,  165, 25,  77,  72,  9,   148, 115, 208, 243, 197,
-                        254, 79,  175, 192, 82,  99,  216, 177, 243, 29,  147, 147, 142, 167, 32,  193, 9,   185,
-                        127, 32,  31,  202, 244, 151, 163, 254, 203, 114, 183, 28,  34,  128, 128, 164, 53,  133,
-                        38,  232, 244, 17,  79,  132, 105, 42,  186, 31,  120, 1,   65,  231, 169, 57,  35,  102,
-                        119, 11,  174, 82,  91,  128, 142, 99,  53,  140, 121, 170, 84,  203, 68,  6,   196, 47,
-                        127, 244, 131, 204, 100, 180, 232, 78,  143, 148, 227, 186, 23,  207, 141, 117, 85,  48,
-                        49,  69,  169, 163, 192, 95,  197, 94,  0,   113, 178, 36,  162, 48,  93,  131, 98,  42};
-  float correct_data[] = {88,  81,  165, 25,  85,  48,  49,  69,  77,  72,  9,   148, 169, 163, 192, 95,  115, 208,
-                          243, 197, 197, 94,  0,   113, 237, 139, 252, 86,  218, 178, 108, 3,   205, 121, 109, 75,
-                          31,  9,   138, 27,  184, 16,  152, 157, 173, 199, 167, 61,  243, 29,  147, 147, 205, 112,
-                          231, 149, 142, 167, 32,  193, 201, 127, 0,   138, 9,   185, 127, 32,  114, 43,  186, 127,
-                          189, 83,  161, 104, 232, 36,  0,   203, 160, 228, 251, 251, 34,  197, 126, 181, 121, 70,
-                          213, 31,  254, 80,  190, 136, 183, 28,  34,  128, 123, 195, 82,  174, 128, 164, 53,  133,
-                          227, 148, 209, 50,  38,  232, 244, 17,  155, 14,  41,  58,  182, 207, 11,  166, 116, 36,
-                          176, 25,  111, 93,  249, 129, 67,  103, 252, 35,  223, 118, 44,  216, 114, 30,  29,  241};
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NCHW;
-  test_main_batch_to_space_nd<float>(input_data, correct_data, input_shape, param, data_type, format);
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
index 644380e851..8cfb153530 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
@@ -13,380 +13,50 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/batchnorm_parameter.h"
 
-namespace mindspore {
-class TestBatchnormOpenCLfp32 : public mindspore::CommonTest {
- public:
-  TestBatchnormOpenCLfp32() {}
-};
-class TestBatchnormOpenCLfp16 : public mindspore::CommonTest {
- public:
-  TestBatchnormOpenCLfp16() {}
-};
-class TestBatchnormOpenCLCI : public mindspore::CommonTest {
- public:
-  TestBatchnormOpenCLCI() {}
-};
+namespace mindspore::lite::opencl::test {
 
-TEST_F(TestBatchnormOpenCLCI, Batchnormfp32CI) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
+class TestOpenCL_BatchNorm : public CommonTest {};  // fixture named by the TEST_F cases in this file
 
-  MS_LOG(INFO) << " Read tensors from .bin ";
+namespace {
+// Builds the OpParameter for PrimitiveType_BatchNorm (see src/ops/populate/batch_norm_populate.cc).
+OpParameter *CreateParameter(float epsilon) {
+  auto *param = test::CreateParameter<BatchNormParameter>(schema::PrimitiveType_BatchNorm);
+  if (param != nullptr) param->epsilon_ = epsilon;  // helper returns nullptr on malloc failure; pass it through
+  return reinterpret_cast<OpParameter *>(param);
+}
+}  // namespace
+
+TEST_F(TestOpenCL_BatchNorm, test0) {
   std::vector<int> input_shape = {1, 2, 2, 8};
+  std::vector<int> weight_shape = {1, 1, 1, input_shape[3]};
   std::vector<int> output_shape = {1, 2, 2, 8};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-
   float input_data[] = {2.471454,   -2.1379554,  -0.0904604, 1.2928944,  -0.19215967, -0.8677279, -0.12759617,
                         1.2242758,  -0.06398406, -0.4041858, 0.20352598, -2.067808,   0.52113044, -1.567617,
                         0.28003863, 0.41367245,  0.77298605, 0.29908583, 1.4015813,   1.330567,   1.760135,
                         0.6320845,  0.6995399,   -1.208123,  -1.9738104, -1.3283046,  1.022744,   0.02741058,
                         0.84505165, -0.89434445, 1.983211,   -0.5485428};
-  float correct_data[] = {0.7505676,  0.515882,   0.26147857, 1.6026789,  0.47575232, 0.50116986, 0.33589783,
-                          1.4884706,  0.56019205, 0.7832671,  0.53893626, -0.5093127, 0.71395767, 0.18509413,
-                          0.33990562, 0.891792,   0.6230367,  0.89172685, 1.6696336,  1.6263539,  1.1277269,
-                          1.1784974,  0.34403008, -0.3019984, 0.4167911,  0.6407478,  1.3120956,  0.80740136,
-                          0.8221321,  0.4891496,  0.3566509,  0.18351318};
-  float mean_data[] = {0.3016613, -0.89284, 0.63434774, 0.145766, 0.73353934, -0.6744012, 0.7087985, -0.02967937};
-  float var_data[] = {2.5604038, 0.84985304, 0.36261332, 1.9083935, 0.4920925, 0.6476224, 0.6269014, 0.8567283};
   float scale_data[] = {0.1201471, 0.142174, 0.5683258, 0.86815494, 0.23426804, 0.3634345, 0.0077846, 0.6813278};
   float offset_data[] = {0.58764684, 0.70790595, 0.945536, 0.8817803, 0.78489226, 0.5884778, 0.3441211, 0.5654443};
-
-  MS_LOG(INFO) << " construct tensors ";
-  lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_mean =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_var =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_scale =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_offset =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr ||
-      tensor_offset == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    return;
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    delete tensor_data;
-    delete tensor_mean;
-    delete tensor_var;
-    delete tensor_scale;
-    delete tensor_offset;
-    return;
-  }
-  std::vector<lite::Tensor *> inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<BatchNormParameter *>(malloc(sizeof(BatchNormParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new BatchNormParameter failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->epsilon_ = pow(10, -5);
-  auto *batchnorm_kernel =
-    new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (batchnorm_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  batchnorm_kernel->Init();
-
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{batchnorm_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete batchnorm_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " init tensors ";
-  memcpy(inputs[0]->data_c(), input_data, sizeof(input_data));
-  memcpy(inputs[1]->data_c(), scale_data, sizeof(scale_data));
-  memcpy(inputs[2]->data_c(), offset_data, sizeof(offset_data));
-  memcpy(inputs[3]->data_c(), mean_data, sizeof(mean_data));
-  memcpy(inputs[4]->data_c(), var_data, sizeof(var_data));
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
-}
-
-TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
-  MS_LOG(INFO) << "begin test";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(INFO) << " Read tensors from .bin ";
-  std::vector<int> input_shape = {1, 256, 256, 48};
-  std::vector<int> output_shape = {1, 256, 256, 48};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-
-  // get the input from .bin
-  size_t input_size, output_size;
-  std::string input_path = "./test_data/batchnorm_in_datafp16.bin";
-  std::string mean_path = "./test_data/batchnorm_meanfp16.bin";
-  std::string var_path = "./test_data/batchnorm_varfp16.bin";
-  std::string offset_path = "./test_data/batchnorm_offsetfp16.bin";
-  std::string scale_path = "./test_data/batchnorm_scalefp16.bin";
-  std::string output_path = "./test_data/batchnorm_correctdatafp16.bin";
-  auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
-  auto correct_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
-  size_t mean_size, var_size, scale_size, offset_size;
-  auto mean_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(mean_path.c_str(), &mean_size));
-  auto var_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(var_path.c_str(), &var_size));
-  auto scale_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size));
-  auto offset_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size));
-
-  MS_LOG(INFO) << " construct tensors ";
-  lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_mean =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_var =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_scale =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_offset =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr ||
-      tensor_offset == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    return;
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    delete tensor_data;
-    delete tensor_mean;
-    delete tensor_var;
-    delete tensor_scale;
-    delete tensor_offset;
-    return;
-  }
-  std::vector<lite::Tensor *> inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<BatchNormParameter *>(malloc(sizeof(BatchNormParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new BatchNormParameter failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->epsilon_ = pow(10, -5);
-  auto *batchnorm_kernel =
-    new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (batchnorm_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  batchnorm_kernel->Init();
-
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{batchnorm_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete batchnorm_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " init tensors ";
-  memcpy(inputs[0]->data_c(), input_data, input_size);
-  memcpy(inputs[1]->data_c(), scale_data, scale_size);
-  memcpy(inputs[2]->data_c(), offset_data, offset_size);
-  memcpy(inputs[3]->data_c(), mean_data, mean_size);
-  memcpy(inputs[4]->data_c(), var_data, var_size);
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
+  float mean_data[] = {0.3016613, -0.89284, 0.63434774, 0.145766, 0.73353934, -0.6744012, 0.7087985, -0.02967937};
+  float var_data[] = {2.5604038, 0.84985304, 0.36261332, 1.9083935, 0.4920925, 0.6476224, 0.6269014, 0.8567283};
+  float output_data[] = {0.7505676,  0.515882,   0.26147857, 1.6026789,  0.47575232, 0.50116986, 0.33589783,
+                         1.4884706,  0.56019205, 0.7832671,  0.53893626, -0.5093127, 0.71395767, 0.18509413,
+                         0.33990562, 0.891792,   0.6230367,  0.89172685, 1.6696336,  1.6263539,  1.1277269,
+                         1.1784974,  0.34403008, -0.3019984, 0.4167911,  0.6407478,  1.3120956,  0.80740136,
+                         0.8221321,  0.4891496,  0.3566509,  0.18351318};
+
+  for (auto fp16_enable : {false, true}) {  // run the same case in fp32 and then in fp16
+    auto *param = CreateParameter(1e-5);  // NOTE(review): new parameter per run; presumably TestMain consumes it
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, scale_data, VAR},
+              {weight_shape, offset_data, VAR},
+              {weight_shape, mean_data, VAR},
+              {weight_shape, var_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-5);  // looser atol in fp16
   }
-  delete sub_graph;
 }
 
-TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(INFO) << " Read tensors from .bin ";
-  std::vector<int> input_shape = {1, 256, 256, 47};
-  std::vector<int> output_shape = {1, 256, 256, 47};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-
-  // get the input from .bin
-  size_t input_size, output_size;
-  std::string input_path = "./test_data/batchnorm_in_datafp32.bin";
-  std::string mean_path = "./test_data/batchnorm_meanfp32.bin";
-  std::string var_path = "./test_data/batchnorm_varfp32.bin";
-  std::string offset_path = "./test_data/batchnorm_offsetfp32.bin";
-  std::string scale_path = "./test_data/batchnorm_scalefp32.bin";
-  std::string output_path = "./test_data/batchnorm_out_datafp32.bin";
-  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
-  auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
-  size_t mean_size, var_size, scale_size, offset_size;
-  auto mean_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(mean_path.c_str(), &mean_size));
-  auto var_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(var_path.c_str(), &var_size));
-  auto scale_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size));
-  auto offset_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size));
-
-  MS_LOG(INFO) << " construct tensors ";
-  lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_mean =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_var =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_scale =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  lite::Tensor *tensor_offset =
-    new (std::nothrow) lite::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type);
-  if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr ||
-      tensor_offset == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    return;
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " init tensor failed ";
-    delete tensor_data;
-    delete tensor_mean;
-    delete tensor_var;
-    delete tensor_scale;
-    delete tensor_offset;
-    return;
-  }
-  std::vector<lite::Tensor *> inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var};
-  std::vector<lite::Tensor *> outputs{output_tensor};
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<BatchNormParameter *>(malloc(sizeof(BatchNormParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new BatchNormParameter failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->epsilon_ = pow(10, -5);
-  auto *batchnorm_kernel =
-    new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (batchnorm_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  batchnorm_kernel->Init();
-
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{batchnorm_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete batchnorm_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " init tensors ";
-  memcpy(inputs[0]->data_c(), input_data, input_size);
-  memcpy(inputs[1]->data_c(), scale_data, scale_size);
-  memcpy(inputs[2]->data_c(), offset_data, offset_size);
-  memcpy(inputs[3]->data_c(), mean_data, mean_size);
-  memcpy(inputs[4]->data_c(), var_data, var_size);
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
index 932e0660c1..45935df3a9 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
@@ -29,7 +29,10 @@ using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
 namespace mindspore {
-class TestBiasAddOpenCL : public mindspore::CommonTest {};
+
+// PrimitiveType_BiasAdd: src/ops/populate/bias_add_populate.cc
+
+class TestBiasAddOpenCL : public CommonTest {};
 
 void LoadDataBiasAdd(void *dst, size_t dst_size, const std::string &file_path) {
   if (file_path.empty()) {
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
index 2f30341eb0..556d99e17f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
@@ -22,8 +22,10 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h"
 
-namespace mindspore {
-class TestCastSelfOpenCL : public mindspore::CommonTest {
+// PrimitiveType_Cast: src/ops/populate/cast_populate.cc
+
+namespace mindspore::lite::opencl::test {
+class TestCastSelfOpenCL : public CommonTest {
  public:
   TestCastSelfOpenCL() {}
 };
@@ -208,4 +210,4 @@ TEST_F(TestCastSelfOpenCL, Castfp16tofp32) {
   }
   delete sub_graph;
 }
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
new file mode 100644
index 0000000000..a2f14be286
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
@@ -0,0 +1,166 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <set>
+#include <algorithm>
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "nnacl/conv_parameter.h"
+
+using mindspore::kernel::LiteKernel;
+using mindspore::kernel::SubGraphOpenCLKernel;
+using mindspore::lite::KernelRegistry;
+using mindspore::schema::Format::Format_NHWC;
+
+namespace mindspore::lite::opencl::test {
+
+// Runs a single OpenCL kernel through the same steps the benchmark takes and checks its output.
+// input_infos: (shape, data, category, dtype) per input; non-VAR entries are weights, VAR entries are graph inputs.
+// output_info: (shape, expected float data). atol/rtol/print_data are forwarded to CompareOutput<float>.
+// Weight buffers in input_infos are overwritten after Prepare() and restored before returning.
+void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info,
+              OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) {
+  ASSERT_TRUE(op_parameter != nullptr);  // fail cleanly rather than dereference a null parameter below
+  auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);
+  static std::set<schema::PrimitiveType> packed_op = {
+    schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, schema::PrimitiveType_DepthwiseConv2D,
+    schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_MatMul};
+
+  // simulating benchmark: session::LiteSession::CreateSession() -> session->Init()
+  MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
+  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
+  auto ocl_runtime = runtime_wrapper.GetInstance();
+  ocl_runtime->SetFp16Enable(fp16_enable);
+  EXPECT_TRUE(ocl_runtime->Init() == RET_OK);
+
+  // simulating benchmark:  session_->CompileGraph() -> ConvertTensors()
+  MS_LOG(DEBUG) << "create Tensors & init weight data";
+  std::vector<Tensor> tensors;
+  // firstly, create all Tensors
+  tensors.reserve(input_infos.size());  // reserve up front: pointers into `tensors` are kept, so it must not regrow
+  for (const auto &input_info : input_infos) {
+    auto &shape = std::get<0>(input_info);
+    auto category = std::get<2>(input_info);
+    auto data_type = std::get<3>(input_info);
+    tensors.emplace_back(data_type, shape, Format_NHWC, category);
+  }
+  // secondly, init weight Tensor's data
+  std::vector<Tensor *> kernel_inputs;
+  std::vector<Tensor *> subgraph_inputs;
+  std::map<Tensor *, float *> subgraph_inputs_data;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto *tensor = &tensors[i];
+    auto *input_data = std::get<1>(input_infos[i]);
+    kernel_inputs.push_back(tensor);
+    if (tensor->category() != VAR) {  // tensor is weight
+      // simulating src/lite_session.cc:WeightTensorNeedCopy()
+      if (packed_op.count(primitive_type)) {
+        tensor->set_data(input_data);
+      } else {
+        memcpy(tensor->MutableData(), input_data, tensor->Size());
+      }
+    } else {
+      EXPECT_TRUE(tensor->data_type() == kNumberTypeFloat32);
+      subgraph_inputs.push_back(tensor);
+      subgraph_inputs_data[tensor] = reinterpret_cast<float *>(input_data);
+    }
+  }
+
+  const std::vector<int> &output_shape = std::get<0>(output_info);
+  float *expect_data = std::get<1>(output_info);
+  auto output = Tensor(kNumberTypeFloat32, output_shape, Format_NHWC, VAR);
+
+  // simulating benchmark:  session_->CompileGraph() -> scheduler.Schedule() -> BuildKernels()
+  MS_LOG(DEBUG) << "create OpenCLKernel";
+  kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  if (creator == nullptr) {
+    std::cerr << "can't get registry function for: " << schema::EnumNamePrimitiveType(primitive_type)
+              << ". Maybe you forget setting op_parameter_.type_ for OpParameter." << std::endl;
+    free(op_parameter);
+    FAIL();
+  }
+  auto *kernel = creator(kernel_inputs, {&output}, op_parameter, nullptr, key, nullptr);
+  if (kernel == nullptr) {
+    std::cerr << "call registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
+    free(op_parameter);
+    FAIL();
+  }
+  kernel->set_name(schema::EnumNamesPrimitiveType()[primitive_type]);
+
+  // simulating benchmark:  session_->CompileGraph() -> scheduler.Schedule() -> ConstructSubGraphs()
+  MS_LOG(DEBUG) << "create SubGraph";
+  std::vector<LiteKernel *> kernels{kernel};
+  auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels);
+  if (sub_graph == nullptr) {
+    std::cerr << "create SubGraphOpenCLKernel failed" << std::endl;
+    FAIL();  // a plain return here would let the test pass without having run the kernel
+  }
+
+  // simulating benchmark:  session_->CompileGraph() -> PrepareKernels() -> SubGraphOpenCLKernel.Prepare()
+  MS_LOG(DEBUG) << "call sub_graph->Prepare()";
+  EXPECT_TRUE(sub_graph->Prepare() == RET_OK);  // will set Tensor's allocator be OpenCLAllocator
+
+  // simulating benchmark:  model->Free(), clear weight data in input_infos
+  std::vector<std::unique_ptr<uint8_t[]>> saved_weights;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto *tensor = &tensors[i];
+    if (tensor->category() != VAR) {
+      saved_weights.emplace_back(new uint8_t[tensor->Size()]);
+      auto *weight_data = std::get<1>(input_infos[i]);
+      memcpy(saved_weights.back().get(), weight_data, tensor->Size());
+      srand(time(nullptr));
+      memset(weight_data, rand(), tensor->Size());
+    }
+  }
+
+  // simulating benchmark: LoadInput()
+  MS_LOG(DEBUG) << "malloc and init input data";
+  for (auto input : subgraph_inputs) {
+    EXPECT_TRUE(input->MutableData() != nullptr);  // malloc Image2D & call MapBuffer()
+    memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
+  }
+
+  // simulating benchmark:  MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> SubGraphOpenCLKernel->Run()
+  MS_LOG(DEBUG) << "run SubGraph & compare result";
+  EXPECT_TRUE(sub_graph->Run() == RET_OK);  // will call UnmapBuffer() for input
+
+  // check result
+  ocl_runtime->GetAllocator()->MapBuffer(output.data_c(), CL_MAP_READ, nullptr, true);
+  CompareOutput<float>(output.data_c(), expect_data, output.ElementsNum(), atol, rtol, print_data);
+  ocl_runtime->GetAllocator()->UnmapBuffer(output.data_c());
+
+  MS_LOG(DEBUG) << "release resources";
+  for (size_t i = 0, j = 0; i < tensors.size(); ++i) {  // resume weight data to input_infos
+    auto *tensor = &tensors[i];
+    if (tensor->category() == VAR) continue;
+    if (packed_op.count(primitive_type)) tensor->set_data(nullptr);  // packed weights pointed at the caller's buffer
+    memcpy(std::get<1>(input_infos[i]), saved_weights[j++].get(), tensor->Size());
+  }
+  delete sub_graph;
+}
+
+void TestMain(const std::vector<ArgsTuple> &input_infos, std::tuple<std::vector<int>, float *> output_info,
+              OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) {
+  // Convenience overload: tag every input as kNumberTypeFloat32 and defer to the dtype-aware TestMain.
+  std::vector<ArgsTupleWithDtype> typed_infos;
+  for (const auto &info : input_infos) {
+    typed_infos.emplace_back(std::get<0>(info), std::get<1>(info), std::get<2>(info), kNumberTypeFloat32);
+  }
+  TestMain(typed_infos, output_info, op_parameter, fp16_enable, atol, rtol, print_data);
+}
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h
new file mode 100644
index 0000000000..75cc0186d3
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h
@@ -0,0 +1,102 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_TEST_UT_SRC_RUNTIME_KERNEL_OPENCL_COMMON_H_
+#define MINDSPORE_LITE_TEST_UT_SRC_RUNTIME_KERNEL_OPENCL_COMMON_H_
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include <tuple>
+#include <map>
+#include <memory>
+#include "nnacl/op_base.h"
+#include "ir/dtype/type_id.h"
+#include "src/tensor.h"
+#include "src/common/file_utils.h"
+#include "common/common_test.h"
+
+using Tensor = mindspore::lite::Tensor;
+using ArgsTuple = std::tuple<std::vector<int>, void *, Tensor::Category>;
+using ArgsTupleWithDtype = std::tuple<std::vector<int>, void *, Tensor::Category, mindspore::TypeId>;
+constexpr Tensor::Category VAR = Tensor::VAR;
+constexpr Tensor::Category CONST_TENSOR = Tensor::Category::CONST_TENSOR;
+constexpr Tensor::Category CONST_SCALAR = Tensor::Category::CONST_SCALAR;
+
+namespace mindspore::lite::opencl::test {
+
+// Checks |output - expect| <= atol + rtol * |expect| element by element; prints a summary and fails the running
+// gtest on any mismatch. Equal values always match; a NaN on either side never does.
+template <typename T>
+void CompareOutput(void *output, void *expect, size_t elem_num, T atol, float rtol = 1e-9, bool print_data = false) {
+  T *output_data = reinterpret_cast<T *>(output);
+  T *expect_data = reinterpret_cast<T *>(expect);
+  if (print_data) {
+    for (size_t i = 0; i < elem_num; ++i) {
+      printf("%zu: expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
+    }
+  }
+
+  size_t mismatch_num = 0;
+  size_t first_err_idx = 0;
+  for (size_t i = 0; i < elem_num; ++i) {
+    auto delta = static_cast<float>(std::fabs(output_data[i] - expect_data[i]));
+    auto tolerance = static_cast<float>(atol + rtol * std::fabs(expect_data[i]));
+    if (output_data[i] != expect_data[i] && !(delta <= tolerance)) {  // negated <= so a NaN delta is counted too
+      if (mismatch_num++ == 0) first_err_idx = i;
+    }
+  }
+  if (mismatch_num > 0) {
+    printf("(mismatch %4.1f%%)\n", 100 * static_cast<float>(mismatch_num) / elem_num);
+    printf("Not equal to tolerance atol=%.0e, rtol=%.0e\n", atol, rtol);
+    printf("first error at idx=%zu expect=%.1f output=%.1f\n", first_err_idx, expect_data[first_err_idx],
+           output_data[first_err_idx]);
+    FAIL();
+  }
+}
+
+// File-based variant: reads the expected values from file_path; fails if the file is missing or too short.
+template <typename T>
+void CompareOutput(Tensor *output_tensor, const std::string &file_path, float atol, float rtol = 1e-9) {
+  size_t output_size = 0;
+  auto expect_data = lite::ReadFile(file_path.c_str(), &output_size);
+  ASSERT_TRUE(expect_data != nullptr && output_size >= output_tensor->ElementsNum() * sizeof(T));
+  CompareOutput<T>(output_tensor->data_c(), expect_data, output_tensor->ElementsNum(), atol, rtol);
+}
+
+// Allocates a zero-filled parameter struct of type T and stamps op_parameter_.type_ with `type`.
+template <typename T>
+T *CreateParameter(schema::PrimitiveType type) {
+  auto *param = static_cast<T *>(calloc(1, sizeof(T)));
+  if (param != nullptr) {
+    param->op_parameter_.type_ = type;
+    return param;
+  }
+  MS_LOG(ERROR) << std::string("create Parameter failed for ") + schema::EnumNamePrimitiveType(type) << std::endl;
+  return nullptr;
+}
+
+void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info,
+              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
+              bool print_output = false);
+
+void TestMain(const std::vector<ArgsTuple> &input_infos, std::tuple<std::vector<int>, float *> output_info,
+              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
+              bool print_output = false);
+
+}  // namespace mindspore::lite::opencl::test
+
+#endif  // MINDSPORE_LITE_TEST_UT_SRC_RUNTIME_KERNEL_OPENCL_COMMON_H_
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
index b2337d7dea..d2ec96c396 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
@@ -13,513 +13,35 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/concat_parameter.h"
 
-namespace mindspore {
-class TestConcatOpenCLfp32 : public mindspore::CommonTest {
- public:
-  TestConcatOpenCLfp32() {}
-};
-class TestConcatOpenCLfp16 : public mindspore::CommonTest {
- public:
-  TestConcatOpenCLfp16() {}
-};
+namespace mindspore::lite::opencl::test {
 
-class TestConcatOpenCLCI : public mindspore::CommonTest {
- public:
-  TestConcatOpenCLCI() {}
-};
+class TestOpenCL_Concat : public CommonTest {};
 
-TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  constexpr int INPUT_NUM = 2;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {std::vector<int>{1, 1, 1, 8}, std::vector<int>{1, 1, 1, 8}};
-  std::vector<int> output_shape = {2, 1, 1, 8};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f};
-  float input_data2[] = {0.5f, 0.6f, 0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f};
-  float correctOutput[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f,  0.59f, 0.03f, 0.37f,
-                           0.5f,  0.6f,  0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f};
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs;
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<ConcatParameter *>(malloc(sizeof(ConcatParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 0;
-  auto *concat_kernel =
-    new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (concat_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  concat_kernel->Init();
-  // to do allocate memory for inputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{concat_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete concat_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
+namespace {
+// PrimitiveType_Concat: src/ops/populate/concat_populate.cc. Returns nullptr if the parameter cannot be allocated.
+OpParameter *CreateParameter(int axis) {
+  auto *param = test::CreateParameter<ConcatParameter>(schema::PrimitiveType_Concat);
+  if (param != nullptr) param->axis_ = axis;  // test::CreateParameter yields nullptr on malloc failure
+  return reinterpret_cast<OpParameter *>(param);
+}
+}  // namespace
 
-TEST_F(TestConcatOpenCLfp16, ConcatFp16_4input_dim4_axis1) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, input2_size, input3_size, input4_size, output_size;
-  std::string input1Ppath = "./test_data/concatfp16_input1.bin";
-  std::string input2Ppath = "./test_data/concatfp16_input2.bin";
-  std::string input3Ppath = "./test_data/concatfp16_input3.bin";
-  std::string input4Ppath = "./test_data/concatfp16_input4.bin";
-  std::string correctOutputPath = "./test_data/concatfp16_output.bin";
-  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
-  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
-  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
-  auto correctOutput =
-    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-
-  MS_LOG(INFO) << " init tensors ";
-  constexpr int INPUT_NUM = 4;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 19, 19, 96}, std::vector<int>{1, 19, 19, 96}, std::vector<int>{1, 19, 19, 96},
-    std::vector<int>{1, 19, 19, 96}};
-  std::vector<int> output_shape = {1, 76, 19, 96};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  std::vector<lite::Tensor *> inputs;
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    return;
-  }
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<ConcatParameter *>(malloc(sizeof(ConcatParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 1;
-  auto *concat_kernel =
-    new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (concat_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  concat_kernel->Init();
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{concat_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete concat_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  if (inputs.size() == 2) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-  } else if (inputs.size() == 3) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-  } else if (inputs.size() == 4) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-    memcpy(inputs[3]->data_c(), input_data4, input4_size);
-  } else {
-    MS_LOG(ERROR) << " input size must be 2 or 3 or 4";
-  }
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
-}
-
-TEST_F(TestConcatOpenCLfp32, ConcatFp32_3input_dim4_axis1) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, input2_size, input3_size, output_size;
-  std::string input1Ppath = "./test_data/concatfp32_input1.bin";
-  std::string input2Ppath = "./test_data/concatfp32_input2.bin";
-  std::string input3Ppath = "./test_data/concatfp32_input3.bin";
-  std::string correctOutputPath = "./test_data/concatfp32_output.bin";
-  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
-  auto input_data3 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
-  auto correctOutput = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-
-  MS_LOG(INFO) << " init tensors ";
-  constexpr int INPUT_NUM = 3;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}};
-  std::vector<int> output_shape = {1, 48, 256, 80};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  std::vector<lite::Tensor *> inputs;
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    return;
-  }
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  MS_LOG(INFO) << " input_shapes size=: " << input_shapes.size();
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<ConcatParameter *>(malloc(sizeof(ConcatParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 1;
-  auto *concat_kernel =
-    new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (concat_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  concat_kernel->Init();
-  // to do allocate memory for inputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{concat_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete concat_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  if (inputs.size() == 2) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-  } else if (inputs.size() == 3) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-  } else {
-    MS_LOG(ERROR) << " input size must be 2 or 3 ";
-  }
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
-}
-
-TEST_F(TestConcatOpenCLfp16, ConcatFp16_6input_dim4_axis1) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, output_size;
-  std::string input1Ppath = "./test_data/concatfp16_input1.bin";
-  std::string input2Ppath = "./test_data/concatfp16_input2.bin";
-  std::string input3Ppath = "./test_data/concatfp16_input3.bin";
-  std::string input4Ppath = "./test_data/concatfp16_input4.bin";
-  std::string input5Ppath = "./test_data/concatfp16_input5.bin";
-  std::string input6Ppath = "./test_data/concatfp16_input6.bin";
-  std::string correctOutputPath = "./test_data/concatfp16_output.bin";
-  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
-  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
-  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
-  auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
-  auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
-  auto correctOutput =
-    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-
-  MS_LOG(INFO) << " init tensors ";
-  constexpr int INPUT_NUM = 6;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 1200, 3, 4}, std::vector<int>{1, 600, 3, 4}, std::vector<int>{1, 150, 3, 4},
-    std::vector<int>{1, 50, 3, 4},   std::vector<int>{1, 30, 3, 4},  std::vector<int>{1, 4, 3, 4}};
-  std::vector<int> output_shape = {1, 2034, 3, 4};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  std::vector<lite::Tensor *> inputs;
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    return;
-  }
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<ConcatParameter *>(malloc(sizeof(ConcatParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ConcatParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 1;
-  auto *concat_kernel =
-    new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (concat_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  concat_kernel->Init();
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{concat_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete concat_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  if (inputs.size() == 2) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-  } else if (inputs.size() == 3) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-  } else if (inputs.size() == 4) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-    memcpy(inputs[3]->data_c(), input_data4, input4_size);
-  } else if (inputs.size() == 6) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-    memcpy(inputs[3]->data_c(), input_data4, input4_size);
-    memcpy(inputs[4]->data_c(), input_data5, input5_size);
-    memcpy(inputs[5]->data_c(), input_data6, input6_size);
-  } else {
-    MS_LOG(ERROR) << " input size must be 2 or 3 or 4";
-  }
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
+TEST_F(TestOpenCL_Concat, input2_axis0) {
+  std::vector<int> input0_shape = {1, 1, 1, 8};
+  std::vector<int> input1_shape = {1, 1, 1, 8};
+  std::vector<int> output_shape = {2, 1, 1, 8};
+  int axis = 0;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
   }
-  delete sub_graph;
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc
new file mode 100644
index 0000000000..cb9fdec16d
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc
@@ -0,0 +1,272 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/conv_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_Conv2D : public CommonTest {};
+
+namespace {
+// PrimitiveType_Conv2D: src/ops/populate/conv2d_populate.cc
+ConvParameter *CreateParameter(const std::string &attr, ActType act_type) {
+  auto *param = test::CreateParameter<ConvParameter>(schema::PrimitiveType_Conv2D);
+  param->act_type_ = act_type;
+  sscanf(attr.c_str(),
+         "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
+         "dilationHW_%dx%d",
+         &param->input_batch_, &param->input_h_, &param->input_w_, &param->input_channel_, &param->output_batch_,
+         &param->output_h_, &param->output_w_, &param->output_channel_, &param->kernel_h_, &param->kernel_w_,
+         &param->stride_h_, &param->stride_w_, &param->pad_u_, &param->pad_d_, &param->pad_l_, &param->pad_r_,
+         &param->dilation_h_, &param->dilation_w_);
+  return param;
+}
+}  // namespace
+
+void TestMain_Conv2D(const std::string &attr, float *input_data, float *weight_data, float *bias_data,
+                     float *output_data, ActType act_type, bool fp16_enable, float atol = 1e-9) {
+  auto *param = CreateParameter(attr, act_type);
+  std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
+  std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
+  std::vector<int> bias_shape = {param->output_channel_};
+  std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
+  std::vector<ArgsTuple> input_infos = {{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}};
+  if (bias_data) {
+    input_infos.emplace_back(bias_shape, bias_data, CONST_TENSOR);
+  }
+  TestMain(input_infos, {output_shape, output_data}, reinterpret_cast<OpParameter *>(param), fp16_enable, atol);
+}
+
+TEST_F(TestOpenCL_Conv2D, test0) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
+  // All-ones 1x1 weights, zero bias: each output channel is the per-pixel sum of the two input channels.
+  float input_data[] = {0, 1, 2, 3, 4, 5, -6, -7};
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1};
+  float bias_data[] = {0, 0};
+
+  float output_data[] = {1, 1, 5, 5, 9, 9, -13, -13};
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, true, 1e-6f);
+
+  float output_data_relu[] = {1, 1, 5, 5, 9, 9, 0, 0};  // same sums, negatives clamped to 0
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data_relu, ActType_Relu, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data_relu, ActType_Relu, true, 1e-6f);
+
+  float output_data_relu6[] = {1, 1, 5, 5, 6, 6, 0, 0};  // same sums, clamped to the range [0, 6]
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data_relu6, ActType_Relu6, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data_relu6, ActType_Relu6, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test0_no_bias) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1};
+  float output_data[] = {1, 1, 5, 5, 9, 9, 13, 13};
+  TestMain_Conv2D(attr, input_data, weight_data, nullptr, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, nullptr, output_data, ActType_No, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test1) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  float bias_data[] = {0.5, -0.5};
+  float output_data[] = {2.5, 3.5, 8.5, 17.5, 14.5, 31.5, 20.5, 45.5};
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test2) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1};
+  float bias_data[] = {0};
+  float output_data[] = {28, 18, 22, 13};
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test3) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float bias_data[] = {0.5, -0.5};
+  float output_data[] = {168.5, 391.5, 80.5, 223.5, 60.5, 235.5, 20.5, 123.5};
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test3_batch2) {
+  std::string attr =
+    "inputNHWC_2x2x2x2_outputNHWC_2x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float bias_data[] = {0.5, -0.5};
+  float output_data[] = {168.5, 391.5, 80.5, 223.5, 60.5, 235.5, 20.5, 123.5,
+                         168.5, 391.5, 80.5, 223.5, 60.5, 235.5, 20.5, 123.5};
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, false, 1e-3f);
+  TestMain_Conv2D(attr, input_data, weight_data, bias_data, output_data, ActType_No, true, 1e-6f);
+}
+
+TEST_F(TestOpenCL_Conv2D, test4) {
+  std::vector<std::tuple<std::string, std::string, std::vector<float>, std::vector<float>, std::vector<float>,
+                         std::vector<float>, ActType>>
+    cases = {
+      {"SimpleTestFloat32WithAnisotropicStrides",
+       "inputNHWC_1x3x6x1_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x3_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4, 5, 4, 3, -3, -4, -5},
+       {1, 2, 3, 4},
+       {-1},
+       {30, -24, 40, -34},
+       ActType_No},
+      {"SimpleTestFloat32",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x2x3_kernelHW_2x2_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3},
+       ActType_No},
+      {"SimpleTestFloat32SingleThreaded",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x2x3_kernelHW_2x2_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3},
+       ActType_No},
+      {"SimpleTestFloat32WithChannels",
+       "inputNHWC_2x2x4x2_outputNHWC_2x1x2x3_kernelHW_2x2_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1,   1,   1, 1, 1,   1,   1, 1,
+        0.5, 0.5, 1,   1,   1.5, 1.5, 2,   2,   0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2},
+       {1, 1, 2, 2, 3, 3, 4, 4, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3},
+       ActType_No},
+      {"InputAndweightSameWidthHeight",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x1x1_kernelHW_2x4_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4},
+       {1, 2, 3, 4, -1, -1, 1, 1},
+       {0},
+       {10, 34},
+       ActType_No},
+      {"ActivationRelu6Test",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x2x3_kernelHW_2x2_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {6, 2, 5, 6, 2, 5, 6, 4, 3, 6, 4, 3},
+       ActType_Relu6},
+      {"StrideTest",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x3x3_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 3, 2, 1, 2, 3, 4, 1, 2, 4, 4},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 22, 3, 6, 21, 1, 6, 17, 4, 3, 31, 5, 4, 40, 3, 4},
+       ActType_No},
+      {"PaddingTest",
+       "inputNHWC_1x2x4x1_outputNHWC_1x2x4x3_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 3, 2},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 22, 3, 6, 21, 1, 6, 8, -1, 4, 7, 2, -1, 9, 3, -2, 8, 1, -2, 3, 0, 1},
+       ActType_No},
+      {"PointwiseFloat32",
+       "inputNHWC_2x2x4x2_outputNHWC_2x2x4x1_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1,   1,   1, 1, 1,   1,   1, 1,
+        0.5, 0.5, 1,   1,   1.5, 1.5, 2,   2,   0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2},
+       {1, 2},
+       {0},
+       {1.5, 1.5, 1.5, 1.5, 3, 3, 3, 3, 1.5, 3, 4.5, 6, 1.5, 3, 4.5, 6},
+       ActType_No},
+      {"SimpleTestFloat32WithAnisotropicStrides",
+       "inputNHWC_1x3x6x1_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x3_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4, 5, 4, 3, -3, -4, -5},
+       {1, 2, 3, 4},
+       {-1},
+       {30, -24, 40, -34},
+       ActType_No},
+      {"HandCalculatedFloat32",
+       "inputNHWC_1x3x4x1_outputNHWC_1x3x4x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_1x1",
+       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+       {1, 4, 7, 2, 5, 8, 3, 6, 9},
+       {0},
+       {105, 150, 183, 95, 235, 312, 357, 178, 187, 234, 261, 121},
+       ActType_No},
+      {"HandCalculatedFloat32WithConstweight",
+       "inputNHWC_1x3x4x1_outputNHWC_1x3x4x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_1x1",
+       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+       {1, 4, 7, 2, 5, 8, 3, 6, 9},
+       {0},
+       {105, 150, 183, 95, 235, 312, 357, 178, 187, 234, 261, 121},
+       ActType_No},
+      {"HandCalculatedWithBiasFloat32",
+       "inputNHWC_1x3x4x1_outputNHWC_1x3x4x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_1x1",
+       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+       {1, 4, 7, 2, 5, 8, 3, 6, 9},
+       {10},
+       {115, 160, 193, 105, 245, 322, 367, 188, 197, 244, 271, 131},
+       ActType_No},
+      {"HandCalculatedWithReluFloat32",
+       "inputNHWC_1x3x4x1_outputNHWC_1x3x4x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_1x1",
+       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+       {1, 4, 7, 2, 5, 8, 3, 6, 9},
+       {-200},
+       {0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0},
+       ActType_Relu},
+      {"HandCalculatedValidFloat32",
+       "inputNHWC_1x3x4x1_outputNHWC_1x1x2x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+       {1, 4, 7, 2, 5, 8, 3, 6, 9},
+       {0},
+       {312, 357},
+       ActType_No},
+      {"SimpleTestFloatWithDilation",
+       "inputNHWC_1x9x9x1_outputNHWC_1x3x3x1_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_3x3",
+       {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {1, 2, 3, 4, 5, 6, 7, 8, 9},
+       {0},
+       {5, 5, 5, 5, 5, 5, 5, 5, 5},
+       ActType_No},
+      {"SimpleTestQuantizedOutputMultiplierGreaterThan1",
+       "inputNHWC_2x2x4x1_outputNHWC_2x1x2x3_kernelHW_2x2_strideHW_2x2_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1",
+       {1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4},
+       {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1},
+       {1, 2, 3},
+       {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3},
+       ActType_No},
+    };
+
+  for (auto &case_ : cases) {
+    auto &name = std::get<0>(case_);
+    auto &attr = std::get<1>(case_);
+    auto input_data = std::get<2>(case_).data();
+    auto weight_data = std::get<3>(case_).data();
+    auto bias_data = std::get<4>(case_).data();
+    auto expect_data = std::get<5>(case_).data();
+    auto act_type = std::get<6>(case_);
+    std::cout << name << std::endl;
+    TestMain_Conv2D(attr, input_data, weight_data, bias_data, expect_data, act_type, false);
+    TestMain_Conv2D(attr, input_data, weight_data, bias_data, expect_data, act_type, true);
+  }
+}
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
index 0f82879c47..cff19fb617 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
@@ -13,161 +13,64 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/conv_parameter.h"
 
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
-#include "src/common/log_adapter.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+namespace mindspore::lite::opencl::test {
 
-namespace mindspore {
-class TestConv2dTransposeOpenCL : public mindspore::CommonTest {
- public:
-  TestConv2dTransposeOpenCL() {}
-};
+class TestOpenCL_Conv2dTranspose : public CommonTest {};  // fixture used by the TEST_F cases in this file
+
+namespace {
+// PrimitiveType_DeConv2D: src/ops/populate/deconv2d_populate.cc. Fills a ConvParameter and the four tensor shapes.
+OpParameter *CreateParameter(int n, int h, int w, int ci, int co, int kh, int kw, int pad,  // in: dims and padding
+                             std::vector<int> *input_shape, std::vector<int> *weight_shape,  // out: shapes
+                             std::vector<int> *bias_shape, std::vector<int> *output_shape) {
+  auto *param = test::CreateParameter<ConvParameter>(schema::PrimitiveType_DeConv2D);
+  param->kernel_h_ = kh;
+  param->kernel_w_ = kw;
+  param->stride_h_ = 2;  // stride is hard-coded to 2; the oh/ow formulas below appear to assume it
+  param->stride_w_ = 2;
+  param->pad_u_ = pad;  // the same padding is applied on all four sides
+  param->pad_d_ = pad;
+  param->pad_l_ = pad;
+  param->pad_r_ = pad;
+  param->dilation_h_ = 1;
+  param->dilation_w_ = 1;
+  param->act_type_ = ActType_No;
 
-void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data, void *weight_data, void *bias_data,
-                                void *output_data, bool enable_fp16) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  int pad = shape[0];
-  int n = shape[1];
-  int h = shape[2];
-  int w = shape[3];
-  int kh = shape[4];
-  int kw = shape[5];
-  int ci = shape[6];
-  int co = shape[7];
   int oh = 2 * h - 1 + 2 * (kh - 1 - pad) - kh + 1;
   int ow = 2 * w - 1 + 2 * (kw - 1 - pad) - kw + 1;
-  std::vector<int> input_shape = {n, h, w, ci};
-  auto tensor_x_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-
-  std::vector<int> weight_shape = {co, kh, kw, ci};
-  auto tensor_w_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape);
-  auto tensor_w = tensor_w_ptr.get();
-  if (tensor_w == nullptr) {
-    MS_LOG(ERROR) << "tensor_w create error.";
-    return;
-  }
-  tensor_w->set_data(weight_data);
-
-  std::vector<int> bias_shape = {co};
-  auto tensor_bias_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), bias_shape);
-  auto tensor_bias = tensor_bias_ptr.get();
-  if (tensor_bias == nullptr) {
-    MS_LOG(ERROR) << "tensor_bias create error.";
-    return;
-  }
-  tensor_bias->set_data(bias_data);
-
-  std::vector<int> out_shape = {1, oh, ow, co};
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto opParameter = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  if (opParameter == nullptr) {
-    MS_LOG(ERROR) << "opParameter create error.";
-    return;
-  }
-  opParameter->kernel_h_ = kh;
-  opParameter->kernel_w_ = kw;
-  opParameter->stride_h_ = 2;
-  opParameter->stride_w_ = 2;
-  opParameter->pad_u_ = pad;
-  opParameter->pad_l_ = pad;
-  opParameter->input_channel_ = ci;
-  opParameter->output_channel_ = co;
-  auto op_kernel = kernel::OpenCLKernelCreator<kernel::Conv2dTransposeOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr);
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  op_kernel->set_name("DeConv");
-
-  inputs[0]->MallocData(allocator);
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, n * h * w * ci * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, n * oh * ow * co, static_cast<float16_t>(1e-3), 2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, n * oh * ow * co, static_cast<float>(1e-5));
-  }
-
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
+  *input_shape = {n, h, w, ci};
+  *weight_shape = {co, kh, kw, ci};
+  *bias_shape = {co};
+  *output_shape = {n, oh, ow, co};  // output batch tracks the input batch n instead of a literal 1
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) {
-  int pad = 0;
+TEST_F(TestOpenCL_Conv2dTranspose, test0) {  // n=1, h=w=2, ci=2, co=1, 2x2 kernel, pad 0
   int n = 1;
   int h = 2;
   int w = 2;
-  int kh = 2;
-  int kw = 2;
   int ci = 2;
   int co = 1;
-  std::vector<int> shape = {pad, n, h, w, kh, kw, ci, co};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  std::vector<float> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
-  std::vector<float> bias_data = {0.5f};
-  std::vector<float> output_data = {5.5f,  6.5f,  17.5f, 22.5f, 7.5f,  8.5f,  27.5f, 32.5f,
-                                    29.5f, 38.5f, 41.5f, 54.5f, 47.5f, 56.5f, 67.5f, 80.5f};
-  RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false);
-}
-
-TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) {
-  int pad = 0;
-  int n = 1;
-  int h = 2;
-  int w = 2;
   int kh = 2;
   int kw = 2;
-  int ci = 2;
-  int co = 1;
-  std::vector<int> shape = {pad, n, h, w, kh, kw, ci, co};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  std::vector<float16_t> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
-  std::vector<float16_t> bias_data = {0.5f};
-  std::vector<float16_t> output_data = {5.5f,  6.5f,  17.5f, 22.5f, 7.5f,  8.5f,  27.5f, 32.5f,
-                                        29.5f, 38.5f, 41.5f, 54.5f, 47.5f, 56.5f, 67.5f, 80.5f};
+  int pad = 0;
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};  // n * h * w * ci = 8 values
+  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8};  // co * kh * kw * ci = 8 values
+  float bias_data[] = {0.5};  // one value per output channel (co = 1)
+  float output_data[] = {5.5, 6.5, 17.5, 22.5, 7.5, 8.5, 27.5, 32.5, 29.5, 38.5, 41.5, 54.5, 47.5, 56.5, 67.5, 80.5};
 
-  RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true);
+  for (auto fp16_enable : {false, true}) {  // the same float data is run once per fp16_enable value
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;  // filled in by CreateParameter
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
deleted file mode 100644
index 3a0f10b389..0000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
-#include "nnacl/pack.h"
-
-using mindspore::kernel::ConvolutionOpenCLKernel;
-using mindspore::kernel::LiteKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
-using mindspore::lite::Tensor;
-using mindspore::schema::Format;
-using mindspore::schema::NodeType_ValueNode;
-using mindspore::schema::Format::Format_KHWC;
-using mindspore::schema::Format::Format_NHWC;
-
-namespace mindspore {
-
-class TestConvolutionOpenCL : public mindspore::CommonTest {};
-
-void LoadData(Tensor *tensor, const float *src) {
-  if (tensor->data_type() == kNumberTypeFloat16) {
-    auto num = tensor->Size() / sizeof(float16_t);
-    auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
-    for (int i = 0; i < num; ++i) {
-      tensor_data[i] = static_cast<float16_t>(src[i]);
-    }
-  } else {
-    memcpy(tensor->data_c(), src, tensor->Size());
-  }
-}
-
-void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
-  auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);
-  std::vector<float> output_data(num);
-  if (output->data_type() == kNumberTypeFloat16) {
-    for (int i = 0; i < output_data.size(); ++i) {
-      output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);
-    }
-  } else {
-    memcpy(output_data.data(), output->data_c(), output->Size());
-  }
-
-  printf("output:");
-  for (int i = 0; i < std::min(10, output->ElementsNum()); i++) {
-    printf("%7.3f  ", output_data[i]);
-  }
-  printf("\n");
-
-  bool not_equal = false;
-  int idx = 0;
-  std::array<int, 4> idx_4d{};
-  auto N = output->Batch(), H = output->Height(), W = output->Width(), C = output->Channel();
-  for (int i = 0, cn = 0; i < N; ++i) {
-    for (int j = 0; j < H; ++j) {
-      for (int k = 0; k < W; ++k) {
-        for (int l = 0; l < C; ++l) {
-          auto err = std::fabs(output_data[cn] - expect_data[cn]);
-          if (err > atol) {
-            not_equal = true;
-            idx_4d = {i, j, k, l};
-            goto End;
-          }
-          cn++;
-        }
-      }
-    }
-  }
-
-End:
-  if (not_equal) {
-    printf("first error at [%d %d %d %d] expect=%.3f output=%.3f\n", idx_4d[0], idx_4d[1], idx_4d[2], idx_4d[3],
-           expect_data[idx], output_data[idx]);
-    FAIL();
-  } else {
-    printf("COMPARE SUCCESS!\n\n");
-  }
-}
-
-void TEST_MAIN(const std::string &attr, const TypeId data_type, const float atol, const float *input_data,
-               const float *weight_data, const float *bias_data, const float *expect_data) {
-  auto param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "ConvParameter create error.";
-    return;
-  }
-  sscanf(attr.c_str(),
-         "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
-         "dilationHW_%dx%d",
-         &param->input_batch_, &param->input_h_, &param->input_w_, &param->input_channel_, &param->output_batch_,
-         &param->output_h_, &param->output_w_, &param->output_channel_, &param->kernel_h_, &param->kernel_w_,
-         &param->stride_h_, &param->stride_w_, &param->pad_u_, &param->pad_d_, &param->pad_l_, &param->pad_r_,
-         &param->dilation_h_, &param->dilation_w_);
-
-  MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = runtime_wrapper.GetInstance();
-  ocl_runtime->Init();
-  ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16);
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(DEBUG) << "create Tensors";
-  std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
-  std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
-  std::vector<int> bias_shape = {param->output_channel_};
-  std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
-  auto input = Tensor(data_type, input_shape, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto weight = Tensor(data_type, weight_shape, Format_KHWC, lite::Tensor::CONST_TENSOR);
-  auto bias = Tensor(data_type, bias_shape, Format_KHWC, lite::Tensor::CONST_TENSOR);
-  auto output = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::CONST_TENSOR);
-
-  MS_LOG(DEBUG) << "allocate memory and initialize weight/bias";
-  weight.MallocData();
-  LoadData(&weight, weight_data);
-  if (bias_data) {
-    bias.MallocData();
-    LoadData(&bias, bias_data);
-  }
-
-  MS_LOG(DEBUG) << "create OpenCL Kernel";
-  std::vector<lite::Tensor *> inputs{&input, &weight};
-  if (bias_data) {
-    inputs.push_back(&bias);
-  }
-  std::vector<lite::Tensor *> outputs{&output};
-  auto kernel = std::make_unique<ConvolutionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  kernel->Init();
-
-  MS_LOG(DEBUG) << "create SubGraph";
-  std::vector<kernel::LiteKernel *> kernels{kernel.release()};
-  auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input}, {&output}, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    return;
-  }
-  input.MallocData(allocator);
-  sub_graph->Init();
-  LoadData(&input, input_data);
-  sub_graph->Run();
-  CompareOutput(&output, expect_data, atol);
-
-  MS_LOG(DEBUG) << "release resources";
-  weight.FreeData();
-  if (bias_data) {
-    bias.FreeData();
-  }
-  delete sub_graph;
-}
-
-TEST_F(TestConvolutionOpenCL, test0) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  float bias_data[] = {0.0f, 0.0f};
-  float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, test0_no_bias) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, nullptr, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, nullptr, expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, test1) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
-  float bias_data[] = {0.5f, -0.5f};
-  float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, test2) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  float bias_data[] = {0.0f};
-  float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, test3) {
-  std::string attr =
-    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                         9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-  float bias_data[] = {0.5f, -0.5f};
-  float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
-}
-
-TEST_F(TestConvolutionOpenCL, test3_batch2) {
-  std::string attr =
-    "inputNHWC_2x2x2x2_outputNHWC_2x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
-  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  float weight_data[] = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                         9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-  float bias_data[] = {0.5f, -0.5f};
-  float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f,
-                         168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
-  TEST_MAIN(attr, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
-  TEST_MAIN(attr, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
-}
-
-}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
index 3081929a76..4cd1c22236 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@@ -13,154 +13,51 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "nnacl/pack.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h"
-
-namespace mindspore {
-class TestConvolutionDwOpenCL : public mindspore::CommonTest {
- public:
-  TestConvolutionDwOpenCL() {}
-};
-
-template <class T1, class T2>
-void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_data, T2 *gnd_data, schema::Format format,
-                       TypeId dtype = kNumberTypeFloat32, bool is_compare = true, T2 err_max = 1e-5) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-  if (dtype == kNumberTypeFloat16) {
-    ocl_runtime->SetFp16Enable(true);
-  }
-
-  // pack input
-  int input_size = conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
-  std::function<T2(T2)> to_dtype = [](T2 x) -> T2 { return x; };
-
-  // pack weight
-  int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  T1 *packed_weight = weight_data;
-
-  // T1 bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  T1 bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  size_t output_size =
-    conv_param->output_batch_ * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_;
-
-  std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
-  std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out;
-  std::vector<int> shape_in;
-  if (format == schema::Format_NHWC || format == schema::Format_NHWC4 || format == schema::Format_NC4HW4) {
-    shape_in = std::vector<int>(
-      {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, conv_param->input_channel_});
-    shape_out = std::vector<int>(
-      {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, conv_param->output_channel_});
-  } else if (format == schema::Format_NCHW) {
-    shape_in = std::vector<int>(
-      {conv_param->input_batch_, conv_param->input_channel_, conv_param->input_h_, conv_param->input_w_});
-    shape_out = std::vector<int>(
-      {conv_param->output_batch_, conv_param->output_channel_, conv_param->output_h_, conv_param->output_w_});
-  } else {
-    MS_LOG(ERROR) << "Unsupported format: " << format;
-    return;
-  }
-  auto tensor_a = lite::Tensor(TypeId(dtype), shape_in, format);
-  auto tensor_b = lite::Tensor(TypeId(dtype), shape_filter, schema::Format_NHWC);
-  auto tensor_c = lite::Tensor(TypeId(dtype), shape_bias, schema::Format_NHWC);
-  auto tensor_d = lite::Tensor(TypeId(dtype), shape_out, format);
-  std::vector<lite::Tensor *> inputs{&tensor_a, &tensor_b, &tensor_c};
-  std::vector<lite::Tensor *> outputs{&tensor_d};
-
-  // freamework to do!!!
-  inputs[1]->set_data(packed_weight);
-  inputs[2]->set_data(bias_data);
-
-  OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param);
-  auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
-  if (pKernel.get() == nullptr) {
-    return;
-  }
-  pKernel->Init();
-
-  std::vector<kernel::LiteKernel *> kernels{pKernel.release()};
-  std::vector<lite::Tensor *> inputs_{&tensor_a};
-  auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  if (pGraph.get() == nullptr) {
-    return;
-  }
-  pGraph->Init();
-
-  // freamework to do!!!
-  inputs[0]->MallocData(allocator);
-  memcpy(inputs[0]->data_c(), input_data, sizeof(T2) * input_size);
-
-  pGraph->Run();
-  if (is_compare) {
-    T2 *output_data = reinterpret_cast<T2 *>(outputs[0]->data_c());
-
-    printf("==================input_data=================\n");
-    std::cout << std::endl;
-    for (int i = 0; i < input_size; i++) {
-      std::cout << input_data[i] << ", ";
-    }
-    std::cout << std::endl;
-    printf("==================weight data=================\n");
-    std::cout << std::endl;
-    for (int i = 0; i < pack_weight_size; i++) {
-      std::cout << packed_weight[i] << ", ";
-    }
-    std::cout << std::endl;
-    printf("==================output data=================\n");
-    std::cout << std::endl;
-    for (int i = 0; i < output_size; i++) {
-      std::cout << output_data[i] << ", ";
-    }
-    std::cout << std::endl;
-    printf("==================expected output data=================\n");
-    for (int i = 0; i < output_size; i++) {
-      std::cout << gnd_data[i] << ", ";
-    }
-    std::cout << std::endl;
-    // compare
-    CommonTest::CompareOutputData<T2>(output_data, gnd_data, output_size, err_max);
-  }
-
-  inputs[1]->set_data(nullptr);
-  inputs[2]->set_data(nullptr);
-  inputs[0]->set_data(nullptr);
-  outputs[0]->set_data(nullptr);
-  return;
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/conv_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_DepthwiseConv2d : public CommonTest {};  // fixture used by the TEST_F cases in this file
+
+namespace {
+// ConvParameter for PrimitiveType_DepthwiseConv2D; cf. src/ops/populate/depthwise_conv2d_populate.cc
+OpParameter *CreateParameter(int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_u, int pad_d, int pad_l,
+                             int pad_r, int dilation_h, int dilation_w, ActType act_type, int input_channel) {
+  auto *conv = test::CreateParameter<ConvParameter>(schema::PrimitiveType_DepthwiseConv2D);
+  conv->input_channel_ = input_channel;
+  conv->act_type_ = act_type;
+  conv->kernel_w_ = kernel_w;  // kernel window
+  conv->kernel_h_ = kernel_h;
+  conv->stride_w_ = stride_w;  // stride
+  conv->stride_h_ = stride_h;
+  conv->dilation_w_ = dilation_w;  // dilation
+  conv->dilation_h_ = dilation_h;
+  conv->pad_l_ = pad_l;  // padding: left/right, then up/down
+  conv->pad_r_ = pad_r;
+  conv->pad_u_ = pad_u;
+  conv->pad_d_ = pad_d;
+  return reinterpret_cast<OpParameter *>(conv);
 }
-
-TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 4;
-    conv_param->input_w_ = 4;
-    conv_param->input_channel_ = 4;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 2;
-    conv_param->output_w_ = 2;
-    conv_param->output_channel_ = 4;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 0;
-    conv_param->pad_l_ = 0;
-  }
-
-  // nhwc
+}  // namespace
+
+TEST_F(TestOpenCL_DepthwiseConv2d, NoPad) {
+  int kernel_h = 3;
+  int kernel_w = 3;
+  int stride_h = 1;
+  int stride_w = 1;
+  int pad_u = 0;
+  int pad_d = 0;
+  int pad_l = 0;
+  int pad_r = 0;
+  int dilation_h = 1;
+  int dilation_w = 1;
+  ActType act_type = ActType_No;
+
+  std::vector<int> input_shape = {1, 4, 4, 4};
+  std::vector<int> output_shape = {1, 2, 2, 4};
+  std::vector<int> weight_shape = {1, kernel_h, kernel_w, output_shape.back()};
+  std::vector<int> bias_shape = {output_shape.back()};
   float input_data[] = {0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
                         0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
                         0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
@@ -169,396 +66,70 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
                         0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
                         0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
                         0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
-
-  // co h w ci
+  float bias_data[] = {0, 0, 0, 0};  // all-zero bias, sized to bias_shape ({4})
   float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
                          0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772,
                          0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051,
                          0.5759465,  0.9292962,  0.31856894, 0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136,
                          0.5865129,  0.02010755, 0.82894003, 0.00469548};
+  float output_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
+                         2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  // pack correct data, nhwc
-  float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
-                      2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
-
-  DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4);
-}
-
-TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 3;
-    conv_param->input_w_ = 3;
-    conv_param->input_channel_ = 5;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 3;
-    conv_param->output_w_ = 3;
-    conv_param->output_channel_ = 5;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 1;
-    conv_param->pad_l_ = 1;
-  }
-
-  // nhwc
-  float input_data[] = {0.5488135,  0.3834415,  0.77815676, 0.9446689, 0.6120957,  0.71518934, 0.79172504, 0.87001216,
-                        0.5218483,  0.616934,   0.60276335, 0.5288949, 0.9786183,  0.41466194, 0.94374806, 0.5448832,
-                        0.56804454, 0.7991586,  0.2645556,  0.6818203, 0.4236548,  0.92559665, 0.46147937, 0.7742337,
-                        0.3595079,  0.6458941,  0.07103606, 0.7805292, 0.45615032, 0.43703195, 0.4375872,  0.0871293,
-                        0.11827443, 0.56843394, 0.6976312,  0.891773,  0.0202184,  0.639921,   0.0187898,  0.06022547,
-                        0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667};
-  // float input_data[]={
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  };
-  // co h w ci
-  float weight_data[] = {0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,
-                         0.10204481, 0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958,
-                         0.11037514, 0.6563296,  0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,
-                         0.09609841, 0.97645944, 0.4686512,  0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696,
-                         0.12019656, 0.2961402,  0.11872772, 0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146,
-                         0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962};
-  // float weight_data[]={
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 };
-  // pack correct data, nhwc
-  float gnd_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
-                      0.54975945, 1.6815965,  1.2690231,  0.60214907, 1.6158017,  0.42115876, 0.8854959, 1.1709145,
-                      1.0929465,  1.3534508,  1.1985044,  1.2932993,  2.4621446,  1.7086457,  2.6977584, 2.1960166,
-                      2.3769147,  2.3185873,  0.6133741,  0.9687358,  0.9987654,  1.0254729,  0.8368954, 0.74171704,
-                      0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
-                      1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
-
-  DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4);
-}
-
-TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 4;
-    conv_param->input_w_ = 4;
-    conv_param->input_channel_ = 4;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 2;
-    conv_param->output_w_ = 2;
-    conv_param->output_channel_ = 4;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 0;
-    conv_param->pad_l_ = 0;
+  for (auto fp16_enable : {false, true}) {  // run the same data once with fp16 disabled, once enabled
+    auto *param = CreateParameter(kernel_h, kernel_w, stride_h, stride_w, pad_u, pad_d, pad_l, pad_r, dilation_h,
+                                  dilation_w, act_type, input_shape.back());  // presumably the input_channel arg
+    TestMain({{input_shape, input_data, VAR},  // final TestMain arg: presumably the error tolerance - confirm
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-2 : 1e-5);  // 1e-2 when fp16
   }
-
-  // nhwc
-  float input_data[] = {0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
-                        0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
-                        0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
-                        0.4375872,  0.46147937, 0.94374806, 0.20887676, 0.891773,   0.7805292,  0.6818203,  0.16130951,
-                        0.96366274, 0.11827443, 0.3595079,  0.6531083,  0.3834415,  0.639921,   0.43703195, 0.2532916,
-                        0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
-                        0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
-                        0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
-
-  // co h w ci
-  float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
-                         0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772,
-                         0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051,
-                         0.5759465,  0.9292962,  0.31856894, 0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136,
-                         0.5865129,  0.02010755, 0.82894003, 0.00469548};
-
-  // pack correct data, nhwc
-  float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
-                      2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
-
-  DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4);
-  // delete conv_param;
 }
 
-TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 3;
-    conv_param->input_w_ = 3;
-    conv_param->input_channel_ = 5;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 3;
-    conv_param->output_w_ = 3;
-    conv_param->output_channel_ = 5;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 1;
-    conv_param->pad_l_ = 1;
-  }
-
-  // nhwc
+TEST_F(TestOpenCL_DepthwiseConv2d, Pad) {  // 3x3 kernel, stride 1, dilation 1, padding of 1 on all four sides
+  int kernel_h = 3;
+  int kernel_w = 3;
+  int stride_h = 1;
+  int stride_w = 1;
+  int pad_u = 1;
+  int pad_d = 1;
+  int pad_l = 1;
+  int pad_r = 1;
+  int dilation_h = 1;
+  int dilation_w = 1;
+  ActType act_type = ActType_No;
+
+  std::vector<int> input_shape = {1, 3, 3, 5};  // back() is handed to CreateParameter below; presumably NHWC
+  std::vector<int> output_shape = {1, 3, 3, 5};  // identical to input_shape
+  std::vector<int> weight_shape = {1, kernel_h, kernel_w, output_shape.back()};  // {1, 3, 3, 5}
+  std::vector<int> bias_shape = {output_shape.back()};  // {5}: one bias value per output_shape.back() entry
   float input_data[] = {0.5488135,  0.3834415,  0.77815676, 0.9446689, 0.6120957,  0.71518934, 0.79172504, 0.87001216,
                         0.5218483,  0.616934,   0.60276335, 0.5288949, 0.9786183,  0.41466194, 0.94374806, 0.5448832,
                         0.56804454, 0.7991586,  0.2645556,  0.6818203, 0.4236548,  0.92559665, 0.46147937, 0.7742337,
                         0.3595079,  0.6458941,  0.07103606, 0.7805292, 0.45615032, 0.43703195, 0.4375872,  0.0871293,
                         0.11827443, 0.56843394, 0.6976312,  0.891773,  0.0202184,  0.639921,   0.0187898,  0.06022547,
                         0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667};
-  // float input_data[]={
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  };
-  // co h w ci
   float weight_data[] = {0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,
                          0.10204481, 0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958,
                          0.11037514, 0.6563296,  0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,
                          0.09609841, 0.97645944, 0.4686512,  0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696,
                          0.12019656, 0.2961402,  0.11872772, 0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146,
                          0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962};
-  // float weight_data[]={
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 };
-  // pack correct data, nhwc
-  float gnd_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
-                      0.54975945, 1.6815965,  1.2690231,  0.60214907, 1.6158017,  0.42115876, 0.8854959, 1.1709145,
-                      1.0929465,  1.3534508,  1.1985044,  1.2932993,  2.4621446,  1.7086457,  2.6977584, 2.1960166,
-                      2.3769147,  2.3185873,  0.6133741,  0.9687358,  0.9987654,  1.0254729,  0.8368954, 0.74171704,
-                      0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
-                      1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
-
-  DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4);
-}
-
-TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 4;
-    conv_param->input_w_ = 4;
-    conv_param->input_channel_ = 4;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 2;
-    conv_param->output_w_ = 2;
-    conv_param->output_channel_ = 4;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 0;
-    conv_param->pad_l_ = 0;
-  }
-
-  // nhwc
-  float16_t input_data[] = {
-    0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
-    0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
-    0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
-    0.4375872,  0.46147937, 0.94374806, 0.20887676, 0.891773,   0.7805292,  0.6818203,  0.16130951,
-    0.96366274, 0.11827443, 0.3595079,  0.6531083,  0.3834415,  0.639921,   0.43703195, 0.2532916,
-    0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
-    0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
-    0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
-
-  // co h w ci
-  float16_t weight_data[] = {
-    0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,  0.9767611,
-    0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772, 0.31798318, 0.41426298,
-    0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962,  0.31856894,
-    0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136, 0.5865129,  0.02010755, 0.82894003, 0.00469548};
-
-  // pack correct data, nhwc
-  float16_t gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
-                          2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
-
-  DepthWiseTestMain<float16_t, float16_t>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4,
-                                          kNumberTypeFloat16, true, 1e-2);
-}
-
-TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
-  auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = 3;
-    conv_param->input_w_ = 3;
-    conv_param->input_channel_ = 5;
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = 3;
-    conv_param->output_w_ = 3;
-    conv_param->output_channel_ = 5;
-    conv_param->kernel_h_ = 3;
-    conv_param->kernel_w_ = 3;
-    conv_param->stride_h_ = 1;
-    conv_param->stride_w_ = 1;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-    conv_param->pad_u_ = 1;
-    conv_param->pad_l_ = 1;
+  float bias_data[] = {0, 0, 0, 0, 0};  // all-zero bias, sized to bias_shape ({5})
+  float output_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
+                         0.54975945, 1.6815965,  1.2690231,  0.60214907, 1.6158017,  0.42115876, 0.8854959, 1.1709145,
+                         1.0929465,  1.3534508,  1.1985044,  1.2932993,  2.4621446,  1.7086457,  2.6977584, 2.1960166,
+                         2.3769147,  2.3185873,  0.6133741,  0.9687358,  0.9987654,  1.0254729,  0.8368954, 0.74171704,
+                         0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
+                         1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};  // 45 values = 1 * 3 * 3 * 5
+
+  for (auto fp16_enable : {false, true}) {  // run the same data once with fp16 disabled, once enabled
+    auto *param = CreateParameter(kernel_h, kernel_w, stride_h, stride_w, pad_u, pad_d, pad_l, pad_r, dilation_h,
+                                  dilation_w, act_type, input_shape.back());  // presumably the input_channel arg
+    TestMain({{input_shape, input_data, VAR},  // final TestMain arg: presumably the error tolerance - confirm
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-2 : 1e-5);  // 1e-2 when fp16
   }
-
-  // nhwc
-  float16_t input_data[] = {
-    0.5488135, 0.3834415,  0.77815676, 0.9446689,  0.6120957,  0.71518934, 0.79172504, 0.87001216, 0.5218483,
-    0.616934,  0.60276335, 0.5288949,  0.9786183,  0.41466194, 0.94374806, 0.5448832,  0.56804454, 0.7991586,
-    0.2645556, 0.6818203,  0.4236548,  0.92559665, 0.46147937, 0.7742337,  0.3595079,  0.6458941,  0.07103606,
-    0.7805292, 0.45615032, 0.43703195, 0.4375872,  0.0871293,  0.11827443, 0.56843394, 0.6976312,  0.891773,
-    0.0202184, 0.639921,   0.0187898,  0.06022547, 0.96366274, 0.83261985, 0.14335328, 0.6176355,  0.6667667};
-  // float16_t input_data[]={
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  ,
-  //   1  , 1  , 1 , 1  , 1  };
-  // co h w ci
-  float16_t weight_data[] = {
-    0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,  0.10204481,
-    0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958, 0.11037514, 0.6563296,
-    0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
-    0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772, 0.31798318,
-    0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962};
-  // float16_t weight_data[]={
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 ,
-  //   1  , 1  , 1 };
-  // pack correct data, nhwc
-  float16_t gnd_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
-                          0.54975945, 1.6815965,  1.2690231,  0.60214907, 1.6158017,  0.42115876, 0.8854959, 1.1709145,
-                          1.0929465,  1.3534508,  1.1985044,  1.2932993,  2.4621446,  1.7086457,  2.6977584, 2.1960166,
-                          2.3769147,  2.3185873,  0.6133741,  0.9687358,  0.9987654,  1.0254729,  0.8368954, 0.74171704,
-                          0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
-                          1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
-
-  DepthWiseTestMain<float16_t, float16_t>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4,
-                                          kNumberTypeFloat16, true, 1e-2);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) {
-  std::vector<std::vector<int>> src_shape{
-    {1, 32, 112, 112}, {1, 96, 112, 112}, {1, 144, 56, 56}, {1, 144, 56, 56}, {1, 192, 28, 28},
-    {1, 192, 28, 28},  {1, 384, 14, 14},  {1, 576, 14, 14}, {1, 576, 14, 14}, {1, 960, 7, 7},
-  };
-  std::vector<std::vector<int>> dst_shape{
-    {1, 32, 112, 112}, {1, 96, 56, 56},  {1, 144, 56, 56}, {1, 144, 28, 28}, {1, 192, 28, 28},
-    {1, 192, 14, 14},  {1, 384, 14, 14}, {1, 576, 14, 14}, {1, 576, 7, 7},   {1, 960, 7, 7},
-  };
-  std::vector<std::vector<int>> filter_shape{
-    {32, 1, 1, 1},  {96, 3, 3, 1},  {144, 1, 1, 1}, {144, 3, 3, 1}, {192, 1, 1, 1},
-    {192, 3, 3, 1}, {384, 1, 1, 1}, {576, 1, 1, 1}, {576, 3, 3, 1}, {960, 1, 1, 1},
-  };
-
-  // nhwc
-  const size_t in_size = 96 * 112 * 112;
-  float *input_data = new (std::nothrow) float[in_size];
-  if (input_data == nullptr) {
-    return;
-  }
-  memset(input_data, 0, in_size * sizeof(float_t));
-  for (auto i = 0; i < in_size; ++i) {
-    input_data[i] = 1;
-  }
-  // co h w ci
-  const size_t wt_size = 576 * 3 * 3;
-  float *weight_data = new (std::nothrow) float[wt_size];
-  if (weight_data == nullptr) {
-    delete[] input_data;
-    return;
-  }
-  memset(weight_data, 0, wt_size);
-  for (auto i = 0; i < wt_size; ++i) {
-    weight_data[i] = 1;
-  }
-  for (size_t i = 0; i < src_shape.size(); ++i) {
-    const int MAX_RUN_TIMES = 1;
-    for (int j = 0; j < MAX_RUN_TIMES; ++j) {
-      printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n",
-             src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], dst_shape[i][0], dst_shape[i][1],
-             dst_shape[i][2], dst_shape[i][3], j);
-      auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-      {
-        conv_param->input_batch_ = 1;
-        conv_param->input_h_ = src_shape[i][2];
-        conv_param->input_w_ = src_shape[i][3];
-        conv_param->input_channel_ = src_shape[i][1];
-        conv_param->output_batch_ = 1;
-        conv_param->output_h_ = dst_shape[i][2];
-        conv_param->output_w_ = dst_shape[i][3];
-        conv_param->output_channel_ = dst_shape[i][1];
-        conv_param->kernel_h_ = filter_shape[i][1];
-        conv_param->kernel_w_ = filter_shape[i][2];
-        conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
-        conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-        conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
-        conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
-        conv_param->dilation_h_ = 1;
-        conv_param->dilation_w_ = 1;
-      }
-      DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4,
-                                      kNumberTypeFloat32, false);
-    }
-  }
-  delete[] input_data;
-  delete[] weight_data;
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
index c2ce6719b6..e9c7cf4c99 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
@@ -24,7 +24,10 @@ using mindspore::lite::Tensor;
 using mindspore::schema::PrimitiveType_Fill;
 using mindspore::schema::PrimitiveType_Shape;
 using mindspore::schema::Format::Format_NHWC;
-namespace mindspore {
+
+// Parameter population for PrimitiveType_Fill: see src/ops/populate/fill_populate.cc
+
+namespace mindspore::lite::opencl::test {
 class TestFillOpenCLCI : public mindspore::CommonTest {
  public:
   TestFillOpenCLCI() {}
@@ -142,4 +145,4 @@ TEST_F(TestFillOpenCLCI, Fp32testshape) {
   ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
   delete sub_graph;
 }
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
index d6b3b39a56..f48a853e14 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
@@ -13,183 +13,78 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/matmul_parameter.h"
 
-namespace mindspore {
-class TestFullConnectionOpenCL : public mindspore::CommonTest {
- public:
-  TestFullConnectionOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, void *weight_data, void *bias_data,
-                               void *output_data, bool enable_fp16, int dims) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  std::vector<int> input_shape, output_shape, weight_shape, bias_shape;
-  if (dims == 2) {
-    int ci = shape[0];
-    int co = shape[1];
-    input_shape = {1, ci};
-    output_shape = {1, co};
-    weight_shape = {co, ci};
-    bias_shape = {co};
-  } else if (dims == 4) {
-    int n = shape[0];
-    int h = shape[1];
-    int w = shape[2];
-    int ci = shape[3];
-    int co = shape[4];
-    input_shape = {n, h, w, ci};
-    output_shape = {n, co};
-    weight_shape = {co, h * w * ci};
-    bias_shape = {co};
-  }
-  auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
+class TestOpenCL_FullConnection : public CommonTest {};  // fixture for the TEST_F cases below; adds no members
+
+namespace {
+// PrimitiveType_FullConnection: src/ops/populate/full_connection_populate.cc
+OpParameter *CreateParameter(std::vector<int> *input_shape, std::vector<int> *weight_shape,  // shapes: outputs
+                             std::vector<int> *bias_shape, std::vector<int> *output_shape, int ndim, int ci, int co,
+                             int n = 1, int h = 1, int w = 1) {  // n, h, w are read only when ndim == 4
+  auto *param = test::CreateParameter<MatMulParameter>(schema::PrimitiveType_FullConnection);
   param->a_transpose_ = false;
   param->b_transpose_ = true;
   param->has_bias_ = true;
   param->act_type_ = ActType_No;
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-
-  auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     weight_shape, schema::Format_NC);
-  auto tensor_w = tensor_w_ptr.get();
-  if (tensor_w == nullptr) {
-    MS_LOG(ERROR) << "tensor_w create error.";
-    return;
-  }
-  tensor_w->set_data(weight_data);
-
-  auto tensor_bias_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                        bias_shape, schema::Format_NC);
-  auto tensor_bias = tensor_bias_ptr.get();
-  if (tensor_bias == nullptr) {
-    MS_LOG(ERROR) << "tensor_w create error.";
-    return;
-  }
-  tensor_bias->set_data(bias_data);
-
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       output_shape, schema::Format_NC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel = kernel::OpenCLKernelCreator<kernel::FullConnectionOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
-
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
-  }
 
-  for (auto t : inputs) {
-    t->set_data(nullptr);
+  if (ndim == 2) {  // 2-D case: leading dimension fixed to 1, n/h/w ignored
+    *input_shape = {1, ci};
+    *output_shape = {1, co};
+    *weight_shape = {co, ci};  // co rows of ci weights
+    *bias_shape = {co};
+  } else if (ndim == 4) {  // NOTE: any ndim other than 2 or 4 leaves all four shape vectors untouched
+    *input_shape = {n, h, w, ci};
+    *output_shape = {n, co};
+    *weight_shape = {co, h * w * ci};  // h, w and ci collapsed into a single dimension
+    *bias_shape = {co};
   }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-  MS_LOG(INFO) << "TestFullConnection passed";
+  return reinterpret_cast<OpParameter *>(param);  // returned as a pointer to the generic OpParameter type
 }
+}  // namespace
 
-TEST_F(TestFullConnectionOpenCL, FullConnection2DFp32) {
+TEST_F(TestOpenCL_FullConnection, 2D) {  // ndim == 2 path of CreateParameter: {1, ci} input, {1, co} output
+  int ndim = 2;
   int ci = 5;
   int co = 3;
-  std::vector<int> shape = {ci, co};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
-  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> bias_data = {1.0f, 1.0f, 1.0f};
-  std::vector<float> output_data = {11.f, 11.f, 11.f};
-  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
-                            2);
-}
+  float input_data[] = {0, 1, 2, 3, 4};  // ci = 5 values
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};  // co * ci = 15 ones
+  float bias_data[] = {1, 1, 1};  // co = 3 values
+  float output_data[] = {11, 11, 11};  // (0 + 1 + 2 + 3 + 4) + bias 1 = 11 for each of the co outputs
 
-TEST_F(TestFullConnectionOpenCL, FullConnection2DFp16) {
-  int ci = 5;
-  int co = 3;
-  std::vector<int> shape = {ci, co};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
-  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> bias_data = {1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> output_data = {11.f, 11.f, 11.f};
-  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
-                            2);
+  for (auto fp16_enable : {false, true}) {  // run the same data once with fp16 disabled, once enabled
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;  // filled in by CreateParameter
+    auto *param = CreateParameter(&input_shape, &weight_shape, &bias_shape, &output_shape, ndim, ci, co);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);  // no explicit tolerance argument is passed here
+  }
 }
 
-TEST_F(TestFullConnectionOpenCL, FullConnection4DFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 1;
-  int c = 4;
+TEST_F(TestOpenCL_FullConnection, 4D) {  // ndim == 4 path of CreateParameter: {n, h, w, ci} input, {n, co} output
+  int ndim = 4;
+  int ci = 4;
   int co = 2;
-  std::vector<int> shape = {n, h, w, c, co};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> bias_data = {1.0f, 1.0f};
-  std::vector<float> output_data = {29.f, 29.f};
-  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
-                            4);
-}
-
-TEST_F(TestFullConnectionOpenCL, FullConnection4DFp16) {
   int n = 1;
   int h = 2;
   int w = 1;
-  int c = 4;
-  int co = 2;
-  std::vector<int> shape = {n, h, w, c, co};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> bias_data = {1.0f, 1.0f};
-  std::vector<float16_t> output_data = {29.f, 29.f};
-  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
-                            4);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};  // n * h * w * ci = 8 values
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};  // co * (h * w * ci) = 16 ones
+  float bias_data[] = {1, 1};  // co = 2 values
+  float output_data[] = {29, 29};  // (0 + 1 + ... + 7) + bias 1 = 29 for each of the co outputs
+
+  for (auto fp16_enable : {false, true}) {  // run the same data once with fp16 disabled, once enabled
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;  // filled in by CreateParameter
+    auto *param = CreateParameter(&input_shape, &weight_shape, &bias_shape, &output_shape, ndim, ci, co, n, h, w);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);  // no explicit tolerance argument is passed here
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
index cacfec82e1..4db30fa920 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
@@ -13,177 +13,108 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/gather_parameter.h"
 
-namespace mindspore {
-class TestGatherOpenCL : public mindspore::CommonTest {
- public:
-  TestGatherOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-template <typename T>
-void test_main_gather(void *input_data, void *correct_data, const std::vector<int> &input_shape,
-                      const std::vector<int> &indices, GatherParameter *param, TypeId data_type,
-                      schema::Format format) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_wrp = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = ocl_wrp.GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
+class TestOpenCL_Gather : public CommonTest {};
 
-  std::vector<int> indices_shape = {static_cast<int>(indices.size())};
-  std::vector<int> output_shape = input_shape;
-  output_shape[param->axis_] = indices.size();
-
-  auto tensor_a = lite::Tensor(TypeId(data_type), input_shape, format);
-  auto tensor_b = lite::Tensor(kNumberTypeInt32, indices_shape, schema::Format_NC);
-  auto tensor_c = lite::Tensor(TypeId(data_type), output_shape, format);
-  std::vector<lite::Tensor *> inputs{&tensor_a, &tensor_b};
-  std::vector<lite::Tensor *> outputs{&tensor_c};
-  size_t input_size = tensor_a.Size();
-
-  auto *pkernel =
-    new (std::nothrow) kernel::GatherOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pkernel == nullptr) {
-    MS_LOG(INFO) << "new GatherOpenCLKernel failed ";
-    return;
-  }
-  pkernel->Init();
-
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{pkernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&tensor_a}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    delete pkernel;
-    MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
-    return;
-  }
-  sub_graph->Init();
-
-  MS_LOG(INFO) << " init tensors ";
-  memcpy(inputs[0]->data_c(), input_data, input_size);
-  auto input1_tensor = reinterpret_cast<int *>(inputs[1]->data_c());
-  for (int i = 0; i < inputs[1]->ElementsNum(); ++i) {
-    input1_tensor[i] = indices.at(i);
-  }
-  sub_graph->Run();
-
-  std::cout << "==================output data================" << std::endl;
-  auto *output_data = reinterpret_cast<T *>(outputs[0]->data_c());
-  for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << output_data[i] << " ";
-  }
-  std::cout << std::endl;
-  std::cout << "==================expected data================" << std::endl;
-  for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << static_cast<T *>(correct_data)[i] << " ";
-  }
-  std::cout << std::endl;
-  CommonTest::CompareOutputData(output_data, static_cast<T *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
-}
-TEST_F(TestGatherOpenCL, Axis0Fp16) {
-  std::vector<int> input_shape{5, 10, 10, 5};
-  std::vector<int> indices{1, 0, 3, 4};
-  GatherParameter *param = std::make_unique<GatherParameter>().release();
-  param->axis_ = 0;
-  size_t input_size, output_size;
-  std::string inputPpath = "./test_data/gatherfp16_input.bin";
-  std::string correctOutputPath = "./test_data/gatherfp16_output.bin";
-  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(inputPpath.c_str(), &input_size));
-  auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-  if (param == nullptr) {
-    return;
-  }
-  TypeId data_type = kNumberTypeFloat16;
-  schema::Format format = schema::Format_NHWC;
-  test_main_gather<float16_t>(input_data, correct_data, input_shape, indices, param, data_type, format);
+namespace {
+// PrimitiveType_Gather: src/ops/populate/gather_populate.cc
+OpParameter *CreateParameter(int axis) {
+  auto *param = test::CreateParameter<GatherParameter>(schema::PrimitiveType_Gather);
+  param->axis_ = axis;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
+
+TEST_F(TestOpenCL_Gather, Axis0) {
+  int axis = 0;
+  std::vector<int> input_shape = {10};
+  std::vector<int> indices_shape = {2};
+  std::vector<int> output_shape = {2};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  int32_t indices[] = {1, 3};
+  float output_data[] = {1, 3};
 
-TEST_F(TestGatherOpenCL, Axis0Fp32) {
-  std::vector<int> input_shape{5, 10, 10, 5};
-  std::vector<int> indices{1, 2, 3, 4};
-  GatherParameter *param = std::make_unique<GatherParameter>().release();
-  param->axis_ = 0;
-  size_t input_size, output_size;
-  std::string inputPpath = "./test_data/gatherfp32_input.bin";
-  std::string correctOutputPath = "./test_data/gatherfp32_output.bin";
-  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(inputPpath.c_str(), &input_size));
-  auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-  if (param == nullptr) {
-    return;
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain(
+      {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}},
+      {output_shape, output_data}, param, fp16_enable);
   }
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_gather<float>(input_data, correct_data, input_shape, indices, param, data_type, format);
 }
 
-TEST_F(TestGatherOpenCL, Axis1Fp32) {
-  std::vector<int> input_shape{1, 5, 4, 4};
-  std::vector<int> indices{1, 3};
-  GatherParameter *param = reinterpret_cast<GatherParameter *>(malloc(sizeof(GatherParameter)));
-  param->axis_ = 1;
+TEST_F(TestOpenCL_Gather, Axis1) {
+  int axis = 1;
+  std::vector<int> input_shape = {1, 5, 4, 4};
+  std::vector<int> indices_shape = {2};
+  std::vector<int> output_shape = {1, 2, 4, 4};
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                         40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                         60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79};
-  float correct_data[] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                          48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
-  if (param == nullptr) {
-    return;
+  float output_data[] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                         48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+  int32_t indices_int32[] = {1, 3};
+  int64_t indices_int64[] = {1, 3};
+  float32_t indices_fp32[] = {1, 3};
+  float16_t indices_fp16[] = {1, 3};
+  TypeId data_types[] = {kNumberTypeInt32, kNumberTypeInt64, kNumberTypeFloat32, kNumberTypeFloat16};
+  void *indices_datas[] = {indices_int32, indices_int64, indices_fp32, indices_fp16};
+
+  for (int i = 0; i < 1; ++i) {
+    for (auto fp16_enable : {false, true}) {
+      auto *param = CreateParameter(axis);
+      TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+                {indices_shape, indices_datas[i], CONST_TENSOR, data_types[i]}},
+               {output_shape, output_data}, param, fp16_enable);
+    }
   }
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_gather<float>(input_data, correct_data, input_shape, indices, param, data_type, format);
 }
 
-TEST_F(TestGatherOpenCL, Axis2Fp32) {
-  std::vector<int> input_shape{1, 5, 4, 4};
-  std::vector<int> indices{1, 3};
-  GatherParameter *param = std::make_unique<GatherParameter>().release();
-  param->axis_ = 2;
+TEST_F(TestOpenCL_Gather, Axis2) {
+  int axis = 2;
+  std::vector<int> input_shape = {1, 5, 4, 4};
+  std::vector<int> indices_shape = {2};
+  std::vector<int> output_shape = {1, 5, 2, 4};
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                         40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                         60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79};
-  float correct_data[] = {4,  5,  6,  7,  12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31, 36, 37, 38, 39,
-                          44, 45, 46, 47, 52, 53, 54, 55, 60, 61, 62, 63, 68, 69, 70, 71, 76, 77, 78, 79};
-  if (param == nullptr) {
-    return;
+  int32_t indices[] = {1, 3};
+  float output_data[] = {4,  5,  6,  7,  12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31, 36, 37, 38, 39,
+                         44, 45, 46, 47, 52, 53, 54, 55, 60, 61, 62, 63, 68, 69, 70, 71, 76, 77, 78, 79};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain(
+      {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}},
+      {output_shape, output_data}, param, fp16_enable);
   }
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_gather<float>(input_data, correct_data, input_shape, indices, param, data_type, format);
 }
 
-TEST_F(TestGatherOpenCL, Axis3Fp32) {
-  std::vector<int> input_shape{1, 5, 4, 4};
-  std::vector<int> indices{1, 3};
-  GatherParameter *param = std::make_unique<GatherParameter>().release();
-  param->axis_ = 3;
+TEST_F(TestOpenCL_Gather, Axis3) {
+  int axis = 3;
+  std::vector<int> input_shape = {1, 5, 4, 4};
+  std::vector<int> indices_shape = {2};
+  std::vector<int> output_shape = {1, 5, 4, 2};
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                         40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                         60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79};
-  float correct_data[] = {1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
-                          41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79};
-  if (param == nullptr) {
-    return;
+  int32_t indices[] = {1, 3};
+  float output_data[] = {1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
+                         41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain(
+      {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}},
+      {output_shape, output_data}, param, fp16_enable);
   }
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_gather<float>(input_data, correct_data, input_shape, indices, param, data_type, format);
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc
deleted file mode 100644
index 830bc36804..0000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h"
-using mindspore::lite::Tensor;
-using mindspore::schema::Format::Format_NHWC;
-namespace mindspore {
-class TestSwishOpenCLCI : public mindspore::CommonTest {
- public:
-  TestSwishOpenCLCI() {}
-};
-
-TEST_F(TestSwishOpenCLCI, Fp32CI) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape = {2, 10, 1, 4};
-  std::vector<int> output_shape = {2, 10, 1, 4};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  float input_data[] = {2.5f,  6.0f,  -7.4f, -3.5f, 5.9f,  6.5f,  -8.0f, 7.4f,  5.9f,  6.5f,  -8.0f, 7.4f,  7.5f,  6.0f,
-                        -7.4f, -3.5f, 7.5f,  6.0f,  -7.4f, -3.5f, 5.9f,  6.5f,  -8.0f, 7.4f,  5.9f,  6.5f,  -8.0f, 7.4f,
-                        7.5f,  6.0f,  -7.4f, -3.5f, 7.5f,  6.0f,  -7.4f, -3.5f, 5.9f,  6.5f,  -8.0f, 7.4f,  5.9f,  6.5f,
-                        -8.0f, 7.4f,  7.5f,  6.0f,  -7.4f, -3.5f, 7.5f,  6.0f,  -7.4f, -3.5f, 5.9f,  6.5f,  -8.0f, 7.4f,
-                        5.9f,  6.5f,  -8.0f, 7.4f,  7.5f,  6.0f,  -7.4f, -3.5f, 7.5f,  6.0f,  -7.4f, -3.5f, 5.9f,  6.5f,
-                        -8.0f, 7.4f,  5.9f,  6.5f,  -8.0f, 7.4f,  7.5f,  6.0f,  -7.4f, -3.5f};
-
-  float correctOutput[] = {0.9167f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f,
-                           0.0f,    0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f,
-                           1.0f,    1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f,
-                           0.0f,    1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f,
-                           1.0f,    1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f,
-                           0.0f,    1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f};
-  auto output_tensor = Tensor(data_type, input_shape, Format_NHWC, tensor_type);
-  auto in_tensor = Tensor(data_type, output_shape, Format_NHWC, tensor_type);
-  std::vector<lite::Tensor *> inputs{&in_tensor};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
-  }
-
-  auto *hswish_kernel =
-    new (std::nothrow) kernel::HswishOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (hswish_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::HswishOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  hswish_kernel->Init();
-  // to do allocate memory for inputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{hswish_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete hswish_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data, sizeof(input_data));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
-}
-}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
index 61c11ae2a2..c13bcdb210 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@@ -13,169 +13,61 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/matmul_parameter.h"
 
-namespace mindspore {
-class TestMatMulOpenCL : public mindspore::CommonTest {
- public:
-  TestMatMulOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *weight_data, void *output_data,
-                       bool enable_fp16, int dims) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  std::vector<int> input_shape, output_shape, weight_shape;
-  if (dims == 2) {
-    int ci = shape[0];
-    int co = shape[1];
-    input_shape = {1, ci};
-    output_shape = {1, co};
-    weight_shape = {co, ci};
-  } else if (dims == 4) {
-    int a = shape[0];
-    int b = shape[1];
-    int m = shape[2];
-    int ci = shape[3];
-    int co = shape[4];
-    input_shape = {a, b, m, ci};
-    output_shape = {a, b, m, co};
-    weight_shape = {a, b, co, ci};
-  }
-  auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->a_transpose_ = false;
-  param->b_transpose_ = true;
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
+class TestOpenCL_MatMul : public CommonTest {};
 
-  auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     weight_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
-  auto tensor_w = tensor_w_ptr.get();
-  if (tensor_w == nullptr) {
-    MS_LOG(ERROR) << "tensor_w create error.";
-    return;
-  }
-  tensor_w->set_data(weight_data);
-
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), output_shape,
-                                   dims == 2 ? schema::Format_NC : schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel = kernel::OpenCLKernelCreator<kernel::MatMulOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
-
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
-  }
-
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-  MS_LOG(INFO) << "TestMatMul passed";
+namespace {
+// PrimitiveType_MatMul: src/ops/populate/matmul_populate.cc
+OpParameter *CreateParameter(bool a_transpose = false, bool b_transpose = true) {
+  auto *param = test::CreateParameter<MatMulParameter>(schema::PrimitiveType_MatMul);
+  param->a_transpose_ = a_transpose;
+  param->b_transpose_ = b_transpose;
+  param->has_bias_ = false;
+  param->act_type_ = ActType_No;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestMatMulOpenCL, MatMul2DFp32) {
+TEST_F(TestOpenCL_MatMul, 2D) {
   int ci = 5;
   int co = 3;
-  std::vector<int> shape = {ci, co};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
-  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> output_data = {10.f, 10.f, 10.f};
-  RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2);
-}
+  std::vector<int> input_shape = {1, ci};
+  std::vector<int> output_shape = {1, co};
+  std::vector<int> weight_shape = {co, ci};
+  float input_data[] = {0, 1, 2, 3, 4};
+  float weight_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  float output_data[] = {10, 10, 10};
 
-TEST_F(TestMatMulOpenCL, MatMul2DFp16) {
-  int ci = 5;
-  int co = 3;
-  std::vector<int> shape = {ci, co};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
-  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> output_data = {10.f, 10.f, 10.f};
-  RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}}, {output_shape, output_data},
+             param, fp16_enable);
+  }
 }
 
-TEST_F(TestMatMulOpenCL, MatMul4DFp32) {
+TEST_F(TestOpenCL_MatMul, 4D) {
   int a = 1;
   int b = 2;
-  int c = 2;
+  int m = 2;
   int ci = 5;
   int co = 3;
-  std::vector<int> shape = {a, b, c, ci, co};
-  std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> weight_data = {1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f,
-                                    11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
-                                    21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f};
-  std::vector<float> output_data = {15.0f, 40.0f,  65.0f,  15.0f, 40.0f,  65.0f,
-                                    90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f};
-  RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4);
-}
+  std::vector<int> input_shape = {a, b, m, ci};
+  std::vector<int> output_shape = {a, b, m, co};
+  std::vector<int> weight_shape = {a, b, co, ci};
+  float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  float weight_data[] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30};
+  float output_data[] = {15, 40, 65, 15, 40, 65, 90, 115, 140, 90, 115, 140};
 
-TEST_F(TestMatMulOpenCL, MatMul4DFp16) {
-  int a = 1;
-  int b = 2;
-  int c = 2;
-  int ci = 5;
-  int co = 3;
-  std::vector<int> shape = {a, b, c, ci, co};
-  std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                                       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> weight_data = {1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f,
-                                        11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
-                                        21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f};
-  std::vector<float16_t> output_data = {15.0f, 40.0f,  65.0f,  15.0f, 40.0f,  65.0f,
-                                        90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f};
-  RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 4);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}}, {output_shape, output_data},
+             param, fp16_enable);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/one_hot_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/one_hot_tests.cc
index 150cde5620..ed0fc24c1a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/one_hot_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/one_hot_tests.cc
@@ -13,522 +13,592 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
-
-namespace mindspore {
-class TestOneHotOpenCL : public mindspore::CommonTest {
- public:
-  TestOneHotOpenCL() {}
-};
-
-void RunTestCaseOneHot(const std::vector<int> &shape_in, const std::vector<int> &shape_out, void *input_data,
-                       void *output_data, int axis, int depth, float on_value, float off_value) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<OneHotParameter *>(malloc(sizeof(OneHotParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->axis_ = axis;
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(kNumberTypeFloat32, shape_in, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> weight_shape = {};
-  auto tensor_depth_ptr = std::make_unique<lite::Tensor>(kNumberTypeInt32, weight_shape, schema::Format_NHWC);
-  auto tensor_depth = tensor_depth_ptr.get();
-  if (tensor_depth == nullptr) {
-    MS_LOG(ERROR) << "tensor_depth create error.";
-    return;
-  }
-  tensor_depth->set_data(&depth);
-  auto tensor_on_value_ptr = std::make_unique<lite::Tensor>(kNumberTypeFloat32, weight_shape, schema::Format_NHWC);
-  auto tensor_on_value = tensor_on_value_ptr.get();
-  if (tensor_on_value == nullptr) {
-    MS_LOG(ERROR) << "tensor_on_value create error.";
-    return;
-  }
-  tensor_on_value->set_data(&on_value);
-  auto tensor_off_value_ptr = std::make_unique<lite::Tensor>(kNumberTypeFloat32, weight_shape, schema::Format_NHWC);
-  auto tensor_off_value = tensor_off_value_ptr.get();
-  if (tensor_off_value == nullptr) {
-    MS_LOG(ERROR) << "tensor_off_value create error.";
-    return;
-  }
-  tensor_off_value->set_data(&off_value);
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(kNumberTypeFloat32, shape_out);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_depth, tensor_on_value, tensor_off_value};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::OneHotOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/fp32/one_hot_fp32.h"
 
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * sizeof(int));
-  pGraph->Run();
+namespace mindspore::lite::opencl::test {
 
-  CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
+class TestOpenCL_OneHot : public CommonTest {};  // fixture type named by the OneHot TEST_F cases
 
-  MS_LOG(INFO) << "Test OneHot passed";
+namespace {
+// Test helper for PrimitiveType_OneHot (see src/ops/populate/one_hot_populate.cc).
+OpParameter *CreateParameter(int axis) {
+  auto *param = test::CreateParameter<OneHotParameter>(schema::PrimitiveType_OneHot);
+  param->axis_ = axis;  // the only field assigned in this helper
+  return reinterpret_cast<OpParameter *>(param);  // returned through the generic OpParameter pointer type
 }
+}  // namespace
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis3Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis3Fp32) {  // axis = -1: depth is appended as the last output dim
   int depth = 4;
   int axis = -1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {3, 4, -1, 2};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 2, 4} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {3, 4, -1, 2};  // 4 and -1 are outside [0, depth): their rows are all off_value
+  float output_data[] = {-1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1};
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis3T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis3T2Fp32) {  // axis = -1 with depth 5
   int depth = 5;
   int axis = -1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {-1, 3, 4, 5};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 2, 5} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {-1, 3, 4, 5};  // -1 and 5 are outside [0, depth): their rows are all off_value
+  float output_data[] = {-1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis3T3Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis3T3Fp32) {  // axis = -1 with depth 9 on a {1, 2, 3} input
   int depth = 9;
   int axis = -1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 3};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4, 9, 8, 9, 1, 8};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 3};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 3, 9} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4, 9, 8, 9, 1, 8};  // 9 equals depth, so it is out of range: those rows are all off_value
+  float output_data[] = {-1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis3T4Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis3T4Fp32) {  // axis = -1 with depth 6 on a {1, 2, 5} input
   int depth = 6;
   int axis = -1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {2, 4, 0, 6, 1, 6, 2, 2, 4, 5};
-  std::vector<float> output_data = {-1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f,
-                                    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 5};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 5, 6} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {2, 4, 0, 6, 1, 6, 2, 2, 4, 5};  // 6 equals depth, so it is out of range
+  float output_data[] = {-1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, 1,  -1, 1,  -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1,
+                         -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, 1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis2Fp32) {  // axis = 2: depth is inserted before the last input dim
   int depth = 5;
   int axis = 2;
-  float on_value = 2.f;
-  float off_value = 0.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {2, 3, 0, 3};
-  std::vector<float> output_data = {0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f,
-                                    2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = 0;
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 5, 2} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {2, 3, 0, 3};  // every index is inside [0, depth)
+  float output_data[] = {0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis2T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis2T2Fp32) {  // axis = 2 on a {1, 6, 2} input
   int depth = 5;
   int axis = 2;
-  float on_value = 2.f;
-  float off_value = 0.f;
-  std::vector<int> shape_in = {1, 6, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {1, 1, 1, 0, 1, 1, 4, -1, 4, 4, -1, 1};
-  std::vector<float> output_data = {0.0f, 0.0f, 2.0f, 2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f,
-                                    2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 2.0f,
-                                    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-                                    0.0f, 0.0f, 2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-                                    2.0f, 2.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = 0;
+  std::vector<int> input_shape = {1, 6, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 6, 5, 2} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {1, 1, 1, 0, 1, 1, 4, -1, 4, 4, -1, 1};  // -1 is outside [0, depth)
+  float output_data[] = {0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis2T3Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis2T3Fp32) {  // axis = 2 with depth 1
   int depth = 1;
   int axis = 2;
-  float on_value = 2.f;
-  float off_value = 0.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {-1, 1, -1, 0};
-  std::vector<float> output_data = {0.0f, 0.0f, 0.0f, 2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = 0;
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 1, 2} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {-1, 1, -1, 0};  // with depth 1 only index 0 is in range
+  float output_data[] = {0, 0, 0, 2};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis2T4Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis2T4Fp32) {  // axis = 2 on a {1, 2, 5} input
   int depth = 5;
   int axis = 2;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4, 0, -1, 2, 5, 4, -1, 4, 4, 4};
-  std::vector<float> output_data = {-1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, 1.0f,  1.0f,  1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 5};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 2, 5, 5} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4, 0, -1, 2, 5, 4, -1, 4, 4, 4};  // -1 and 5 are outside [0, depth)
+  float output_data[] = {-1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1,
+                         -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, 1,  1,  1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis1T1Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis1T1Fp32) {  // axis = 1 with depth 1 on a {1, 6, 6} input
   int depth = 1;
   int axis = 1;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {1, 6, 6};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {0,  -1, 1, 0, -1, -1, 0, 0,  -1, 1, 0, -1, -1, 1, 1, -1, 1, 1,
-                                 -1, 1,  1, 1, -1, 0,  0, -1, 0,  0, 1, 1,  1,  1, 0, 0,  0, -1};
-  std::vector<float> output_data = {2.0f,  -2.0f, -2.0f, 2.0f,  -2.0f, -2.0f, 2.0f,  2.0f,  -2.0f, -2.0f, 2.0f,  -2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,
-                                    2.0f,  -2.0f, 2.0f,  2.0f,  -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  2.0f,  2.0f,  -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;  // expected wherever the input index is 0, the only value inside [0, depth)
+  float off_value = -2;
+  std::vector<int> input_shape = {1, 6, 6};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 1, 6, 6} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {0,  -1, 1, 0, -1, -1, 0, 0,  -1, 1, 0, -1, -1, 1, 1, -1, 1, 1,
+                      -1, 1,  1, 1, -1, 0,  0, -1, 0,  0, 1, 1,  1,  1, 0, 0,  0, -1};
+  float output_data[] = {2,  -2, -2, 2,  -2, -2, 2, 2,  -2, -2, 2,  -2, -2, -2, -2, -2, -2, -2,
+                         -2, -2, -2, -2, -2, 2,  2, -2, 2,  2,  -2, -2, -2, -2, 2,  2,  2,  -2};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis1T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis1T2Fp32) {  // axis = 1 with depth 4
   int depth = 4;
   int axis = 1;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {-1, 1, 1, 2};
-  std::vector<float> output_data = {-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  2.0f,  -2.0f,
-                                    -2.0f, -2.0f, -2.0f, 2.0f,  -2.0f, -2.0f, -2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 4, 2, 2} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {-1, 1, 1, 2};  // -1 is outside [0, depth)
+  float output_data[] = {-2, -2, -2, -2, -2, 2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis1T3Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis1T3Fp32) {  // axis = 1 on a {1, 2, 5} input
   int depth = 5;
   int axis = 1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {3, 5, 2, 0, 2, 2, -1, 0, 4, 3};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, 1.0f,  -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f,
-                                    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 5};
+  std::vector<int> output_shape = input_shape;  // becomes {1, 5, 2, 5} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {3, 5, 2, 0, 2, 2, -1, 0, 4, 3};  // 5 and -1 are outside [0, depth)
+  float output_data[] = {-1, -1, -1, 1,  -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, 1,  -1, 1,  1,  -1, -1, -1, -1, 1,  -1, -1, -1,
+                         -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis0Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis0Fp32) {  // axis = 0: depth becomes the leading output dim
   int depth = 5;
   int axis = 0;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {1, 2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4, 0, 3, 3};
-  std::vector<float> output_data = {-2.0f, 2.0f,  -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  2.0f,  2.0f,  -2.0f, -2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {1, 2, 2};
+  std::vector<int> output_shape = input_shape;  // becomes {5, 1, 2, 2} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4, 0, 3, 3};  // every index is inside [0, depth)
+  float output_data[] = {-2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, -2, -2, -2};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis0T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis0T2Fp32) {  // axis = 0 on a {1, 2, 5} input
   int depth = 5;
   int axis = 0;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {1, 2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {2, 4, 4, 3, 5, 0, 3, 3, -1, 2};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,
-                                    -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f,
-                                    -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {1, 2, 5};
+  std::vector<int> output_shape = input_shape;  // becomes {5, 1, 2, 5} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {2, 4, 4, 3, 5, 0, 3, 3, -1, 2};  // 5 and -1 are outside [0, depth)
+  float output_data[] = {-1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, 1,
+                         -1, -1, 1,  1,  -1, -1, -1, 1,  1,  -1, -1, -1, -1, -1, -1, -1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot4DAxis0T3Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot4DAxis0T3Fp32) {  // axis = 0 on a {2, 2, 5} input
   int depth = 5;
   int axis = 0;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {2, 2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {0, 3, 2, 0, 0, 3, 4, 1, 5, 1, 4, -1, 3, 3, 1, 1, 4, 2, 2, 4};
-  std::vector<float> output_data = {
-    1.0f,  -1.0f, -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, 1.0f,
-    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f,
-    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  1.0f,  -1.0f,
-    -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  1.0f,  -1.0f,
-    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f,
-    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, 1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {2, 2, 5};
+  std::vector<int> output_shape = input_shape;  // becomes {5, 2, 2, 5} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {0, 3, 2, 0, 0, 3, 4, 1, 5, 1, 4, -1, 3, 3, 1, 1, 4, 2, 2, 4};  // 5 and -1 are out of range
+  float output_data[] = {1,  -1, -1, 1,  1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, -1, 1,  -1, 1,  -1, -1, -1, -1, 1,  1,  -1, -1, -1, -1,
+                         -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  1,  -1,
+                         -1, 1,  -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, 1,  1,  -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, 1,  -1, -1, -1, -1, -1, 1,  -1, -1, 1};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis0Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis0Fp32) {  // axis = 0 on a 2-D {2, 3} input, giving a 3-D output
   int depth = 5;
   int axis = 0;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {2, 3};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4, 4, 3, 2, -1, 5};
-  std::vector<float> output_data = {-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  -2.0f, -2.0f, -2.0f, -2.0f,
-                                    2.0f,  -2.0f, -2.0f, -2.0f, 2.0f,  2.0f,  -2.0f, -2.0f, -2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {2, 3};
+  std::vector<int> output_shape = input_shape;  // becomes {5, 2, 3} after the insert below
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4, 4, 3, 2, -1, 5};  // -1 and 5 are outside [0, depth)
+  float output_data[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                         2,  -2, -2, -2, -2, 2,  -2, -2, -2, 2,  2,  -2, -2, -2, -2};
+
+  for (auto fp16_enable : {false}) {  // fp32 only: fp16 is not in the list
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},  // NOTE(review): int data typed as Float32; confirm
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis0T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis0T2Fp32) {  // {2,5} indices, depth 5 at axis 0 -> {5,2,5} output
   int depth = 5;
   int axis = 0;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4, 2, 2, 3, -1, 5, 2, 4, 5, -1};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {2, 5};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4, 2, 2, 3, -1, 5, 2, 4, 5, -1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, 1,  1,  -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, 1,
+                         -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, 1,  -1, -1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis1Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis1Fp32) {  // {2,3} indices, depth 5 at axis 1 -> {2,5,3} output
   int depth = 5;
   int axis = 1;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {2, 3};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {0, 0, 0, 0, 4, -1};
-  std::vector<float> output_data = {2.0f,  2.0f,  2.0f,  -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  -2.0f, -2.0f, -2.0f, -2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {2, 3};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {0, 0, 0, 0, 4, -1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {2, 2,  2,  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                         2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 2,  -2};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis1T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis1T2Fp32) {  // {2,5} indices, depth 5 at axis 1 -> {2,5,5} output
   int depth = 5;
   int axis = 1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {1, -1, 3, 2, 5, 5, 4, 5, 0, -1};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {2, 5};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {1, -1, 3, 2, 5, 5, 4, 5, 0, -1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1,
+                         1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis2Fp32) {  // {2,2} indices, depth 4 at axis 2 -> {2,2,4} output
   int depth = 4;
   int axis = 2;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {2, 2};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {0, 3, 4, 2};
-  std::vector<float> output_data = {2.0f,  -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,
-                                    -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, 2.0f,  -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {2, 2};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {0, 3, 4, 2};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {2, -2, -2, -2, -2, -2, -2, 2, -2, -2, -2, -2, -2, -2, 2, -2};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot3DAxis2T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot3DAxis2T2Fp32) {  // {2,5} indices, depth 5 at axis 2 -> {2,5,5} output
   int depth = 5;
   int axis = 2;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {2, 5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {0, -1, 2, -1, 5, 4, 2, -1, 4, -1};
-  std::vector<float> output_data = {1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,
-                                    -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {2, 5};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {0, -1, 2, -1, 5, 4, 2, -1, 4, -1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {1,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1,
+                         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, 1,  -1,
+                         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot2DAxis0Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot2DAxis0Fp32) {  // {3} indices, depth 3 at axis 0 -> {3,3} output
   int depth = 3;
   int axis = 0;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {3};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {2, 1, 3};
-  std::vector<float> output_data = {-2.0f, -2.0f, -2.0f, -2.0f, 2.0f, -2.0f, 2.0f, -2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {3};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {2, 1, 3};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-2, -2, -2, -2, 2, -2, 2, -2, -2};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot2DAxis0T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot2DAxis0T2Fp32) {  // {5} indices, depth 5 at axis 0 -> {5,5} output
   int depth = 5;
   int axis = 0;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {2, 2, 0, 0, 4};
-  std::vector<float> output_data = {-1.0f, -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, 1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {5};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {2, 2, 0, 0, 4};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot2DAxis1Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot2DAxis1Fp32) {  // {3} indices, depth 3, axis -1 (insert index 1) -> {3,3} output
   int depth = 3;
   int axis = -1;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {3};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {1, 2, 0};
-  std::vector<float> output_data = {-2.0f, 2.0f, -2.0f, -2.0f, -2.0f, 2.0f, 2.0f, -2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {3};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {1, 2, 0};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-2, 2, -2, -2, -2, 2, 2, -2, -2};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot2DAxis1T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot2DAxis1T2Fp32) {  // {5} indices, depth 5, axis -1 (insert index 1) -> {5,5} output
   int depth = 5;
   int axis = -1;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {5};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {5, 4, 0, 4, -1};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    1.0f,  1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
-                                    -1.0f, 1.0f,  -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {5};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {5, 4, 0, 4, -1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, 1,  1,  -1, -1,
+                         -1, -1, -1, -1, -1, -1, 1,  -1, -1, -1, -1, -1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot1DAxis0Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot1DAxis0Fp32) {  // empty input shape, depth 3, axis -1 (insert index 0) -> {3} output
   int depth = 3;
   int axis = -1;
-  float on_value = 2.f;
-  float off_value = -2.f;
-  std::vector<int> shape_in = {};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {1};
-  std::vector<float> output_data = {-2.0f, 2.0f, -2.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 2;
+  float off_value = -2;
+  std::vector<int> input_shape = {};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {1};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-2, 2, -2};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestOneHotOpenCL, OneHot1DAxis0T2Fp32) {
+TEST_F(TestOpenCL_OneHot, OneHot1DAxis0T2Fp32) {  // empty input shape, depth 5 at axis 0 -> {5} output
   int depth = 5;
   int axis = 0;
-  float on_value = 1.f;
-  float off_value = -1.f;
-  std::vector<int> shape_in = {};
-  std::vector<int> shape_out = shape_in;
-  shape_out.insert(shape_out.begin() + (axis + shape_in.size() + 1) % (shape_in.size() + 1), depth);
-  std::vector<int> input_data = {4};
-  std::vector<float> output_data = {-1.0f, -1.0f, -1.0f, -1.0f, 1.0f};
-
-  RunTestCaseOneHot(shape_in, shape_out, input_data.data(), output_data.data(), axis, depth, on_value, off_value);
+  float on_value = 1;
+  float off_value = -1;
+  std::vector<int> input_shape = {};
+  std::vector<int> output_shape = input_shape;  // depth is inserted at the resolved axis on the next line
+  output_shape.insert(output_shape.begin() + (axis + input_shape.size() + 1) % (input_shape.size() + 1), depth);
+  int input_data[] = {4};  // NOTE(review): int[] but declared Float32 below - confirm
+  float output_data[] = {-1, -1, -1, -1, 1};
+
+  for (auto fp16_enable : {false}) {  // only fp16_enable == false is exercised
+    auto *param = CreateParameter(axis);  // created inside the loop, once per TestMain call
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
+              {{}, &depth, CONST_SCALAR, kNumberTypeInt32},
+              {{}, &on_value, CONST_SCALAR, kNumberTypeFloat32},
+              {{}, &off_value, CONST_SCALAR, kNumberTypeFloat32}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/opencl_kernel_tests.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/opencl_kernel_tests.h
deleted file mode 100644
index eb36b5c9c8..0000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/opencl_kernel_tests.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "common/common_test.h"
-#include "src/common/log_adapter.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-
-#ifndef TESTS_UT_OPENCL_KERNLE_TESTS_H
-#define TESTS_UT_OPENCL_KERNLE_TESTS_H
-
-namespace mindspore {
-
-class TestOpenCLKernel : public mindspore::CommonTest {
- public:
-  TestOpenCLKernel() {}
-};
-
-}  // namespace mindspore
-#endif  // TESTS_UT_OPENCL_KERNLE_TESTS_H
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/pad_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pad_tests.cc
index cae15c12f3..f22ba50b70 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/pad_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pad_tests.cc
@@ -13,155 +13,221 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h"
-#include "nnacl/pack.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/pad_parameter.h"
 
-using mindspore::kernel::LiteKernel;
-using mindspore::kernel::PadOpenCLKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
-using mindspore::lite::Tensor;
-using mindspore::schema::Format;
-using mindspore::schema::Format_NC4HW4;
-using mindspore::schema::Format_NHWC;
-using mindspore::schema::Format_NHWC4;
-using mindspore::schema::NodeType_ValueNode;
-using mindspore::schema::PaddingMode;
-using mindspore::schema::PaddingMode_CONSTANT;
-using mindspore::schema::PaddingMode_REFLECT;
-using mindspore::schema::PaddingMode_SYMMETRIC;
+namespace mindspore::lite::opencl::test {
 
-namespace mindspore {
+class TestOpenCL_Pad : public CommonTest {};  // empty fixture used by the TEST_F(TestOpenCL_Pad, ...) cases below
 
-class TestPadOpenCL : public mindspore::CommonTest {};
-
-void TEST_MAIN(PadParameter *param, Format input_format, Format output_format, Format op_format, const TypeId data_type,
-               const std::vector<int> &input_shape, const std::vector<int> &output_shape, const float *input_data,
-               const float *expect_data) {
-  auto ocl_runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = ocl_runtime_wrapper.GetInstance();
-  ocl_runtime->Init();
-  ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16);
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(DEBUG) << "create Tensors";
-  auto input = Tensor(kNumberTypeFloat32, input_shape, input_format, lite::Tensor::CONST_TENSOR);
-  auto output = Tensor(kNumberTypeFloat32, output_shape, output_format, lite::Tensor::CONST_TENSOR);
-
-  MS_LOG(DEBUG) << "create OpenCL Kernel";
-  std::vector<lite::Tensor *> inputs{&input};
-  std::vector<lite::Tensor *> outputs{&output};
-  auto kernel = std::make_unique<PadOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (kernel == nullptr) {
-    return;
+namespace {
+// PrimitiveType_Pad: src/ops/populate/pad_populate.cc. Constant-mode pad; paddings are right-aligned, rest zeroed.
+OpParameter *CreateParameter(const std::vector<int> &paddings, float constant_value) {
+  auto *param = test::CreateParameter<PadParameter>(schema::PrimitiveType_Pad);
+  param->pad_mode_ = schema::PaddingMode_CONSTANT;
+  param->constant_value_ = constant_value;
+  param->padding_length = MAX_PAD_SIZE;  // every slot is written below; entries beyond MAX_PAD_SIZE are ignored
+  const int size = paddings.size() > MAX_PAD_SIZE ? MAX_PAD_SIZE : static_cast<int>(paddings.size());
+  for (int i = 0; i < MAX_PAD_SIZE - size; ++i) {  // zero the leading slots the caller did not supply
+    param->paddings_[i] = 0;
   }
-  kernel->Init();
-
-  MS_LOG(DEBUG) << "create SubGraph";
-  std::vector<kernel::LiteKernel *> kernels{kernel.release()};
-  auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input}, {&output}, kernels, kernels, kernels);
-  input.MallocData(allocator);
-  sub_graph->Init();
-  memcpy(input.data_c(), input_data, input.Size());
-  sub_graph->Run();
-  if (CommonTest::CompareOutputData(reinterpret_cast<float *>(output.data_c()), const_cast<float *>(expect_data),
-                                    static_cast<size_t>(output.ElementsNum()))) {
-    FAIL();
-  } else {
-    std::cout << "COMPARE SUCCESS!\n";
+  for (int i = 0; i < size; i++) {  // copy the supplied values into the tail of paddings_
+    param->paddings_[MAX_PAD_SIZE - size + i] = paddings[i];
   }
-
-  MS_LOG(DEBUG) << "release resources";
-  input.set_data(nullptr);
-  output.set_data(nullptr);
-  delete sub_graph;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestPadOpenCL, TestPad3) {
-  auto param = static_cast<PadParameter *>(malloc(sizeof(PadParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "PadParameter create error.";
-    return;
+TEST_F(TestOpenCL_Pad, 1D) {  // {4} input padded by 3 before / 2 after with value 2 -> {9} output
+  float input_data[] = {1, 1, 1, 1};
+  float output_data[] = {2, 2, 2, 1, 1, 1, 1, 2, 2};
+  for (auto fp16_enable : {false, true}) {  // run once with fp16_enable false, once with true
+    auto *param = CreateParameter({3, 2}, 2);  // paddings {3, 2}, constant_value 2
+    TestMain({{{4}, input_data, VAR}}, {{9}, output_data}, param, fp16_enable);
   }
-  param->pad_mode_ = PaddingMode_CONSTANT;
-  param->constant_value_ = 0.0f;
-  param->padding_length = MAX_PAD_SIZE;
-  int paddings[MAX_PAD_SIZE] = {0, 0, 3, 3, 3, 3, 0, 0};
-  memcpy(param->paddings_, paddings, sizeof(paddings));
-
-  float input_data[48] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0,
-                          12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
-                          24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0,
-                          36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0};
-  float expect_data[300] = {
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  12.0, 13.0, 14.0, 15.0,
-    16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  36.0,
-    37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,
-    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0};
+}
 
-  TEST_MAIN(param, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, {1, 4, 4, 3}, {1, 10, 10, 3}, input_data,
-            expect_data);
-  TEST_MAIN(param, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat32, {1, 4, 4, 3}, {1, 10, 10, 3},
-            input_data, expect_data);
-  TEST_MAIN(param, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, {1, 4, 4, 3}, {1, 10, 10, 3}, input_data,
-            expect_data);
-  TEST_MAIN(param, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat16, {1, 4, 4, 3}, {1, 10, 10, 3},
-            input_data, expect_data);
+TEST_F(TestOpenCL_Pad, 2D) {  // {2,5} input, rows padded 1/1 and columns 1/2 with value 10 -> {4,8} output
+  float input_data[] = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
+  float output_data[] = {10, 10, 10, 10, 10, 10, 10, 10, 10, 1,  1,  1,  1,  1,  10, 10,
+                         10, 2,  2,  2,  2,  2,  10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
+  for (auto fp16_enable : {false, true}) {  // run once with fp16_enable false, once with true
+    auto *param = CreateParameter({1, 1, 1, 2}, 10);  // paddings {1, 1, 1, 2}, constant_value 10
+    TestMain({{{2, 5}, input_data, VAR}}, {{4, 8}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestPadOpenCL, TestPad4) {
-  auto param = static_cast<PadParameter *>(malloc(sizeof(PadParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "PadParameter create error.";
-    return;
+TEST_F(TestOpenCL_Pad, 4D) {
+  float input_data[48] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                          16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                          32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47};
+  float output_data[300] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 0, 3, 3, 3, 3, 0, 0}, 0);
+    TestMain({{{1, 4, 4, 3}, input_data, VAR}}, {{1, 10, 10, 3}, output_data}, param, fp16_enable);
   }
-  param->pad_mode_ = PaddingMode_CONSTANT;
-  param->constant_value_ = 1.0f;
-  param->padding_length = MAX_PAD_SIZE;
-  int paddings[MAX_PAD_SIZE] = {0, 0, 3, 3, 3, 3, 0, 0};
-  memcpy(param->paddings_, paddings, sizeof(paddings));
 
-  float input_data[48] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0,
-                          12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
-                          24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0,
-                          36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0};
-  float expect_data[300] = {
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  12.0, 13.0, 14.0, 15.0,
-    16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  36.0,
-    37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,
-    1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0};
+  float output_data1[] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 0, 3, 3, 3, 3, 0, 0}, 1);
+    TestMain({{{1, 4, 4, 3}, input_data, VAR}}, {{1, 10, 10, 3}, output_data1}, param, fp16_enable);
+  }
+}
+
+TEST_F(TestOpenCL_Pad, test0) {  // table-driven; each case runs with fp16_enable off and on
+  std::vector<std::tuple<std::string, std::vector<int>, std::vector<int>, std::vector<float>, std::vector<float>,
+                         std::vector<int>, float>>
+    cases = {  // fields: name, input shape, output shape, input data, expected output, paddings, constant value
+      {"SimpleConstTest",
+       {1, 2, 2, 1},
+       {3, 2, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {1, 1, 0, 0, 1, 1, 0, 0},
+       0},
+      {"SimpleConstImageStyleTest",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       0},
+      {"SimpleConst1DTest", {2}, {5}, {2, 3}, {0, 2, 3, 0, 0}, {1, 2}, 0},
+      {"SimpleDynamicTest",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       0},
+      {"AdvancedConstTest",
+       {1, 2, 3, 1},
+       {2, 4, 6, 1},
+       {1, 2, 3, 4, 5, 6},
+       {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 2, 3, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {1, 0, 0, 2, 0, 3, 0, 0},
+       0},
+      {"AdvancedConstImageStyleTest",
+       {1, 2, 3, 1},
+       {1, 4, 7, 1},
+       {1, 2, 3, 4, 5, 6},
+       {0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {0, 0, 0, 2, 1, 3, 0, 0},
+       0},
+      {"AdvancedDynamicTest",
+       {1, 2, 3, 1},
+       {1, 4, 7, 1},
+       {1, 2, 3, 4, 5, 6},
+       {0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {0, 0, 0, 2, 1, 3, 0, 0},
+       0},
+      {"SimpleConstTestUint8",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       0},
+      {"SimpleConstTestInt8",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       0},
+      {"SimpleConstFloat32ValuedTestUint8",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4, 5, 5, 5, 5, 5},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       5},
+      {"SimpleConstFloat32ValuedTestInt8",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4, 5, 5, 5, 5, 5},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       5},
+      {"Simple4DConstFloat32ValuedTest",
+       {1, 1, 2, 1},
+       {2, 1, 2, 2},
+       {3, 3},
+       {3, 5, 3, 5, 5, 5, 5, 5},
+       {0, 1, 0, 0, 0, 0, 0, 1},
+       5},
+      {"SimpleConstInt32ValuedTest",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4, 5, 5, 5, 5, 5},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       5},
+      {"SimpleDynamicTest",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       0},
+      {"SimpleDynamicValuedTest",
+       {1, 2, 2, 1},
+       {1, 4, 4, 1},
+       {1, 2, 3, 4},
+       {5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4, 5, 5, 5, 5, 5},
+       {0, 0, 1, 1, 1, 1, 0, 0},
+       5},
+      {"AdvancedConstTest",
+       {1, 2, 3, 1},
+       {1, 4, 7, 1},
+       {1, 2, 3, 4, 5, 6},
+       {0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {0, 0, 0, 2, 1, 3, 0, 0},
+       0},
+      {"AdvancedDynamicTest",
+       {1, 2, 3, 1},
+       {1, 4, 7, 1},
+       {1, 2, 3, 4, 5, 6},
+       {0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+       {0, 0, 0, 2, 1, 3, 0, 0},
+       0},
+    };
 
-  TEST_MAIN(param, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, {1, 4, 4, 3}, {1, 10, 10, 3}, input_data,
-            expect_data);
+  for (auto &case_ : cases) {
+    auto &name = std::get<0>(case_);
+    auto &input_shape = std::get<1>(case_);
+    auto &output_shape = std::get<2>(case_);
+    auto input_data = std::get<3>(case_).data();  // raw float pointer into this case's own input vector
+    auto output_data = std::get<4>(case_).data();  // raw float pointer into this case's expected-output vector
+    auto &paddings = std::get<5>(case_);
+    auto constant_value = std::get<6>(case_);
+    std::cout << name << std::endl;  // prints the case name before the case is run
+    for (auto fp16_enable : {false, true}) {
+      auto *param = CreateParameter(paddings, constant_value);  // a new parameter is created for every run
+      TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+    }
+  }
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
index 593a970559..9fd3991f6f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
@@ -13,175 +13,56 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
-
-namespace mindspore {
-
-class TestPoolingOpenCL : public mindspore::CommonTest {};
-
-void InitPoolingParam(PoolingParameter *param) {
-  param->input_batch_ = 1;
-  param->input_h_ = 2;
-  param->input_w_ = 2;
-  param->input_channel_ = 4;
-
-  param->output_batch_ = 1;
-  param->output_h_ = 1;
-  param->output_w_ = 1;
-  param->output_channel_ = 4;
-
-  param->window_h_ = 2;
-  param->window_w_ = 2;
-
-  param->stride_h_ = 2;
-  param->stride_w_ = 2;
-
-  param->pad_u_ = 0;
-  param->pad_d_ = 0;
-  param->pad_l_ = 0;
-  param->pad_r_ = 0;
-}
-
-void RunTestCasePooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
-                        PoolMode pool_mode) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  int n = shape[0];
-  int h = shape[1];
-  int w = shape[2];
-  int c = shape[3];
-  int oh = shape[4];
-  int ow = shape[5];
-  auto param = static_cast<PoolingParameter *>(malloc(sizeof(PoolingParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param create error.";
-    return;
-  }
-  InitPoolingParam(param);
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/pooling_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_Pooling : public CommonTest {};
+
+namespace {
+// PrimitiveType_Pooling: src/ops/populate/pooling_populate.cc
+OpParameter *CreateParameter(PoolMode pool_mode, int window_h, int window_w, int stride_h, int stride_w, int pad_u,
+                             int pad_d, int pad_l, int pad_r, RoundMode round_mode = RoundMode_No,
+                             ActType act_type = ActType_No) {  // fills a PoolingParameter for the tests
+  auto *param = test::CreateParameter<PoolingParameter>(schema::PrimitiveType_Pooling);
+  param->global_ = false;  // window and stride below are always set explicitly from the arguments
+  param->window_w_ = window_w;
+  param->window_h_ = window_h;
+  param->pad_u_ = pad_u;
+  param->pad_d_ = pad_d;
+  param->pad_l_ = pad_l;
+  param->pad_r_ = pad_r;
+  param->stride_w_ = stride_w;
+  param->stride_h_ = stride_h;
+  param->avg_mode_ = 0;  // fixed here; not configurable through this helper
   param->pool_mode_ = pool_mode;
-  std::vector<int> input_shape = {n, h, w, c};
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> out_shape = {n, oh, ow, c};
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::PoolingOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-
-  MS_LOG(INFO) << "Test AvgPool2d passed";
-}
-
-TEST_F(TestPoolingOpenCL, AvgPoolingFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
-
-  RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_AvgPool);
+  param->round_mode_ = round_mode;  // defaults to RoundMode_No
+  param->act_type_ = act_type;  // defaults to ActType_No
+  return reinterpret_cast<OpParameter *>(param);  // handed back as the generic OpParameter pointer
 }
-
-TEST_F(TestPoolingOpenCL, AvgPoolingFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float16_t> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
-
-  RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_AvgPool);
+}  // namespace
+
+TEST_F(TestOpenCL_Pooling, Avg) {  // 2x2 window, stride 2, no padding: one mean per channel
+  float src[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  float expect[] = {6, 7, 8, 9};
+  std::vector<int> src_shape = {1, 2, 2, 4};
+  std::vector<int> expect_shape = {1, 1, 1, 4};
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *param = CreateParameter(PoolMode_AvgPool, 2, 2, 2, 2, 0, 0, 0, 0);
+    TestMain({{src_shape, src, VAR}}, {expect_shape, expect}, param, use_fp16);
+  }
 }
 
-TEST_F(TestPoolingOpenCL, MaxPoolingFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
-
-  RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_MaxPool);
+TEST_F(TestOpenCL_Pooling, Max) {  // 2x2 window, stride 2, no padding: one maximum per channel
+  float src[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  float expect[] = {12, 13, 14, 15};
+  std::vector<int> src_shape = {1, 2, 2, 4};
+  std::vector<int> expect_shape = {1, 1, 1, 4};
+  for (bool use_fp16 : {false, true}) {
+    OpParameter *param = CreateParameter(PoolMode_MaxPool, 2, 2, 2, 2, 0, 0, 0, 0);
+    TestMain({{src_shape, src, VAR}}, {expect_shape, expect}, param, use_fp16);
+  }
 }
 
-TEST_F(TestPoolingOpenCL, MaxPoolingFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
-
-  RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_MaxPool);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
index 2308c0ebc8..55652d1fea 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
@@ -21,10 +21,12 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/power.h"
 
+// PrimitiveType_Power: src/ops/populate/power_populate.cc
+
 using mindspore::lite::Tensor;
 using mindspore::schema::Format::Format_NHWC;
-namespace mindspore {
-class TestPowerOpenCLCI : public mindspore::CommonTest {
+namespace mindspore::lite::opencl::test {
+class TestPowerOpenCLCI : public CommonTest {
  public:
   TestPowerOpenCLCI() {}
 };
@@ -166,4 +168,4 @@ TEST_F(TestPowerOpenCLCI, broadcast) {
   TEST_MAIN(input_data1, input_data1, expect_data, data_type, shape_a, shape_b, output_shape, true);
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
index 9977a5e4ef..96707e4fa5 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
@@ -30,8 +30,10 @@ using mindspore::kernel::SubGraphOpenCLKernel;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
-namespace mindspore {
-class TestPReluOpenCL : public mindspore::CommonTest {};
+// PrimitiveType_PReLU: src/ops/populate/p_relu_populate.cc
+
+namespace mindspore::lite::opencl::test {
+class TestPReluOpenCL : public CommonTest {};
 
 void LoadDataPRelu(void *dst, size_t dst_size, const std::string &file_path) {
   if (file_path.empty()) {
@@ -193,4 +195,4 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
   delete param;
   delete sub_graph;
 }
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
index 1eb894e9a8..05d10ca76f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
@@ -13,701 +13,80 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
-
-namespace mindspore {
-class TestReduceOpenCL : public mindspore::CommonTest {
- public:
-  TestReduceOpenCL() {}
-};
-
-void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
-                       int reduce_mode, bool WC = false) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<ReduceParameter *>(malloc(sizeof(ReduceParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->axes_[0] = 1;
-  param->axes_[1] = 2;
-  if (WC) {
-    param->axes_[0] = 2;
-    param->axes_[1] = 3;
-    param->keep_dims_ = true;
-  }
-  param->num_axes_ = 2;
-  param->mode_ = reduce_mode;
-  int n = shape[0];
-  int h = shape[1];
-  int w = shape[2];
-  int c = shape[3];
-  std::vector<int> input_shape = {n, h, w, c};
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> out_shape = {n, c};
-  if (WC) {
-    out_shape = {n, h, 1, 1};
-  }
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, WC ? schema::Format_NHWC : schema::Format_NC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReduceOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-3));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/reduce_parameter.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_Reduce : public CommonTest {};
+
+namespace {
+// PrimitiveType_Reduce:    src/ops/populate/reduce_populate.cc
+// PrimitiveType_Mean:      src/ops/populate/mean_populate.cc
+OpParameter *CreateParameter(const std::vector<int> &axis, schema::ReduceMode mode, bool keep_dims) {
+  auto *param = test::CreateParameter<ReduceParameter>(schema::PrimitiveType_Reduce);
+  param->keep_dims_ = keep_dims;
+  param->reduce_to_end_ = false;
+  param->coeff = 0.f;
+  param->num_axes_ = axis.size();
+  param->mode_ = mode;
+  for (int i = 0; i < axis.size(); ++i) {
+    param->axes_[i] = axis[i];
   }
-
-  MS_LOG(INFO) << "Test Reduce passed";
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {4.5f, 5.5f, 6.5f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float16_t> output_data = {4.5f, 5.5f, 6.5f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    3.0f, 1.0f, 6.0f, 8.0f, 6.0f, 4.0f, 1.0f, 3.0f, 5.0f, 4.0f, 5.0f, 4.0f, 0.0f, 2.0f, 4.0f, 1.0f, 3.0f,
-    1.0f, 6.0f, 5.0f, 4.0f, 7.0f, 0.0f, 7.0f, 1.0f, 2.0f, 5.0f, 0.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 8.0f,
-    5.0f, 7.0f, 6.0f, 2.0f, 5.0f, 3.0f, 2.0f, 9.0f, 1.0f, 0.0f, 2.0f, 0.0f, 6.0f, 0.0f, 3.0f, 6.0f, 0.0f,
-    7.0f, 1.0f, 0.0f, 6.0f, 3.0f, 0.0f, 1.0f, 0.0f, 5.0f, 3.0f, 8.0f, 1.0f, 9.0f, 2.0f, 2.0f, 2.0f, 7.0f,
-    7.0f, 6.0f, 7.0f, 0.0f, 5.0f, 4.0f, 2.0f, 6.0f, 8.0f, 2.0f, 0.0f, 8.0f, 4.0f, 9.0f, 1.0f, 2.0f, 9.0f,
-    9.0f, 6.0f, 0.0f, 8.0f, 5.0f, 2.0f, 9.0f, 3.0f, 1.0f, 9.0f, 0.0f, 4.0f, 6.0f, 0.0f, 5.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {3.971f, 4.559f, 3.294f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    3.0f, 1.0f, 6.0f, 8.0f, 6.0f, 4.0f, 1.0f, 3.0f, 5.0f, 4.0f, 5.0f, 4.0f, 0.0f, 2.0f, 4.0f, 1.0f, 3.0f,
-    1.0f, 6.0f, 5.0f, 4.0f, 7.0f, 0.0f, 7.0f, 1.0f, 2.0f, 5.0f, 0.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 8.0f,
-    5.0f, 7.0f, 6.0f, 2.0f, 5.0f, 3.0f, 2.0f, 9.0f, 1.0f, 0.0f, 2.0f, 0.0f, 6.0f, 0.0f, 3.0f, 6.0f, 0.0f,
-    7.0f, 1.0f, 0.0f, 6.0f, 3.0f, 0.0f, 1.0f, 0.0f, 5.0f, 3.0f, 8.0f, 1.0f, 9.0f, 2.0f, 2.0f, 2.0f, 7.0f,
-    7.0f, 6.0f, 7.0f, 0.0f, 5.0f, 4.0f, 2.0f, 6.0f, 8.0f, 2.0f, 0.0f, 8.0f, 4.0f, 9.0f, 1.0f, 2.0f, 9.0f,
-    9.0f, 6.0f, 0.0f, 8.0f, 5.0f, 2.0f, 9.0f, 3.0f, 1.0f, 9.0f, 0.0f, 4.0f, 6.0f, 0.0f, 5.0f, 2.0f, 3.0f};
-  std::vector<float16_t> output_data = {3.971f, 4.559f, 3.294f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {1.5f, 5.5f, 9.5f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMeanWCLocalFp32) {
-  int n = 1;
-  int h = 5;
-  int w = 17;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    6.0f, 3.0f, 6.0f, 1.0f, 4.0f, 2.0f, 5.0f, 1.0f, 7.0f, 5.0f, 7.0f, 3.0f, 0.0f, 2.0f, 9.0f, 8.0f, 3.0f, 1.0f, 6.0f,
-    8.0f, 6.0f, 6.0f, 3.0f, 0.0f, 6.0f, 3.0f, 8.0f, 0.0f, 6.0f, 1.0f, 0.0f, 9.0f, 4.0f, 4.0f, 9.0f, 4.0f, 9.0f, 5.0f,
-    0.0f, 1.0f, 4.0f, 6.0f, 4.0f, 0.0f, 9.0f, 3.0f, 6.0f, 6.0f, 7.0f, 1.0f, 7.0f, 8.0f, 6.0f, 0.0f, 2.0f, 6.0f, 4.0f,
-    4.0f, 3.0f, 7.0f, 7.0f, 5.0f, 2.0f, 3.0f, 4.0f, 3.0f, 1.0f, 5.0f, 4.0f, 8.0f, 7.0f, 5.0f, 0.0f, 7.0f, 5.0f, 5.0f,
-    0.0f, 3.0f, 4.0f, 0.0f, 6.0f, 5.0f, 4.0f, 6.0f, 2.0f, 0.0f, 8.0f, 6.0f, 4.0f, 6.0f, 3.0f, 2.0f, 6.0f, 4.0f, 8.0f,
-    4.0f, 8.0f, 2.0f, 0.0f, 0.0f, 9.0f, 4.0f, 3.0f, 4.0f, 1.0f, 7.0f, 9.0f, 1.0f, 9.0f, 4.0f, 2.0f, 8.0f, 3.0f, 5.0f,
-    8.0f, 7.0f, 8.0f, 8.0f, 4.0f, 8.0f, 2.0f, 8.0f, 9.0f, 4.0f, 5.0f, 0.0f, 2.0f, 1.0f, 0.0f, 8.0f, 4.0f, 7.0f, 2.0f,
-    4.0f, 5.0f, 0.0f, 0.0f, 7.0f, 2.0f, 0.0f, 2.0f, 7.0f, 1.0f, 1.0f, 0.0f, 1.0f, 2.0f, 1.0f, 3.0f, 7.0f, 7.0f, 3.0f,
-    2.0f, 3.0f, 1.0f, 7.0f, 2.0f, 2.0f, 2.0f, 9.0f, 3.0f, 6.0f, 1.0f, 8.0f, 0.0f, 1.0f, 2.0f, 0.0f, 9.0f, 5.0f};
-  std::vector<float> output_data = {4.206f, 4.441f, 4.265f, 4.706f, 3.147f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {18.0f, 22.0f, 26.0f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float16_t> output_data = {18.0f, 22.0f, 26.0f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    8.0f, 1.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 4.0f, 4.0f, 2.0f, 3.0f, 9.0f, 3.0f, 4.0f, 8.0f, 1.0f, 9.0f,
-    5.0f, 2.0f, 5.0f, 6.0f, 3.0f, 8.0f, 3.0f, 7.0f, 1.0f, 3.0f, 1.0f, 9.0f, 4.0f, 0.0f, 9.0f, 7.0f, 7.0f,
-    5.0f, 0.0f, 2.0f, 4.0f, 8.0f, 7.0f, 3.0f, 0.0f, 4.0f, 8.0f, 5.0f, 3.0f, 8.0f, 2.0f, 5.0f, 3.0f, 5.0f,
-    9.0f, 4.0f, 3.0f, 9.0f, 7.0f, 2.0f, 4.0f, 7.0f, 0.0f, 3.0f, 9.0f, 6.0f, 6.0f, 9.0f, 2.0f, 1.0f, 0.0f,
-    7.0f, 1.0f, 7.0f, 2.0f, 0.0f, 6.0f, 9.0f, 4.0f, 7.0f, 0.0f, 7.0f, 0.0f, 4.0f, 8.0f, 6.0f, 0.0f, 3.0f,
-    2.0f, 1.0f, 2.0f, 9.0f, 6.0f, 2.0f, 6.0f, 2.0f, 9.0f, 4.0f, 0.0f, 1.0f, 9.0f, 7.0f, 6.0f, 9.0f, 8.0f};
-  std::vector<float> output_data = {143.000f, 191.000f, 145.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    8.0f, 1.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 4.0f, 4.0f, 2.0f, 3.0f, 9.0f, 3.0f, 4.0f, 8.0f, 1.0f, 9.0f,
-    5.0f, 2.0f, 5.0f, 6.0f, 3.0f, 8.0f, 3.0f, 7.0f, 1.0f, 3.0f, 1.0f, 9.0f, 4.0f, 0.0f, 9.0f, 7.0f, 7.0f,
-    5.0f, 0.0f, 2.0f, 4.0f, 8.0f, 7.0f, 3.0f, 0.0f, 4.0f, 8.0f, 5.0f, 3.0f, 8.0f, 2.0f, 5.0f, 3.0f, 5.0f,
-    9.0f, 4.0f, 3.0f, 9.0f, 7.0f, 2.0f, 4.0f, 7.0f, 0.0f, 3.0f, 9.0f, 6.0f, 6.0f, 9.0f, 2.0f, 1.0f, 0.0f,
-    7.0f, 1.0f, 7.0f, 2.0f, 0.0f, 6.0f, 9.0f, 4.0f, 7.0f, 0.0f, 7.0f, 0.0f, 4.0f, 8.0f, 6.0f, 0.0f, 3.0f,
-    2.0f, 1.0f, 2.0f, 9.0f, 6.0f, 2.0f, 6.0f, 2.0f, 9.0f, 4.0f, 0.0f, 1.0f, 9.0f, 7.0f, 6.0f, 9.0f, 8.0f};
-  std::vector<float16_t> output_data = {143.000f, 191.000f, 145.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {6.0f, 22.0f, 38.0f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumWCLocalFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 5;
-  int c = 17;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    4.0f, 7.0f, 2.0f, 9.0f, 1.0f, 4.0f, 0.0f, 1.0f, 0.0f, 7.0f, 8.0f, 0.0f, 2.0f, 8.0f, 2.0f, 0.0f, 4.0f, 8.0f, 3.0f,
-    9.0f, 5.0f, 9.0f, 7.0f, 0.0f, 3.0f, 3.0f, 1.0f, 1.0f, 8.0f, 6.0f, 4.0f, 7.0f, 6.0f, 5.0f, 7.0f, 8.0f, 2.0f, 0.0f,
-    0.0f, 4.0f, 1.0f, 1.0f, 4.0f, 6.0f, 0.0f, 5.0f, 1.0f, 0.0f, 3.0f, 9.0f, 3.0f, 7.0f, 8.0f, 1.0f, 6.0f, 9.0f, 2.0f,
-    5.0f, 7.0f, 2.0f, 9.0f, 8.0f, 0.0f, 2.0f, 0.0f, 4.0f, 3.0f, 4.0f, 3.0f, 5.0f, 3.0f, 5.0f, 2.0f, 2.0f, 1.0f, 9.0f,
-    8.0f, 7.0f, 0.0f, 8.0f, 0.0f, 4.0f, 0.0f, 8.0f, 4.0f, 8.0f, 2.0f, 6.0f, 3.0f, 7.0f, 6.0f, 8.0f, 3.0f, 6.0f, 4.0f,
-    8.0f, 3.0f, 8.0f, 1.0f, 0.0f, 9.0f, 6.0f, 4.0f, 9.0f, 0.0f, 6.0f, 8.0f, 6.0f, 7.0f, 8.0f, 2.0f, 3.0f, 3.0f, 7.0f,
-    2.0f, 9.0f, 1.0f, 9.0f, 3.0f, 5.0f, 4.0f, 6.0f, 2.0f, 7.0f, 1.0f, 1.0f, 0.0f, 0.0f, 4.0f, 9.0f, 1.0f, 7.0f, 3.0f,
-    2.0f, 1.0f, 4.0f, 6.0f, 7.0f, 9.0f, 2.0f, 2.0f, 8.0f, 3.0f, 2.0f, 4.0f, 1.0f, 7.0f, 6.0f, 8.0f, 6.0f, 9.0f, 8.0f,
-    6.0f, 8.0f, 3.0f, 4.0f, 8.0f, 5.0f, 6.0f, 9.0f, 9.0f, 2.0f, 0.0f, 5.0f, 0.0f, 0.0f, 2.0f, 4.0f, 2.0f, 2.0f, 6.0f,
-    9.0f, 3.0f, 6.0f, 0.0f, 5.0f, 4.0f, 3.0f, 8.0f, 6.0f, 3.0f, 2.0f, 8.0f, 9.0f, 2.0f, 7.0f, 1.0f, 2.0f, 4.0f, 9.0f,
-    3.0f, 7.0f, 9.0f, 2.0f, 4.0f, 2.0f, 7.0f, 8.0f, 8.0f, 6.0f, 3.0f, 4.0f, 6.0f, 3.0f, 1.0f, 7.0f, 9.0f, 3.0f, 5.0f,
-    9.0f, 7.0f, 1.0f, 8.0f, 6.0f, 1.0f, 9.0f, 2.0f, 8.0f, 2.0f, 9.0f, 8.0f, 3.0f, 2.0f, 7.0f, 8.0f, 9.0f, 3.0f, 6.0f,
-    0.0f, 8.0f, 5.0f, 7.0f, 1.0f, 5.0f, 2.0f, 9.0f, 3.0f, 0.0f, 5.0f, 9.0f, 3.0f, 2.0f, 0.0f, 2.0f, 7.0f, 5.0f, 7.0f,
-    4.0f, 7.0f, 0.0f, 9.0f, 8.0f, 8.0f, 8.0f, 8.0f};
-  std::vector<float> output_data = {344.000f, 395.000f, 434.000f};
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {3.0f, -5.0f, 4.0f, 3.0f, -1.0f, 1.0f, -5.0f, -2.0f, -3.0f, 5.0f, -1.0f, 5.0f};
-  std::vector<float> output_data = {-5.000f, -5.000f, -3.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {3.0f, -5.0f, 4.0f, 3.0f, -1.0f, 1.0f, -5.0f, -2.0f, -3.0f, 5.0f, -1.0f, 5.0f};
-  std::vector<float16_t> output_data = {-5.000f, -5.000f, -3.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMin);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    2.0f,  -8.0f,  -4.0f,  -7.0f, 7.0f,   3.0f,  7.0f,  -3.0f, 2.0f,   -9.0f, -6.0f, 3.0f,   -8.0f,  1.0f,  -10.0f,
-    1.0f,  -10.0f, 2.0f,   -5.0f, 6.0f,   -5.0f, 7.0f,  3.0f,  4.0f,   3.0f,  -3.0f, 5.0f,   -1.0f,  -1.0f, -6.0f,
-    -4.0f, 9.0f,   5.0f,   -1.0f, 3.0f,   3.0f,  9.0f,  5.0f,  -10.0f, -1.0f, -8.0f, 9.0f,   -4.0f,  8.0f,  3.0f,
-    -1.0f, -2.0f,  8.0f,   -1.0f, -7.0f,  2.0f,  4.0f,  2.0f,  4.0f,   6.0f,  -1.0f, 7.0f,   4.0f,   -3.0f, 0.0f,
-    -2.0f, -1.0f,  -10.0f, -2.0f, 6.0f,   3.0f,  -4.0f, -9.0f, -5.0f,  -8.0f, 0.0f,  -7.0f,  9.0f,   2.0f,  7.0f,
-    -5.0f, 8.0f,   4.0f,   5.0f,  9.0f,   -3.0f, 2.0f,  0.0f,  -4.0f,  -1.0f, -7.0f, -10.0f, -10.0f, -3.0f, 9.0f,
-    -8.0f, 1.0f,   1.0f,   -5.0f, -10.0f, -1.0f, 8.0f,  -2.0f, 1.0f,   -4.0f, 1.0f,  0.0f};
-  std::vector<float> output_data = {-10.000f, -10.000f, -10.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    2.0f,  -8.0f,  -4.0f,  -7.0f, 7.0f,   3.0f,  7.0f,  -3.0f, 2.0f,   -9.0f, -6.0f, 3.0f,   -8.0f,  1.0f,  -10.0f,
-    1.0f,  -10.0f, 2.0f,   -5.0f, 6.0f,   -5.0f, 7.0f,  3.0f,  4.0f,   3.0f,  -3.0f, 5.0f,   -1.0f,  -1.0f, -6.0f,
-    -4.0f, 9.0f,   5.0f,   -1.0f, 3.0f,   3.0f,  9.0f,  5.0f,  -10.0f, -1.0f, -8.0f, 9.0f,   -4.0f,  8.0f,  3.0f,
-    -1.0f, -2.0f,  8.0f,   -1.0f, -7.0f,  2.0f,  4.0f,  2.0f,  4.0f,   6.0f,  -1.0f, 7.0f,   4.0f,   -3.0f, 0.0f,
-    -2.0f, -1.0f,  -10.0f, -2.0f, 6.0f,   3.0f,  -4.0f, -9.0f, -5.0f,  -8.0f, 0.0f,  -7.0f,  9.0f,   2.0f,  7.0f,
-    -5.0f, 8.0f,   4.0f,   5.0f,  9.0f,   -3.0f, 2.0f,  0.0f,  -4.0f,  -1.0f, -7.0f, -10.0f, -10.0f, -3.0f, 9.0f,
-    -8.0f, 1.0f,   1.0f,   -5.0f, -10.0f, -1.0f, 8.0f,  -2.0f, 1.0f,   -4.0f, 1.0f,  0.0f};
-  std::vector<float16_t> output_data = {-10.000f, -10.000f, -10.000f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMin);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {-0.080f, 0.481f, -0.853f, -0.838f, 0.557f, 0.255f,
-                                   0.116f,  0.446f, -0.051f, -0.095f, 0.552f, 0.077f};
-  std::vector<float> output_data = {-0.853f, 0.116f, -0.095f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMinWCLocalFp32) {
-  int n = 1;
-  int h = 5;
-  int w = 17;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    0.399f,  -0.139f, 0.238f,  0.779f,  -0.894f, 0.343f,  -0.955f, 0.593f,  0.448f,  0.816f,  0.841f,  -0.614f, 0.636f,
-    0.116f,  -0.031f, -0.109f, 0.770f,  0.962f,  0.307f,  -0.170f, 0.789f,  0.197f,  0.530f,  -0.883f, 0.753f,  0.385f,
-    -0.158f, 0.237f,  0.971f,  -0.781f, -0.523f, -0.547f, 0.257f,  -0.034f, 0.660f,  -0.666f, -0.379f, 0.092f,  -0.130f,
-    0.369f,  0.664f,  -0.747f, -0.687f, -0.628f, -0.434f, 0.736f,  0.673f,  0.125f,  -0.854f, 0.007f,  0.038f,  0.024f,
-    0.706f,  -0.806f, 0.042f,  0.532f,  -0.545f, -0.942f, 0.778f,  -0.419f, 0.931f,  -0.848f, 0.501f,  -0.415f, -0.292f,
-    -0.575f, 0.192f,  -0.825f, 0.256f,  -0.227f, -0.795f, 0.319f,  0.101f,  -0.337f, 0.940f,  -0.724f, 0.453f,  -0.646f,
-    -0.225f, -0.303f, 0.093f,  0.851f,  -0.467f, -0.657f, 0.980f,  0.867f,  0.606f,  0.356f,  0.982f,  -0.199f, 0.816f,
-    0.984f,  -0.466f, -0.857f, -0.070f, -0.562f, 0.744f,  0.477f,  0.831f,  -0.064f, 0.891f,  -0.813f, -0.341f, 0.969f,
-    0.538f,  0.233f,  -0.545f, 0.994f,  0.241f,  -0.829f, -0.272f, -0.420f, 0.607f,  0.658f,  -0.188f, 0.134f,  0.277f,
-    -0.173f, 0.373f,  0.286f,  -0.805f, 0.455f,  0.461f,  0.893f,  -0.457f, 0.360f,  -0.706f, -0.848f, 0.032f,  -0.566f,
-    0.014f,  0.507f,  -0.694f, -0.663f, -0.783f, 0.459f,  -0.613f, -0.496f, 0.332f,  0.829f,  -0.437f, 0.759f,  -0.061f,
-    -0.400f, -0.561f, 0.471f,  -0.042f, 0.073f,  0.546f,  -0.557f, 0.602f,  0.011f,  -0.214f, 0.733f,  0.289f,  -0.847f,
-    -0.637f, -0.791f, 0.519f,  0.449f,  -0.390f, -0.296f, 0.622f,  0.345f,  0.525f,  -0.205f, -0.626f, 0.089f,  -0.811f,
-    0.741f};
-  std::vector<float> output_data = {-0.955f, -0.942f, -0.857f, -0.848f, -0.847f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMaxFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.123f,  0.975f, 0.092f, 0.364f,  0.033f,  -0.140f,
-                                   -0.566f, 0.693f, 0.540f, -0.588f, -0.992f, -0.386f};
-  std::vector<float> output_data = {0.364f, 0.975f, 0.540f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMaxFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {0.123f,  0.975f, 0.092f, 0.364f,  0.033f,  -0.140f,
-                                       -0.566f, 0.693f, 0.540f, -0.588f, -0.992f, -0.386f};
-  std::vector<float16_t> output_data = {0.364f, 0.975f, 0.540f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMax);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMaxLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    0.113f,  -0.633f, 0.603f,  0.447f,  -0.588f, 0.039f,  0.494f,  -0.379f, -0.018f, -0.317f, 0.620f,  0.460f,  0.732f,
-    0.980f,  0.376f,  0.481f,  -0.371f, -0.219f, -0.496f, 0.670f,  -0.159f, 0.961f,  0.036f,  0.633f,  -0.118f, -0.300f,
-    0.971f,  -0.236f, -0.095f, -0.705f, -0.495f, -0.403f, -0.131f, -0.084f, -0.339f, 0.031f,  -0.582f, 0.893f,  -0.311f,
-    0.501f,  -0.623f, -0.523f, -0.177f, -0.438f, 0.626f,  0.028f,  -0.106f, 0.916f,  -0.504f, 0.678f,  0.358f,  -0.951f,
-    0.741f,  -0.577f, -0.544f, -0.952f, -0.133f, 0.441f,  -0.376f, -0.246f, 0.301f,  0.025f,  -0.904f, -0.337f, 0.132f,
-    -0.800f, 0.226f,  -0.135f, -0.617f, -0.871f, -0.393f, -0.195f, 0.591f,  0.034f,  -0.040f, 0.377f,  -0.106f, 0.265f,
-    -0.883f, -0.678f, -0.795f, -0.094f, -0.272f, -0.954f, 0.569f,  -0.910f, -0.288f, -0.978f, 0.262f,  -0.973f, -0.750f,
-    0.460f,  0.956f,  0.696f,  -0.938f, 0.537f,  0.516f,  -0.339f, -0.289f, 0.498f,  0.135f,  -0.649f};
-  std::vector<float> output_data = {0.961f, 0.980f, 0.971f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMaxLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    0.314f,  -0.714f, -0.736f, -0.459f, -0.819f, -0.530f, -0.275f, -0.141f, -0.797f, 0.522f,  -0.651f, 0.576f,  -0.644f,
-    0.725f,  0.208f,  -0.529f, -0.776f, 0.986f,  -0.862f, -0.327f, 0.922f,  0.554f,  -0.401f, 0.972f,  -0.485f, 0.423f,
-    -0.611f, -0.768f, 0.444f,  -0.678f, -0.734f, 0.572f,  0.413f,  0.612f,  -0.783f, -0.138f, -0.624f, -0.284f, 0.873f,
-    -0.298f, 0.630f,  -0.463f, 0.195f,  0.196f,  0.167f,  0.227f,  -0.015f, 0.436f,  -0.898f, 0.031f,  -0.149f, -0.218f,
-    0.184f,  -0.426f, 0.794f,  0.846f,  0.624f,  -0.889f, -0.336f, 0.401f,  -0.820f, -0.583f, 0.337f,  0.175f,  0.228f,
-    -0.626f, -0.505f, -0.088f, 0.833f,  -0.366f, 0.392f,  0.727f,  -0.598f, -0.851f, 0.007f,  -0.707f, 0.575f,  0.243f,
-    -0.372f, -0.141f, 0.679f,  -0.646f, 0.422f,  0.322f,  -0.294f, 0.831f,  0.929f,  -0.414f, -0.208f, -0.111f, 0.146f,
-    -0.489f, -0.808f, -0.635f, 0.811f,  0.544f,  -0.131f, 0.707f,  0.787f,  0.603f,  -0.149f, -0.095f};
-  std::vector<float16_t> output_data = {0.794f, 0.846f, 0.986f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMax);
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestReduceOpenCL, ReduceMaxWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {0.435f,  -0.949f, 0.580f,  0.858f, -0.465f, 0.255f,
-                                   -0.561f, -0.444f, -0.603f, 0.266f, 0.031f,  -0.638f};
-  std::vector<float> output_data = {0.858f, 0.255f, 0.266f};
+TEST_F(TestOpenCL_Reduce, Mean) {  // ReduceMean over axes 1 and 2 of a 1x2x2x3 input
+  std::vector<int> axis = {1, 2};
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = {1, 3};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {4.5f, 5.5f, 6.5f};  // means of {0,3,6,9}, {1,4,7,10}, {2,5,8,11}
 
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceMaxWCLocalFp32) {
-  int n = 1;
-  int h = 5;
-  int w = 17;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    0.543f,  0.620f,  0.175f,  -0.275f, -0.570f, 0.516f,  -0.401f, -0.157f, 0.460f,  -0.072f, -0.322f, 0.208f,  0.385f,
-    0.919f,  -0.265f, 0.256f,  0.383f,  -0.399f, 0.183f,  0.363f,  -0.779f, -0.191f, -0.446f, 0.063f,  -0.671f, 0.823f,
-    -0.049f, -0.182f, -0.409f, 0.589f,  -0.804f, -0.461f, -0.407f, -0.119f, 0.833f,  0.718f,  -0.366f, 0.993f,  0.844f,
-    -0.018f, -0.203f, -0.004f, -0.610f, -0.461f, 0.938f,  -0.708f, -0.831f, -0.147f, 0.855f,  0.998f,  0.412f,  -0.393f,
-    -0.706f, -0.127f, 0.845f,  -0.236f, -0.341f, 0.299f,  0.793f,  0.794f,  -0.634f, -0.663f, -0.568f, -0.428f, -0.921f,
-    0.904f,  0.933f,  -0.985f, -0.760f, -0.673f, -0.080f, 0.235f,  0.539f,  -0.341f, -0.899f, 0.527f,  -0.210f, -0.151f,
-    0.148f,  -0.184f, -0.103f, -0.345f, -0.772f, -0.960f, -0.282f, -0.486f, -0.986f, -0.591f, 0.702f,  0.973f,  0.269f,
-    0.058f,  -0.831f, -0.677f, -0.665f, -0.403f, 0.241f,  -0.365f, 0.741f,  0.603f,  0.347f,  0.812f,  -0.515f, -0.085f,
-    0.251f,  0.631f,  0.819f,  0.622f,  -0.615f, -0.122f, 0.064f,  0.445f,  -0.508f, -0.023f, -0.072f, -0.423f, 0.547f,
-    -0.841f, -0.308f, 0.924f,  -0.187f, 0.601f,  0.879f,  -0.868f, 0.395f,  -0.307f, 0.977f,  -0.300f, 0.737f,  0.022f,
-    0.106f,  -0.520f, -0.673f, -0.351f, 0.367f,  0.588f,  -0.223f, 0.062f,  0.870f,  -0.017f, 0.583f,  0.405f,  0.507f,
-    -0.457f, 0.196f,  0.048f,  -0.173f, 0.596f,  -0.017f, -0.245f, -0.433f, -0.852f, 0.058f,  0.237f,  0.280f,  -0.129f,
-    -0.224f, 0.869f,  -0.781f, -0.029f, -0.715f, 0.497f,  -0.341f, 0.230f,  -0.572f, 0.718f,  -0.408f, -0.998f, -0.752f,
-    -0.701f};
-  std::vector<float> output_data = {0.919f, 0.998f, 0.973f, 0.977f, 0.870f};
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {4.0f, 3.0f, 1.0f, 4.0f, 1.0f, 3.0f, 1.0f, 4.0f, 2.0f, 4.0f, 4.0f, 3.0f};
-  std::vector<float> output_data = {64.0f, 48.0f, 18.0f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {2.0f, 1.0f, 3.0f, 1.0f, 4.0f, 1.0f, 4.0f, 3.0f, 2.0f, 3.0f, 1.0f, 1.0f};
-  std::vector<float16_t> output_data = {24.0f, 12.0f, 6.0f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceProd);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    0.304f, 2.304f, 1.391f, 1.072f, 0.351f, 0.641f, 0.120f, 2.382f, 0.460f, 1.672f, 0.553f, 1.534f, 1.423f,
-    0.892f, 2.900f, 1.953f, 1.745f, 1.171f, 1.717f, 1.291f, 1.572f, 2.388f, 0.154f, 0.252f, 0.794f, 0.981f,
-    0.366f, 1.372f, 1.778f, 1.848f, 1.023f, 1.124f, 2.045f, 2.374f, 1.965f, 0.260f, 1.306f, 1.889f, 1.144f,
-    1.816f, 2.189f, 2.215f, 1.913f, 2.577f, 2.910f, 1.712f, 0.342f, 1.349f, 0.215f, 2.717f, 1.813f, 2.764f,
-    1.989f, 1.710f, 0.156f, 2.293f, 2.648f, 1.281f, 1.078f, 2.757f, 0.746f, 0.238f, 0.235f, 0.123f, 0.730f,
-    1.558f, 1.798f, 0.993f, 2.479f, 1.930f, 1.687f, 1.078f, 0.600f, 0.710f, 1.926f, 0.848f, 0.984f, 0.568f,
-    0.983f, 1.068f, 2.362f, 2.770f, 2.184f, 2.883f, 1.177f, 0.232f, 0.782f, 1.340f, 2.029f, 1.524f, 0.159f,
-    2.892f, 1.225f, 0.638f, 2.537f, 0.813f, 0.337f, 1.871f, 0.602f, 2.387f, 1.209f, 2.886f};
-  std::vector<float> output_data = {0.103f, 229.081f, 1030.031f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    2.843f, 2.398f, 0.998f, 1.164f, 1.048f, 0.880f, 2.112f, 1.354f, 2.892f, 0.755f, 2.033f, 1.140f, 1.117f,
-    2.550f, 2.340f, 2.905f, 0.114f, 0.773f, 2.589f, 2.404f, 1.037f, 0.561f, 2.671f, 0.419f, 1.723f, 2.041f,
-    2.888f, 2.440f, 1.668f, 0.821f, 0.918f, 1.251f, 1.141f, 2.497f, 0.408f, 2.384f, 0.457f, 2.754f, 0.624f,
-    0.198f, 0.599f, 2.566f, 1.279f, 2.973f, 0.363f, 2.222f, 1.144f, 2.715f, 1.135f, 0.900f, 1.906f, 0.982f,
-    2.211f, 2.113f, 0.585f, 1.766f, 1.612f, 1.796f, 0.607f, 1.121f, 1.277f, 2.600f, 1.446f, 1.467f, 1.828f,
-    2.227f, 0.950f, 2.702f, 1.297f, 0.552f, 2.476f, 1.404f, 2.487f, 0.615f, 0.205f, 0.577f, 0.809f, 1.432f,
-    1.668f, 2.243f, 2.711f, 2.221f, 0.183f, 2.964f, 1.174f, 0.928f, 2.703f, 0.427f, 0.410f, 1.436f, 1.427f,
-    1.144f, 2.970f, 2.014f, 2.380f, 1.286f, 2.570f, 2.765f, 1.757f, 0.513f, 2.449f, 0.770f};
-  std::vector<float16_t> output_data = {715.940f, 12232.266f, 46763.609f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceProd);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {1.691f, 2.804f, 0.184f, 1.760f, 0.255f, 1.461f,
-                                   2.751f, 2.487f, 1.304f, 0.686f, 0.702f, 0.393f};
-  std::vector<float> output_data = {1.536f, 2.549f, 0.247f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceProdWCLocalFp32) {
-  int n = 1;
-  int h = 5;
-  int w = 17;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    1.360f, 0.615f, 0.894f, 1.357f, 0.701f, 1.430f, 1.488f, 0.701f, 0.688f, 0.869f, 1.321f, 0.836f, 1.160f, 1.460f,
-    1.215f, 1.157f, 0.855f, 0.992f, 0.724f, 0.741f, 0.921f, 1.496f, 1.285f, 1.040f, 0.695f, 1.264f, 0.998f, 0.925f,
-    1.170f, 1.384f, 1.413f, 0.617f, 0.743f, 1.299f, 0.998f, 1.131f, 1.491f, 1.371f, 0.808f, 1.001f, 0.602f, 0.812f,
-    1.299f, 1.500f, 0.867f, 0.970f, 1.174f, 0.887f, 1.409f, 1.144f, 0.969f, 1.303f, 1.154f, 0.796f, 0.952f, 1.347f,
-    0.794f, 0.601f, 1.191f, 1.310f, 0.619f, 0.961f, 0.951f, 1.395f, 0.861f, 1.177f, 1.274f, 0.701f, 0.758f, 0.635f,
-    1.256f, 1.450f, 0.900f, 1.313f, 1.401f, 0.904f, 0.835f, 0.767f, 1.258f, 1.467f, 1.278f, 0.652f, 0.731f, 0.648f,
-    1.308f, 1.199f, 1.485f, 1.352f, 0.639f, 1.291f, 0.924f, 0.762f, 0.791f, 1.392f, 1.328f, 1.190f, 1.458f, 1.193f,
-    1.109f, 1.098f, 1.117f, 1.197f, 1.097f, 0.879f, 1.175f, 0.723f, 1.260f, 1.454f, 0.703f, 0.729f, 1.467f, 0.918f,
-    0.631f, 0.750f, 1.292f, 1.208f, 0.972f, 0.621f, 0.673f, 0.710f, 1.482f, 1.092f, 1.162f, 1.432f, 0.774f, 1.132f,
-    1.258f, 0.761f, 0.799f, 1.071f, 1.099f, 1.484f, 0.674f, 0.916f, 0.684f, 0.842f, 1.412f, 0.956f, 1.199f, 0.969f,
-    0.957f, 1.124f, 0.937f, 0.815f, 1.308f, 1.448f, 1.059f, 1.373f, 0.804f, 1.172f, 1.387f, 0.826f, 0.783f, 0.707f,
-    1.159f, 0.927f, 0.602f, 0.932f, 1.024f, 1.266f, 0.885f, 0.920f, 1.120f, 0.973f, 0.964f, 1.365f, 0.926f, 0.709f,
-    1.177f, 0.615f};
-  std::vector<float> output_data = {1.544f, 1.984f, 5.516f, 0.247f, 0.919f};
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd, true);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumSquareFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {-0.081f, 0.305f,  -0.291f, 0.777f, 0.338f, 0.482f,
-                                   0.959f,  -0.695f, -0.055f, 0.001f, 0.723f, -0.112f};
-  std::vector<float> output_data = {1.530f, 1.213f, 0.333f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare);
-}
-
-TEST_F(TestReduceOpenCL, ReduceSumSquareFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {-0.730f, -0.938f, 0.236f, -0.631f, -0.058f, -0.625f,
-                                       0.097f,  -0.343f, 0.120f, -0.339f, 0.003f,  -0.288f};
-  std::vector<float16_t> output_data = {1.055f, 1.001f, 0.544f};
-
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSumSquare);
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, false);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestReduceOpenCL, ReduceSumSquareLocalFp32) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    0.025f,  -0.130f, 0.292f,  0.128f,  0.360f,  -0.181f, -0.179f, 0.469f,  0.434f,  -0.417f, -0.414f, 0.998f,  0.654f,
-    -0.102f, 0.039f,  -0.822f, -0.155f, 0.113f,  0.204f,  0.615f,  0.844f,  -0.364f, 0.486f,  0.799f,  0.452f,  -0.884f,
-    -0.006f, 0.888f,  -0.567f, 0.620f,  -0.365f, -0.096f, -0.300f, -0.263f, 0.945f,  -0.900f, -0.798f, -0.536f, -0.506f,
-    0.148f,  -0.496f, 0.344f,  0.096f,  0.881f,  -0.848f, 0.401f,  -0.724f, 0.806f,  -0.550f, 0.377f,  0.560f,  -0.144f,
-    0.439f,  0.038f,  -0.985f, 0.246f,  0.233f,  -0.864f, 0.427f,  -0.723f, 0.592f,  -0.642f, 0.376f,  0.769f,  0.020f,
-    0.965f,  0.532f,  -0.448f, -0.168f, 0.502f,  0.900f,  0.468f,  0.834f,  -0.768f, -0.337f, 0.874f,  0.941f,  -0.449f,
-    -0.330f, 0.605f,  0.081f,  0.804f,  -0.823f, -0.270f, 0.117f,  0.040f,  0.316f,  0.951f,  -0.920f, 0.599f,  0.855f,
-    0.075f,  -0.898f, -0.298f, 0.208f,  0.899f,  0.751f,  -0.421f, 0.478f,  -0.106f, -0.031f, 0.974f};
-  std::vector<float> output_data = {11.569f, 10.620f, 11.552f};
+TEST_F(TestOpenCL_Reduce, Sum) {  // ReduceSum over axes 1 and 2 of a 1x2x2x3 input
+  std::vector<int> axis = {1, 2};
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = {1, 3};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {18, 22, 26};  // 0+3+6+9, 1+4+7+10, 2+5+8+11
 
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare);
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, false);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestReduceOpenCL, ReduceSumSquareLocalFp16) {
-  int n = 1;
-  int h = 17;
-  int w = 2;
-  int c = 3;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {
-    0.931f,  0.611f,  0.921f,  -0.873f, 0.084f,  -0.677f, -0.366f, -0.627f, -0.359f, 0.217f,  -0.825f, -0.453f, 0.486f,
-    0.675f,  -0.968f, 0.070f,  0.300f,  -0.508f, -0.423f, -0.741f, -0.390f, 0.649f,  -0.313f, -0.921f, -0.130f, -0.212f,
-    -0.591f, 0.135f,  -0.556f, -0.963f, -0.509f, -0.480f, 0.694f,  -0.913f, 0.778f,  0.498f,  -0.520f, 0.271f,  0.087f,
-    0.265f,  0.905f,  0.669f,  0.257f,  -0.307f, 0.789f,  0.117f,  0.468f,  0.728f,  0.372f,  -0.475f, 0.195f,  0.163f,
-    0.766f,  -0.504f, 0.876f,  -0.203f, 0.636f,  -0.340f, -0.126f, 0.368f,  -0.173f, -0.149f, 0.492f,  -0.220f, 0.521f,
-    -0.844f, -0.684f, -0.718f, 0.255f,  -0.148f, -0.891f, 0.577f,  -0.880f, 0.005f,  -0.904f, 0.282f,  0.473f,  -0.512f,
-    -0.385f, -0.674f, 0.443f,  -0.172f, 0.224f,  0.720f,  -0.050f, 0.003f,  -0.743f, 0.025f,  0.941f,  0.107f,  0.176f,
-    -0.360f, 0.975f,  -0.781f, -0.727f, 0.274f,  0.214f,  -0.330f, 0.237f,  0.967f,  0.156f,  -0.587f};
-  std::vector<float16_t> output_data = {8.472f, 9.920f, 13.418f};
+TEST_F(TestOpenCL_Reduce, MeanWC) {  // ReduceMean over axes 2 and 3 of a 1x3x2x2 input
+  std::vector<int> axis = {2, 3};
+  std::vector<int> input_shape = {1, 3, 2, 2};
+  std::vector<int> output_shape = {1, 3, 1, 1};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {1.5f, 5.5f, 9.5f};  // means of {0..3}, {4..7}, {8..11}
 
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSumSquare);
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestReduceOpenCL, ReduceSumSquareWCFp32) {
-  int n = 1;
-  int h = 3;
-  int w = 2;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {-0.686f, 0.613f,  -0.701f, 0.978f, 0.632f,  0.677f,
-                                   0.780f,  -0.888f, 0.147f,  0.448f, -0.100f, 0.936f};
-  std::vector<float> output_data = {2.294f, 2.255f, 1.108f};
+TEST_F(TestOpenCL_Reduce, SumWC) {  // ReduceSum over axes 2 and 3 of a 1x3x2x2 input
+  std::vector<int> axis = {2, 3};
+  std::vector<int> input_shape = {1, 3, 2, 2};
+  std::vector<int> output_shape = {1, 3, 1, 1};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {6, 22, 38};  // 0+1+2+3, 4+5+6+7, 8+9+10+11
 
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare, true);
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, true);
+    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestReduceOpenCL, ReduceSumSquareWCLocalFp32) {
-  int n = 1;
-  int h = 5;
-  int w = 17;
-  int c = 2;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {
-    -0.309f, -0.836f, 0.749f,  -0.820f, -0.715f, -0.770f, 0.030f,  -0.817f, 0.009f,  0.146f,  0.642f,  0.382f,  -0.085f,
-    -0.268f, -0.424f, -0.957f, -0.127f, -0.852f, 0.596f,  0.340f,  -0.492f, -0.374f, -0.669f, 0.665f,  -0.664f, -0.079f,
-    0.462f,  0.469f,  0.187f,  -0.730f, -0.240f, -0.446f, 0.254f,  0.284f,  0.743f,  0.297f,  0.235f,  -0.068f, 0.652f,
-    -0.474f, -0.749f, -0.499f, 0.106f,  -0.988f, 0.033f,  -0.327f, -0.050f, -0.228f, -0.676f, -0.136f, -0.801f, 0.885f,
-    -0.108f, -0.019f, -0.092f, 0.538f,  0.760f,  0.996f,  -0.610f, 0.125f,  0.296f,  0.861f,  0.811f,  0.948f,  -0.665f,
-    0.920f,  0.669f,  0.572f,  -0.653f, -0.823f, -0.967f, -0.094f, 0.078f,  0.458f,  0.954f,  -0.357f, 0.887f,  -0.194f,
-    -0.453f, -0.774f, -0.805f, -0.064f, -0.671f, -0.151f, -0.910f, 0.695f,  0.762f,  0.755f,  -0.933f, 0.277f,  -0.697f,
-    0.074f,  -0.333f, 0.790f,  -0.370f, 0.264f,  -0.649f, 0.570f,  0.933f,  0.714f,  0.296f,  -0.430f, 0.634f,  0.619f,
-    -0.744f, -0.898f, -0.908f, -0.800f, 0.500f,  -0.688f, 0.816f,  0.901f,  0.054f,  0.993f,  0.346f,  -0.285f, -0.926f,
-    0.746f,  -0.718f, 0.708f,  -0.193f, 0.838f,  -0.869f, -0.189f, -0.195f, -0.324f, -0.498f, -0.216f, 0.632f,  -0.701f,
-    0.272f,  0.550f,  0.486f,  -0.415f, 0.285f,  0.617f,  0.740f,  0.170f,  0.486f,  0.251f,  -0.165f, -0.424f, 0.705f,
-    -0.802f, -0.977f, -0.449f, 0.502f,  -0.406f, 0.125f,  -0.643f, -0.324f, -0.409f, 0.218f,  0.719f,  -0.043f, -0.933f,
-    -0.580f, 0.830f,  -0.091f, 0.998f,  -0.458f, 0.142f,  -0.220f, -0.440f, 0.824f,  -0.349f, 0.983f,  -0.546f, 0.085f,
-    0.235f};
-  std::vector<float> output_data = {9.889f, 11.926f, 13.296f, 13.537f, 10.563f};
-  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare, true);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
index dfe91c1ef3..e95af71b0d 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
@@ -13,169 +13,91 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/reshape_parameter.h"
 
-namespace mindspore {
-class TestReshapeOpenCL : public mindspore::CommonTest {
- public:
-  TestReshapeOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseReshape(const std::vector<int> &shape_in, const std::vector<int> &shape_out, void *input_data,
-                        void *output_data, bool enable_fp16) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     shape_in, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  bool is_output_2d = shape_out.size() == 2;
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), shape_out,
-                                   is_output_2d ? schema::Format_NC : schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReshapeOpenCLKernel>(inputs, outputs, nullptr, nullptr,
-                                                                               kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
+class TestOpenCL_Reshape : public CommonTest {};  // gtest fixture used by the Reshape TEST_F cases in this file
 
-  MS_LOG(INFO) << "Test Reshape passed";
+namespace {
+// Builds the OpParameter for PrimitiveType_Reshape; reference: src/ops/populate/reshape_populate.cc
+OpParameter *CreateParameter() {
+  auto *param = test::CreateParameter<ReshapeParameter>(schema::PrimitiveType_Reshape);
+  return reinterpret_cast<OpParameter *>(param);  // returned through the generic OpParameter pointer type
 }
+}  // namespace
 
-TEST_F(TestReshapeOpenCL, ReshapeFp32) {
+TEST_F(TestOpenCL_Reshape, 4D_2D_test0) {  // reshape {1, 1, 1, 7} -> {1, 7}
   std::vector<int> shape_in = {1, 1, 1, 7};
   std::vector<int> shape_out = {1, 7};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
-}
-
-TEST_F(TestReshapeOpenCL, ReshapeFp16) {
-  std::vector<int> shape_in = {1, 1, 1, 7};
-  std::vector<int> shape_out = {1, 7};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), true);
-}
-
-TEST_F(TestReshapeOpenCL, Reshape4DFp32) {
-  std::vector<int> shape_in = {1, 2, 2, 3};
-  std::vector<int> shape_out = {1, 1, 4, 3};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6};  // expected output is identical to the input
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
 
-TEST_F(TestReshapeOpenCL, Reshape4DFp16) {
+TEST_F(TestOpenCL_Reshape, 4D_4D_test0) {  // reshape {1, 2, 2, 3} -> {1, 1, 4, 3}
   std::vector<int> shape_in = {1, 2, 2, 3};
   std::vector<int> shape_out = {1, 1, 4, 3};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), true);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // expected output is identical to the input
+  for (auto fp16_enable : {false, true}) {  // run with fp16 disabled first, then enabled
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
 
-TEST_F(TestReshapeOpenCL, Reshape4D2DFp32) {
+TEST_F(TestOpenCL_Reshape, 4D_2D_test1) {
   std::vector<int> shape_in = {1, 2, 2, 4};
   std::vector<int> shape_out = {4, 4};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
-                                    8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
-TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test0) {
+
+TEST_F(TestOpenCL_Reshape, 4D_4D_test1) {
   std::vector<int> shape_in = {1, 4, 2, 3};
   std::vector<int> shape_out = {1, 3, 2, 4};
-  std::vector<float> input_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f,
-                                   12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f};
-  std::vector<float> output_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f,
-                                    12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
-TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test1) {
+
+TEST_F(TestOpenCL_Reshape, 4D_4D_test2) {
   std::vector<int> shape_in = {1, 2, 2, 5};
   std::vector<int> shape_out = {1, 1, 5, 4};
-  std::vector<float> input_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,
-                                   10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f};
-  std::vector<float> output_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,
-                                    10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
-TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test2) {
+
+TEST_F(TestOpenCL_Reshape, 4D_4D_test3) {
   std::vector<int> shape_in = {1, 4, 2, 5};
   std::vector<int> shape_out = {1, 2, 5, 4};
-  std::vector<float> input_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f,
-    14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f,
-    28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f,
+  float input_data[] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
   };
-  std::vector<float> output_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f,
-    14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f,
-    28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f,
+  float output_data[] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
   };
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
-TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem10) {
+
+TEST_F(TestOpenCL_Reshape, 4D_4D_test4) {
   std::vector<int> shape_in = {1, 5, 5, 8};
   std::vector<int> shape_out = {8, 1, 5, 5};
-  std::vector<float> input_data = {
+  float input_data[] = {
     0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,
     23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
     46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
@@ -185,7 +107,7 @@ TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem10) {
     138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
     161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
     184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199};
-  std::vector<float> output_data = {
+  float output_data[] = {
     0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,
     23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
     46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
@@ -196,19 +118,21 @@ TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem10) {
     161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
     184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199};
 
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
-TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem11) {
+
+TEST_F(TestOpenCL_Reshape, 4D_4D_test5) {
   std::vector<int> shape_in = {1, 3, 2, 5};
   std::vector<int> shape_out = {1, 5, 2, 3};
-  std::vector<float> input_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,
-                                   10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f,
-                                   20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f};
-  std::vector<float> output_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,
-                                    10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f,
-                                    20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f};
-
-  RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false);
+  float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+                        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
+  float output_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+                         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29};
+  for (auto fp16_enable : {false, true}) {
+    TestMain({{shape_in, input_data, VAR}}, {shape_out, output_data}, CreateParameter(), fp16_enable);
+  }
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
index 137cdd5c61..26f550c886 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
@@ -13,169 +13,72 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "common/common_test.h"
-#include "src/common/file_utils.h"
-#include "src/common/log_adapter.h"
-#include "src/runtime/kernel/opencl/kernel/resize.h"
-#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "src/runtime/opencl/opencl_runtime.h"
-#include "test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/resize_parameter.h"
 
-namespace mindspore {
-class TestResizeOpenCL : public mindspore::CommonTest {
- public:
-  TestResizeOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseResize(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
-                       int resize_mode, bool align_corners) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<ResizeParameter *>(malloc(sizeof(ResizeParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  int n = shape[0];
-  int h = shape[1];
-  int w = shape[2];
-  int oh = shape[3];
-  int ow = shape[4];
-  int c = shape[5];
-  param->new_height_ = oh;
-  param->new_width_ = ow;
-  param->align_corners_ = align_corners;
-  param->method_ = resize_mode;
-  std::vector<int> input_shape = {n, h, w, c};
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> out_shape = {n, oh, ow, c};
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ResizeOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
+class TestOpenCL_Resize : public CommonTest {};
 
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-
-  MS_LOG(INFO) << "Test Resize passed";
+namespace {
+// PrimitiveType_Resize: src/ops/populate/resize_populate.cc
+OpParameter *CreateParameter(schema::ResizeMethod method, int new_height, int new_width, bool align_corners) {
+  auto *param = test::CreateParameter<ResizeParameter>(schema::PrimitiveType_Resize);
+  param->new_height_ = new_height;
+  param->new_width_ = new_width;
+  param->align_corners_ = align_corners;
+  param->method_ = method;
+  param->preserve_aspect_ratio_ = false;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestResizeOpenCL, ResizeBilinearFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
+TEST_F(TestOpenCL_Resize, Bilinear) {
+  schema::ResizeMethod method = schema::ResizeMethod_LINEAR;
   int oh = 4;
   int ow = 4;
-  int c = 1;
   bool align_corners = false;
-  std::vector<int> shape = {n, h, w, oh, ow, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {0.0f, 0.5f, 1.0f, 1.0f, 1.0f, 1.5f, 2.0f, 2.0f,
-                                    2.0f, 2.5f, 3.0f, 3.0f, 2.0f, 2.5f, 3.0f, 3.0f};
-  RunTestCaseResize(shape, input_data.data(), output_data.data(), false, schema::ResizeMethod_LINEAR, align_corners);
-}
 
-TEST_F(TestResizeOpenCL, ResizeBilinearFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int oh = 4;
-  int ow = 4;
-  int c = 1;
-  bool align_corners = false;
-  std::vector<int> shape = {n, h, w, oh, ow, c};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float16_t> output_data = {0.0f, 0.5f, 1.0f, 1.0f, 1.0f, 1.5f, 2.0f, 2.0f,
-                                        2.0f, 2.5f, 3.0f, 3.0f, 2.0f, 2.5f, 3.0f, 3.0f};
-  RunTestCaseResize(shape, input_data.data(), output_data.data(), true, schema::ResizeMethod_LINEAR, align_corners);
+  std::vector<int> input_shape = {1, 2, 2, 1};
+  std::vector<int> output_shape = {1, oh, ow, 1};
+  float input_data[] = {0, 1, 2, 3};
+  float output_data[] = {0, 0.5, 1, 1, 1, 1.5, 2, 2, 2, 2.5, 3, 3, 2, 2.5, 3, 3};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(method, oh, ow, align_corners);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestResizeOpenCL, ResizeBilinearAlignFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
+TEST_F(TestOpenCL_Resize, Bilinear_AlignCorners) {
+  schema::ResizeMethod method = schema::ResizeMethod_LINEAR;
   int oh = 3;
   int ow = 3;
-  int c = 1;
   bool align_corners = true;
-  std::vector<int> shape = {n, h, w, oh, ow, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {0.0f, 0.5f, 1.0f, 1.0f, 1.5f, 2.0f, 2.0f, 2.5f, 3.0f};
-  RunTestCaseResize(shape, input_data.data(), output_data.data(), false, schema::ResizeMethod_LINEAR, align_corners);
-}
 
-TEST_F(TestResizeOpenCL, ResizeNearestNeighborFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int oh = 4;
-  int ow = 4;
-  int c = 1;
-  bool align_corners = false;
-  std::vector<int> shape = {n, h, w, oh, ow, c};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f,
-                                    2.0f, 2.0f, 3.0f, 3.0f, 2.0f, 2.0f, 3.0f, 3.0f};
-  RunTestCaseResize(shape, input_data.data(), output_data.data(), false, schema::ResizeMethod_NEAREST, align_corners);
+  std::vector<int> input_shape = {1, 2, 2, 1};
+  std::vector<int> output_shape = {1, oh, ow, 1};
+  float input_data[] = {0, 1, 2, 3};
+  float output_data[] = {0, 0.5, 1, 1, 1.5, 2, 2, 2.5, 3};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(method, oh, ow, align_corners);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestResizeOpenCL, ResizeNearestNeighborFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
+TEST_F(TestOpenCL_Resize, NEAREST) {
+  schema::ResizeMethod method = schema::ResizeMethod_NEAREST;
   int oh = 4;
   int ow = 4;
-  int c = 1;
   bool align_corners = false;
-  std::vector<int> shape = {n, h, w, oh, ow, c};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float16_t> output_data = {0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f,
-                                        2.0f, 2.0f, 3.0f, 3.0f, 2.0f, 2.0f, 3.0f, 3.0f};
-  RunTestCaseResize(shape, input_data.data(), output_data.data(), true, schema::ResizeMethod_NEAREST, align_corners);
+
+  std::vector<int> input_shape = {1, 2, 2, 1};
+  std::vector<int> output_shape = {1, oh, ow, 1};
+  float input_data[] = {0, 1, 2, 3};
+  float output_data[] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(method, oh, ow, align_corners);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
index 1ce7d6e870..aeb9ef7b68 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
@@ -13,171 +13,75 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/scale.h"
 
-namespace mindspore {
-class TestScaleOpenCL : public mindspore::CommonTest {
- public:
-  TestScaleOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseScale(void *input_data0, const std::vector<int> &input_shape, void *scale_data, void *offset_data,
-                      const std::vector<int> &weight_shape, void *output_data, const std::vector<int> &out_shape,
-                      bool enable_fp16, int axis, int act_type = schema::ActivationType_NO_ACTIVATION) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<ScaleParameter *>(malloc(sizeof(ScaleParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->axis_ = axis;
-  param->activation_type_ = act_type;
-  auto tensor_x_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-
-  auto tensor_scale_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape,
-                                   schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
-  auto tensor_scale = tensor_scale_ptr.get();
-  if (tensor_scale == nullptr) {
-    MS_LOG(ERROR) << "tensor_scale create error.";
-    return;
-  }
-  tensor_scale->set_data(scale_data);
-  auto tensor_offset_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape,
-                                   schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
-  auto tensor_offset = tensor_offset_ptr.get();
-  if (tensor_offset == nullptr) {
-    MS_LOG(ERROR) << "tensor_offset create error.";
-    return;
-  }
-  tensor_offset->set_data(offset_data);
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x, tensor_scale, tensor_offset};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel_ptr =
-    std::make_unique<kernel::ScaleOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto op_kernel = op_kernel_ptr.release();
-  if (op_kernel == nullptr) {
-    MS_LOG(ERROR) << "op_kernel create error.";
-    return;
-  }
-  op_kernel->Init();
-  inputs[0]->MallocData(allocator);
+class TestOpenCL_Scale : public CommonTest {};
 
-  std::vector<kernel::LiteKernel *> kernels{op_kernel};
-
-  std::vector<lite::Tensor *> inputs_g{tensor_x};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size);
-  pGraph->Run();
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
-  }
-
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-  MS_LOG(INFO) << "TestScale passed";
+namespace {
+// PrimitiveType_Scale: src/ops/populate/scale_populate.cc
+OpParameter *CreateParameter(int axis, int activation_type = schema::ActivationType_NO_ACTIVATION) {
+  auto *param = test::CreateParameter<ScaleParameter>(schema::PrimitiveType_Scale);
+  param->axis_ = axis;
+  param->activation_type_ = activation_type;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestScaleOpenCL, ScaleAxis3Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> weight_shape = {c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> scale_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float> offset_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {2.0f, 6.0f, 12.0f, 5.0f, 12.0f, 21.0f, 8.0f, 18.0f, 30.0f, 11.0f, 24.0f, 39.0f};
-  RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape,
-                   output_data.data(), out_shape, false, 3);
+TEST_F(TestOpenCL_Scale, Axis1) {
+  int axis = 1;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> weight_shape = {input_shape[axis]};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float scale_data[] = {1, 2};
+  float offset_data[] = {1, 2};
+  float output_data[] = {2, 3, 4, 5, 6, 7, 16, 18, 20, 22, 24, 26};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, scale_data, CONST_TENSOR},
+              {weight_shape, offset_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestScaleOpenCL, ScaleAxis1Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> weight_shape = {h};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> scale_data = {1.0f, 2.0f};
-  std::vector<float> offset_data = {1.0f, 2.0f};
-  std::vector<float> output_data = {2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f, 26.0f};
-  RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape,
-                   output_data.data(), out_shape, false, 1);
+TEST_F(TestOpenCL_Scale, Axis3) {
+  int axis = 3;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> weight_shape = {input_shape[axis]};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float scale_data[] = {1, 2, 3};
+  float offset_data[] = {1, 2, 3};
+  float output_data[] = {2, 6, 12, 5, 12, 21, 8, 18, 30, 11, 24, 39};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, scale_data, CONST_TENSOR},
+              {weight_shape, offset_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestScaleOpenCL, ScaleAxis3ReLU6Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> weight_shape = {c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float> scale_data = {1.0f, 2.0f, -1.0f};
-  std::vector<float> offset_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {2.0f, 6.0f, 0.0f, 5.0f, 6.0f, 0.0f, 6.0f, 6.0f, 0.0f, 6.0f, 6.0f, 0.0f};
-  RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape,
-                   output_data.data(), out_shape, false, 3, schema::ActivationType_RELU6);
+TEST_F(TestOpenCL_Scale, Axis3RELU6) {
+  int axis = 3;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> weight_shape = {input_shape[axis]};
+  std::vector<int> output_shape = input_shape;
+  float input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  float scale_data[] = {1, 2, -1};
+  float offset_data[] = {1, 2, 3};
+  float output_data[] = {2, 6, 0, 5, 6, 0, 6, 6, 0, 6, 6, 0};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis, schema::ActivationType_RELU6);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, scale_data, CONST_TENSOR},
+              {weight_shape, offset_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestScaleOpenCL, ScaleAxis3Fp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  std::vector<int> in_shape0 = {n, h, w, c};
-  std::vector<int> weight_shape = {c};
-  std::vector<int> out_shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
-  std::vector<float16_t> scale_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float16_t> offset_data = {1.0f, 2.0f, 3.0f};
-  std::vector<float16_t> output_data = {2.0f, 6.0f, 12.0f, 5.0f, 12.0f, 21.0f, 8.0f, 18.0f, 30.0f, 11.0f, 24.0f, 39.0f};
-  RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape,
-                   output_data.data(), out_shape, true, 3);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/shape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/shape_tests.cc
new file mode 100644
index 0000000000..f2fca060ab
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/shape_tests.cc
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/shape.h"
+
+namespace mindspore::lite::opencl::test {
+
+class TestOpenCL_Shape : public CommonTest {};
+
+namespace {
+// PrimitiveType_Shape: src/ops/populate/shape_populate.cc
+OpParameter *CreateParameter() {
+  auto *param = test::CreateParameter<ShapeParameter>(schema::PrimitiveType_Shape);
+  return reinterpret_cast<OpParameter *>(param);
+}
+}  // namespace
+
+TEST_F(TestOpenCL_Shape, test0) {
+  std::vector<int> input_shape = {2, 4};
+  std::vector<int> output_shape = {2};
+  float input_data[] = {-0.4045, -0.0924, -0.617, -0.10114, -0.9893, 0.3342, 2.445, -2.182};
+  float output_data[] = {2, 4};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
+}
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc
index 692ee9b073..d87cc1dbcd 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc
@@ -13,21 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "common/common_test.h"
 #include "nnacl/slice_parameter.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
 
-namespace mindspore {
+namespace mindspore::lite::opencl::test {
 
-class TestSliceOpenCL : public mindspore::CommonTest {};
+class TestOpenCL_Slice : public CommonTest {};
 
-OpParameter *GetSliceParameter(const std::vector<int> &begin, const std::vector<int> &size) {
-  auto param = static_cast<SliceParameter *>(malloc(sizeof(SliceParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "SliceParameter create error.";
-    return nullptr;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_Slice;
+namespace {
+// PrimitiveType_Slice: src/ops/populate/slice_populate.cc
+OpParameter *CreateParameter(const std::vector<int> &begin, const std::vector<int> &size) {
+  auto *param = test::CreateParameter<SliceParameter>(schema::PrimitiveType_Slice);
   param->param_length_ = begin.size();
   for (int i = 0; i < begin.size(); ++i) {
     param->begin_[i] = begin[i];
@@ -35,21 +31,22 @@ OpParameter *GetSliceParameter(const std::vector<int> &begin, const std::vector<
   }
   return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestSliceOpenCL, 4D) {
+TEST_F(TestOpenCL_Slice, 4D) {  // slices a {1, 2, 2, 8} input down to {1, 2, 2, 5}
   float input_data[] = {-0.45816937, 0.92391545,  -0.9135602, -1.4002057, 1.1080881,  0.40712625,  -0.28128958,
                         0.09470133,  0.19801073,  0.04927751, -1.2808367, 0.1470597,  0.03393711,  -0.33282498,
                         -1.0433807,  -1.3678077,  -0.6423931, 0.5584889,  0.28965706, 0.5343769,   0.75480366,
                         -1.9328151,  -0.48714373, 1.711132,   -1.8871949, -0.2987629, -0.14000037, -0.080552,
                         0.95056856,  -0.06886655, 0.5316237,  0.05787678};
-  float expect_data[] = {-0.9135602,  -1.4002057,  1.1080881,  0.40712625, -0.28128958, -1.2808367, 0.1470597,
+  float output_data[] = {-0.9135602,  -1.4002057,  1.1080881,  0.40712625, -0.28128958, -1.2808367, 0.1470597,
                          0.03393711,  -0.33282498, -1.0433807, 0.28965706, 0.5343769,   0.75480366, -1.9328151,
                          -0.48714373, -0.14000037, -0.080552,  0.95056856, -0.06886655, 0.5316237};
-  auto param = GetSliceParameter({0, 0, 0, 2}, {1, 2, 2, 5});
-  TestMain({{{1, 2, 2, 8}, input_data, Tensor::Category::VAR}}, {{1, 2, 2, 5}, expect_data}, param, false);
+  auto param = CreateParameter({0, 0, 0, 2}, {1, 2, 2, 5});  // begin = {0, 0, 0, 2}, size = {1, 2, 2, 5}
+  TestMain({{{1, 2, 2, 8}, input_data, VAR}}, {{1, 2, 2, 5}, output_data}, param, false);
 }
 
-TEST_F(TestSliceOpenCL, tflite_cpu) {
+TEST_F(TestOpenCL_Slice, test0) {
   std::vector<std::tuple<std::string, std::vector<int>, std::vector<int>, std::vector<float>, std::vector<float>,
                          std::vector<int>, std::vector<int>>>
     cases = {{"In1D", {4}, {2}, {1, 2, 3, 4}, {2, 3}, {1}, {2}},
@@ -146,18 +143,16 @@ TEST_F(TestSliceOpenCL, tflite_cpu) {
     auto &input_shape = std::get<1>(case_);
     auto &output_shape = std::get<2>(case_);
     auto &input_data = std::get<3>(case_);
-    auto &expect_data = std::get<4>(case_);
+    auto &output_data = std::get<4>(case_);
     auto &begin = std::get<5>(case_);
     auto &size = std::get<6>(case_);
 
     std::cout << name << std::endl;
-    auto *param = GetSliceParameter(begin, size);
-    TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param,
-             false);
-    param = GetSliceParameter(begin, size);
-    TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param,
-             true);
+    auto *param = CreateParameter(begin, size);
+    TestMain({{input_shape, input_data.data(), VAR}}, {output_shape, output_data.data()}, param, false);
+    param = CreateParameter(begin, size);
+    TestMain({{input_shape, input_data.data(), VAR}}, {output_shape, output_data.data()}, param, true);
   }
 }  // namespace mindspore
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
index 893da87506..b696111e3b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
@@ -13,157 +13,62 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/softmax_parameter.h"
 
-namespace mindspore {
-class TestSoftmaxOpenCL : public mindspore::CommonTest {
- public:
-  TestSoftmaxOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
-                        int axis) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  int n, h, w, c;
-  bool is_2d = false;
-  if (shape.size() == 2) {
-    is_2d = true;
-    h = w = 1;
-    n = shape[0];
-    c = shape[1];
-  } else {
-    n = shape[0];
-    h = shape[1];
-    w = shape[2];
-    c = shape[3];
-  }
-  std::vector<int> input_shape = {n, h, w, c};
-  if (is_2d) {
-    input_shape = {n, c};
-  }
-  auto input_format = is_2d ? schema::Format_NC : schema::Format_NHWC;
-  auto input_dtype = enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(input_dtype), input_shape, input_format);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(input_dtype), input_shape, input_format);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto opParameter = static_cast<SoftmaxParameter *>(malloc(sizeof(SoftmaxParameter)));
-  if (opParameter == nullptr) {
-    MS_LOG(ERROR) << "opParameter create error.";
-    return;
-  }
-  opParameter->axis_ = axis;
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::SoftmaxOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
+class TestOpenCL_SoftMax : public CommonTest {};  // fixture for the SoftMax TEST_F cases in this file
 
-  MS_LOG(INFO) << "Test Softmax passed";
+namespace {
+// Test helper: builds the OpParameter for PrimitiveType_SoftMax (cf. src/ops/populate/softmax_populate.cc).
+OpParameter *CreateParameter(int axis) {
+  auto *param = test::CreateParameter<SoftmaxParameter>(schema::PrimitiveType_SoftMax);
+  param->axis_ = axis;  // the only field this helper sets itself
+  return reinterpret_cast<OpParameter *>(param);  // returned as the generic OpParameter pointer
 }
-
-TEST_F(TestSoftmaxOpenCL, Softmax2DFp32) {
-  int n = 1;
-  int c = 10;
-  std::vector<int> shape = {n, c};
-  std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> output_data = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f};
-
-  RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), false, 1);
-}
-
-TEST_F(TestSoftmaxOpenCL, Softmax2DFp16) {
-  int n = 1;
-  int c = 10;
-  std::vector<int> shape = {n, c};
-  std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> output_data = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f};
-
-  RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), true, 1);
+}  // namespace
+
+TEST_F(TestOpenCL_SoftMax, 2D_axis1) {  // uniform {1, 10} input, axis 1: every expected output is 1/10
+  int axis = 1;
+  std::vector<int> input_shape = {1, 10};
+  std::vector<int> output_shape = input_shape;  // output keeps the input shape
+  float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  float output_data[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);  // fresh parameter for each TestMain call
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
+             fp16_enable ? 2e-2 : 1e-5);  // presumably the compare tolerance; looser when fp16_enable
+  }
 }
 
-TEST_F(TestSoftmaxOpenCL, Softmax4DFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 1;
-  int c = 5;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float> output_data = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
-
-  RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), false, 3);
+TEST_F(TestOpenCL_SoftMax, 4D_axis3) {  // uniform {1, 2, 1, 5} input, axis 3: every expected output is 1/5
+  int axis = 3;
+  std::vector<int> input_shape = {1, 2, 1, 5};
+  std::vector<int> output_shape = input_shape;  // output keeps the input shape
+  float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  float output_data[] = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);  // fresh parameter for each TestMain call
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
+             fp16_enable ? 2e-2 : 1e-5);  // presumably the compare tolerance; looser when fp16_enable
+  }
 }
 
-TEST_F(TestSoftmaxOpenCL, Softmax4DFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 1;
-  int c = 5;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  std::vector<float16_t> output_data = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
-
-  RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), true, 3);
+TEST_F(TestOpenCL_SoftMax, 4D_axis1) {  // uniform {1, 2, 1, 1} input, axis 1: every expected output is 1/2
+  int axis = 1;
+  std::vector<int> input_shape = {1, 2, 1, 1};
+  std::vector<int> output_shape = input_shape;  // output keeps the input shape
+  float input_data[] = {1, 1};
+  float output_data[] = {0.5, 0.5};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);  // fresh parameter for each TestMain call
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
+             fp16_enable ? 2e-2 : 1e-5);  // presumably the compare tolerance; looser when fp16_enable
+  }
 }
 
-TEST_F(TestSoftmaxOpenCL, Softmax4DAxis1Fp32) {
-  int n = 1;
-  int h = 2;
-  int w = 1;
-  int c = 1;
-  std::vector<int> shape = {n, h, w, c};
-  std::vector<float> input_data = {1.0f, 1.0f};
-  std::vector<float> output_data = {0.5f, 0.5f};
-
-  RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), false, 1);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_batch_nd_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_batch_nd_tests.cc
index 593e88ef10..c8c127837d 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_batch_nd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_batch_nd_tests.cc
@@ -13,100 +13,43 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "src/runtime/kernel/opencl/utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/fp32/space_to_batch_fp32.h"
 
-namespace mindspore {
-class TestSpaceToBatchNDOpenCL : public mindspore::CommonTest {
- public:
-  TestSpaceToBatchNDOpenCL() {}
-};
-template <typename T>
-void test_main_space_to_batch_nd(void *input_data, void *correct_data, const std::vector<int> &input_shape,
-                                 SpaceToBatchParameter *param, TypeId data_type, schema::Format format) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime_wrap = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = ocl_runtime_wrap.GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
+namespace mindspore::lite::opencl::test {
 
-  std::vector<int> output_shape = input_shape;
-  output_shape[0] = input_shape[0] * param->block_sizes_[0] * param->block_sizes_[1];
-  output_shape[1] = (input_shape[1] + param->paddings_[0] + param->paddings_[1]) / param->block_sizes_[0];
-  output_shape[2] = (input_shape[2] + +param->paddings_[2] + param->paddings_[3]) / param->block_sizes_[1];
-
-  auto tensor_a = lite::Tensor(TypeId(data_type), input_shape, format);
-  auto tensor_c = lite::Tensor(TypeId(data_type), output_shape, format);
-  std::vector<lite::Tensor *> inputs{&tensor_a};
-  std::vector<lite::Tensor *> outputs{&tensor_c};
-  size_t input_size = tensor_a.Size();
-
-  auto *pkernel =
-    new (std::nothrow) kernel::SpaceToBatchNDOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (pkernel == nullptr) {
-    MS_LOG(INFO) << "new SpaceToBatchNDOpenCLKernel failed ";
-    return;
-  }
-  pkernel->Init();
+class TestOpenCL_SpaceToBatch : public CommonTest {};  // fixture for the SpaceToBatch TEST_F cases in this file
 
-  // to do allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
+namespace {
+// Test helper: builds the OpParameter for PrimitiveType_SpaceToBatchND
+OpParameter *CreateParameter(const std::vector<int> &block_sizes, const std::vector<int> &paddings) {
+  auto *param = test::CreateParameter<SpaceToBatchParameter>(schema::PrimitiveType_SpaceToBatchND);
+  EXPECT_LE(block_sizes.size(), 4);  // NOTE(review): EXPECT_* is non-fatal; the copy loops still run on failure
+  EXPECT_LE(paddings.size(), 4);
+  for (int i = 0; i < block_sizes.size(); ++i) {  // copy every block size into the parameter
+    param->block_sizes_[i] = block_sizes[i];
   }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{pkernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    delete pkernel;
-    MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
-    return;
+  for (int i = 0; i < paddings.size(); ++i) {  // copy every padding value into the parameter
+    param->paddings_[i] = paddings[i];
   }
-  sub_graph->Init();
-
-  MS_LOG(INFO) << " init tensors ";
-  T *input_ptr = reinterpret_cast<T *>(inputs[0]->MutableData());
-  memcpy(input_ptr, input_data, input_size);
-  std::cout << "==================input data================" << std::endl;
-  for (auto i = 0; i < inputs[0]->ElementsNum(); ++i) {
-    std::cout << input_ptr[i] << ", ";
-  }
-  std::cout << std::endl;
-
-  sub_graph->Run();
+  return reinterpret_cast<OpParameter *>(param);  // returned as the generic OpParameter pointer
+}
 
-  auto *output_data = reinterpret_cast<T *>(outputs[0]->MutableData());
-  std::cout << "==================output data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << output_data[i] << ", ";
-  }
-  std::cout << std::endl;
-  std::cout << "==================correct data================" << std::endl;
-  for (auto i = 0; i < outputs[0]->ElementsNum(); ++i) {
-    std::cout << static_cast<T *>(correct_data)[i] << ", ";
-  }
-  std::cout << std::endl;
-  CommonTest::CompareOutputData<T>(output_data, static_cast<T *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
-  delete sub_graph;
+std::vector<int> InferShape(const std::vector<int> &input_shape, const std::vector<int> &block_sizes,
+                            const std::vector<int> &paddings) {  // derives the expected output shape
+  std::vector<int> output_shape = input_shape;  // dims beyond index 2 stay as in the input
+  output_shape[0] = input_shape[0] * block_sizes[0] * block_sizes[1];
+  output_shape[1] = (input_shape[1] + paddings[0] + paddings[1]) / block_sizes[0];  // integer division
+  output_shape[2] = (input_shape[2] + +paddings[2] + paddings[3]) / block_sizes[1];  // 2nd + is a no-op unary plus
+  return output_shape;
 }
-TEST_F(TestSpaceToBatchNDOpenCL, NHWC4H2W2Pad2222) {
+}  // namespace
+
+TEST_F(TestOpenCL_SpaceToBatch, H2W2Pad2222) {
   std::vector<int> input_shape{1, 6, 6, 4};
-  SpaceToBatchParameter *param = std::make_unique<SpaceToBatchParameter>().release();
-  if (param == nullptr) {
-    return;
-  }
-  param->block_sizes_[0] = 2;
-  param->block_sizes_[1] = 2;
-  param->paddings_[0] = 2;
-  param->paddings_[1] = 2;
-  param->paddings_[2] = 2;
-  param->paddings_[3] = 2;
+  std::vector<int> block_sizes = {2, 2};
+  std::vector<int> paddings = {2, 2, 2, 2};
+  auto output_shape = InferShape(input_shape, block_sizes, paddings);
   float input_data[] = {172, 47,  117, 192, 67,  251, 195, 103, 9,   211, 21,  242, 36,  87,  70,  216, 88,  140,
                         58,  193, 230, 39,  87,  174, 88,  81,  165, 25,  77,  72,  9,   148, 115, 208, 243, 197,
                         254, 79,  175, 192, 82,  99,  216, 177, 243, 29,  147, 147, 142, 167, 32,  193, 9,   185,
@@ -115,7 +58,7 @@ TEST_F(TestSpaceToBatchNDOpenCL, NHWC4H2W2Pad2222) {
                         119, 11,  174, 82,  91,  128, 142, 99,  53,  140, 121, 170, 84,  203, 68,  6,   196, 47,
                         127, 244, 131, 204, 100, 180, 232, 78,  143, 148, 227, 186, 23,  207, 141, 117, 85,  48,
                         49,  69,  169, 163, 192, 95,  197, 94,  0,   113, 178, 36,  162, 48,  93,  131, 98,  42};
-  float correct_data[] = {
+  float output_data[] = {
     0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   172, 47, 117, 192, 9,   211, 21,  242, 88,  140, 58,  193, 0,   0,   0,   0,   0,   0,   0,   0,   142, 167,
     32,  193, 31, 202, 244, 151, 183, 28,  34,  128, 0,   0,   0,   0,   0,   0,   0,   0,   142, 99,  53,  140, 68,
@@ -134,51 +77,10 @@ TEST_F(TestSpaceToBatchNDOpenCL, NHWC4H2W2Pad2222) {
     132, 105, 42, 65,  231, 169, 57,  174, 82,  91,  128, 0,   0,   0,   0,   0,   0,   0,   0,   85,  48,  49,  69,
     197, 94,  0,  113, 93,  131, 98,  42,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,  0,   0,   0,   0,   0,   0};
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NHWC;
-  test_main_space_to_batch_nd<float>(input_data, correct_data, input_shape, param, data_type, format);
-}
-TEST_F(TestSpaceToBatchNDOpenCL, NC4HW4H2W2Pad2222) {
-  std::vector<int> input_shape{1, 6, 6, 4};
-  SpaceToBatchParameter *param = std::make_unique<SpaceToBatchParameter>().release();
-  if (param == nullptr) {
-    return;
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_sizes, paddings);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-  param->block_sizes_[0] = 2;
-  param->block_sizes_[1] = 2;
-  param->paddings_[0] = 2;
-  param->paddings_[1] = 2;
-  param->paddings_[2] = 2;
-  param->paddings_[3] = 2;
-  float input_data[] = {172, 47,  117, 192, 67,  251, 195, 103, 9,   211, 21,  242, 36,  87,  70,  216, 88,  140,
-                        58,  193, 230, 39,  87,  174, 88,  81,  165, 25,  77,  72,  9,   148, 115, 208, 243, 197,
-                        254, 79,  175, 192, 82,  99,  216, 177, 243, 29,  147, 147, 142, 167, 32,  193, 9,   185,
-                        127, 32,  31,  202, 244, 151, 163, 254, 203, 114, 183, 28,  34,  128, 128, 164, 53,  133,
-                        38,  232, 244, 17,  79,  132, 105, 42,  186, 31,  120, 1,   65,  231, 169, 57,  35,  102,
-                        119, 11,  174, 82,  91,  128, 142, 99,  53,  140, 121, 170, 84,  203, 68,  6,   196, 47,
-                        127, 244, 131, 204, 100, 180, 232, 78,  143, 148, 227, 186, 23,  207, 141, 117, 85,  48,
-                        49,  69,  169, 163, 192, 95,  197, 94,  0,   113, 178, 36,  162, 48,  93,  131, 98,  42};
-  float correct_data[] = {
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   172, 47, 117, 192, 9,   211, 21,  242, 88,  140, 58,  193, 0,   0,   0,   0,   0,   0,   0,   0,   142, 167,
-    32,  193, 31, 202, 244, 151, 183, 28,  34,  128, 0,   0,   0,   0,   0,   0,   0,   0,   142, 99,  53,  140, 68,
-    6,   196, 47, 100, 180, 232, 78,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   67,  251, 195, 103, 36,  87,  70,  216, 230, 39,  87,  174, 0,   0,
-    0,   0,   0,  0,   0,   0,   9,   185, 127, 32,  163, 254, 203, 114, 128, 164, 53,  133, 0,   0,   0,   0,   0,
-    0,   0,   0,  121, 170, 84,  203, 127, 244, 131, 204, 143, 148, 227, 186, 0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   88,  81,  165, 25,  115, 208,
-    243, 197, 82, 99,  216, 177, 0,   0,   0,   0,   0,   0,   0,   0,   38,  232, 244, 17,  186, 31,  120, 1,   35,
-    102, 119, 11, 0,   0,   0,   0,   0,   0,   0,   0,   23,  207, 141, 117, 169, 163, 192, 95,  178, 36,  162, 48,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   77, 72,  9,   148, 254, 79,  175, 192, 243, 29,  147, 147, 0,   0,   0,   0,   0,   0,   0,   0,   79,
-    132, 105, 42, 65,  231, 169, 57,  174, 82,  91,  128, 0,   0,   0,   0,   0,   0,   0,   0,   85,  48,  49,  69,
-    197, 94,  0,  113, 93,  131, 98,  42,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-    0,   0,   0,  0,   0,   0,   0,   0,   0};
-  TypeId data_type = kNumberTypeFloat32;
-  schema::Format format = schema::Format_NCHW;
-  test_main_space_to_batch_nd<float>(input_data, correct_data, input_shape, param, data_type, format);
 }
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_depth_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_depth_tests.cc
index 83ba480679..836abfed04 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_depth_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/space_to_depth_tests.cc
@@ -13,256 +13,154 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/fp32/space_to_depth_fp32.h"
 
-namespace mindspore {
-class TestSpaceToDepthOpenCL : public mindspore::CommonTest {
- public:
-  TestSpaceToDepthOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestCaseSpaceToDepth(const std::vector<int> &shape_in, const std::vector<int> &shape_out, void *input_data,
-                             void *output_data, bool enable_fp16, int block_size) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  auto param = static_cast<SpaceToDepthParameter *>(malloc(sizeof(SpaceToDepthParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->block_size_ = block_size;
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     shape_in, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  auto tensor_out_ptr =
-    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), shape_out);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::SpaceToDepthOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-
-  MS_LOG(INFO) << "Test SpaceToDepth passed";
-}
-
-TEST_F(TestSpaceToDepthOpenCL, AlignTest1Fp32) {
-  std::vector<int> shape_in = {1, 2, 2, 4};
-  std::vector<int> shape_out = {1, 1, 1, 16};
-  std::vector<float> input_data = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                                   9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-  std::vector<float> output_data = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                                    9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 2);
-}
-
-TEST_F(TestSpaceToDepthOpenCL, AlignTest1Fp16) {
-  std::vector<int> shape_in = {1, 2, 2, 4};
-  std::vector<int> shape_out = {1, 1, 1, 16};
-  std::vector<float16_t> input_data = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                                       9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-  std::vector<float16_t> output_data = {1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
-                                        9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
+class TestOpenCL_SpaceToDepth : public CommonTest {};  // fixture for the SpaceToDepth TEST_F cases in this file
 
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), true, 2);
-}
-
-TEST_F(TestSpaceToDepthOpenCL, AlignTest2Fp32) {
-  std::vector<int> shape_in = {1, 4, 4, 4};
-  std::vector<int> shape_out = {1, 2, 2, 16};
-  std::vector<float> input_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
-    16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
-    32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f,
-    48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f};
-  std::vector<float> output_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
-    8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
-    32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
-    40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 2);
+namespace {
+// Test helper: builds the OpParameter for PrimitiveType_SpaceToDepth (cf. src/ops/populate/space_to_depth_populate.cc).
+OpParameter *CreateParameter(int block_size) {
+  auto *param = test::CreateParameter<SpaceToDepthParameter>(schema::PrimitiveType_SpaceToDepth);
+  param->block_size_ = block_size;  // the only field this helper sets itself
+  return reinterpret_cast<OpParameter *>(param);  // returned as the generic OpParameter pointer
 }
-
-TEST_F(TestSpaceToDepthOpenCL, AlignTest2Fp16) {
-  std::vector<int> shape_in = {1, 4, 4, 4};
-  std::vector<int> shape_out = {1, 2, 2, 16};
-  std::vector<float16_t> input_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
-    16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
-    32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f,
-    48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f};
-  std::vector<float16_t> output_data = {
-    0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
-    8.0f,  9.0f,  10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
-    32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
-    40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), true, 2);
+}  // namespace
+
+TEST_F(TestOpenCL_SpaceToDepth, AlignTest1) {  // block_size 2: {1, 2, 2, 4} input -> {1, 1, 1, 16} output
+  int block_size = 2;
+  std::vector<int> input_shape = {1, 2, 2, 4};
+  std::vector<int> output_shape = {1, 1, 1, 16};
+  float input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float output_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};  // same values, same order as input
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);  // fresh parameter for each TestMain call
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, AlignTest3Fp32) {
-  std::vector<int> shape_in = {1, 6, 6, 4};
-  std::vector<int> shape_out = {1, 2, 2, 36};
-  std::vector<float> input_data = {
-    0.0f,   1.0f,   2.0f,   3.0f,   4.0f,   5.0f,   6.0f,   7.0f,   8.0f,   9.0f,   10.0f,  11.0f,  12.0f,  13.0f,
-    14.0f,  15.0f,  16.0f,  17.0f,  18.0f,  19.0f,  20.0f,  21.0f,  22.0f,  23.0f,  24.0f,  25.0f,  26.0f,  27.0f,
-    28.0f,  29.0f,  30.0f,  31.0f,  32.0f,  33.0f,  34.0f,  35.0f,  36.0f,  37.0f,  38.0f,  39.0f,  40.0f,  41.0f,
-    42.0f,  43.0f,  44.0f,  45.0f,  46.0f,  47.0f,  48.0f,  49.0f,  50.0f,  51.0f,  52.0f,  53.0f,  54.0f,  55.0f,
-    56.0f,  57.0f,  58.0f,  59.0f,  60.0f,  61.0f,  62.0f,  63.0f,  64.0f,  65.0f,  66.0f,  67.0f,  68.0f,  69.0f,
-    70.0f,  71.0f,  72.0f,  73.0f,  74.0f,  75.0f,  76.0f,  77.0f,  78.0f,  79.0f,  80.0f,  81.0f,  82.0f,  83.0f,
-    84.0f,  85.0f,  86.0f,  87.0f,  88.0f,  89.0f,  90.0f,  91.0f,  92.0f,  93.0f,  94.0f,  95.0f,  96.0f,  97.0f,
-    98.0f,  99.0f,  100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 108.0f, 109.0f, 110.0f, 111.0f,
-    112.0f, 113.0f, 114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 120.0f, 121.0f, 122.0f, 123.0f, 124.0f, 125.0f,
-    126.0f, 127.0f, 128.0f, 129.0f, 130.0f, 131.0f, 132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f,
-    140.0f, 141.0f, 142.0f, 143.0f};
-  std::vector<float> output_data = {
-    0.0f,   1.0f,   2.0f,   3.0f,   4.0f,   5.0f,   6.0f,   7.0f,   8.0f,   9.0f,   10.0f,  11.0f,  24.0f,  25.0f,
-    26.0f,  27.0f,  28.0f,  29.0f,  30.0f,  31.0f,  32.0f,  33.0f,  34.0f,  35.0f,  48.0f,  49.0f,  50.0f,  51.0f,
-    52.0f,  53.0f,  54.0f,  55.0f,  56.0f,  57.0f,  58.0f,  59.0f,  12.0f,  13.0f,  14.0f,  15.0f,  16.0f,  17.0f,
-    18.0f,  19.0f,  20.0f,  21.0f,  22.0f,  23.0f,  36.0f,  37.0f,  38.0f,  39.0f,  40.0f,  41.0f,  42.0f,  43.0f,
-    44.0f,  45.0f,  46.0f,  47.0f,  60.0f,  61.0f,  62.0f,  63.0f,  64.0f,  65.0f,  66.0f,  67.0f,  68.0f,  69.0f,
-    70.0f,  71.0f,  72.0f,  73.0f,  74.0f,  75.0f,  76.0f,  77.0f,  78.0f,  79.0f,  80.0f,  81.0f,  82.0f,  83.0f,
-    96.0f,  97.0f,  98.0f,  99.0f,  100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 120.0f, 121.0f,
-    122.0f, 123.0f, 124.0f, 125.0f, 126.0f, 127.0f, 128.0f, 129.0f, 130.0f, 131.0f, 84.0f,  85.0f,  86.0f,  87.0f,
-    88.0f,  89.0f,  90.0f,  91.0f,  92.0f,  93.0f,  94.0f,  95.0f,  108.0f, 109.0f, 110.0f, 111.0f, 112.0f, 113.0f,
-    114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f,
-    140.0f, 141.0f, 142.0f, 143.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 3);
+TEST_F(TestOpenCL_SpaceToDepth, AlignTest2) {
+  int block_size = 2;
+  std::vector<int> input_shape = {1, 4, 4, 4};
+  std::vector<int> output_shape = {1, 2, 2, 16};
+  float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                        44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+  float output_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23, 8,  9,  10, 11, 12, 13,
+                         14, 15, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51,
+                         52, 53, 54, 55, 40, 41, 42, 43, 44, 45, 46, 47, 56, 57, 58, 59, 60, 61, 62, 63};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, NotAlignTest1Fp32) {
-  std::vector<int> shape_in = {1, 2, 2, 1};
-  std::vector<int> shape_out = {1, 1, 1, 4};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 2);
+TEST_F(TestOpenCL_SpaceToDepth, AlignTest3) {
+  int block_size = 3;
+  std::vector<int> input_shape = {1, 6, 6, 4};
+  std::vector<int> output_shape = {1, 2, 2, 36};
+  float input_data[] = {0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,
+                        18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+                        36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
+                        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+                        72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+                        90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+                        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
+                        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143};
+  float output_data[] = {0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  24,  25,  26,  27,  28,  29,
+                         30,  31,  32,  33,  34,  35,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+                         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  36,  37,  38,  39,  40,  41,
+                         42,  43,  44,  45,  46,  47,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+                         72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  96,  97,  98,  99,  100, 101,
+                         102, 103, 104, 105, 106, 107, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+                         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  108, 109, 110, 111, 112, 113,
+                         114, 115, 116, 117, 118, 119, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, NotAlignTest1Fp16) {
-  std::vector<int> shape_in = {1, 2, 2, 1};
-  std::vector<int> shape_out = {1, 1, 1, 4};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f};
-  std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f};
+TEST_F(TestOpenCL_SpaceToDepth, NotAlignTest1) {
+  int block_size = 2;
+  std::vector<int> input_shape = {1, 2, 2, 1};
+  std::vector<int> output_shape = {1, 1, 1, 4};
+  float input_data[] = {0, 1, 2, 3};
+  float output_data[] = {0, 1, 2, 3};
 
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), true, 2);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, NotAlignTest2Fp32) {
-  std::vector<int> shape_in = {1, 2, 2, 3};
-  std::vector<int> shape_out = {1, 1, 1, 12};
-  std::vector<float> input_data = {
-    0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f,
+TEST_F(TestOpenCL_SpaceToDepth, NotAlignTest2) {
+  int block_size = 2;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = {1, 1, 1, 12};
+  float input_data[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
   };
-  std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  float output_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
 
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 2);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, NotAlignTest3Fp32) {
-  std::vector<int> shape_in = {1, 4, 4, 3};
-  std::vector<int> shape_out = {1, 2, 2, 12};
-  std::vector<float> input_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f,
-                                   12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
-                                   24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
-                                   36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f};
-  std::vector<float> output_data = {0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f,
-                                    6.0f,  7.0f,  8.0f,  9.0f,  10.0f, 11.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
-                                    24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f,
-                                    30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 2);
+TEST_F(TestOpenCL_SpaceToDepth, NotAlignTest3) {
+  int block_size = 2;
+  std::vector<int> input_shape = {1, 4, 4, 3};
+  std::vector<int> output_shape = {1, 2, 2, 12};
+  float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47};
+  float output_data[] = {0,  1,  2,  3,  4,  5,  12, 13, 14, 15, 16, 17, 6,  7,  8,  9,
+                         10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 36, 37,
+                         38, 39, 40, 41, 30, 31, 32, 33, 34, 35, 42, 43, 44, 45, 46, 47};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestSpaceToDepthOpenCL, NotAlignTest4Fp32) {
-  std::vector<int> shape_in = {1, 6, 6, 6};
-  std::vector<int> shape_out = {1, 2, 2, 54};
-  std::vector<float> input_data = {
-    0.0f,   1.0f,   2.0f,   3.0f,   4.0f,   5.0f,   6.0f,   7.0f,   8.0f,   9.0f,   10.0f,  11.0f,  12.0f,  13.0f,
-    14.0f,  15.0f,  16.0f,  17.0f,  18.0f,  19.0f,  20.0f,  21.0f,  22.0f,  23.0f,  24.0f,  25.0f,  26.0f,  27.0f,
-    28.0f,  29.0f,  30.0f,  31.0f,  32.0f,  33.0f,  34.0f,  35.0f,  36.0f,  37.0f,  38.0f,  39.0f,  40.0f,  41.0f,
-    42.0f,  43.0f,  44.0f,  45.0f,  46.0f,  47.0f,  48.0f,  49.0f,  50.0f,  51.0f,  52.0f,  53.0f,  54.0f,  55.0f,
-    56.0f,  57.0f,  58.0f,  59.0f,  60.0f,  61.0f,  62.0f,  63.0f,  64.0f,  65.0f,  66.0f,  67.0f,  68.0f,  69.0f,
-    70.0f,  71.0f,  72.0f,  73.0f,  74.0f,  75.0f,  76.0f,  77.0f,  78.0f,  79.0f,  80.0f,  81.0f,  82.0f,  83.0f,
-    84.0f,  85.0f,  86.0f,  87.0f,  88.0f,  89.0f,  90.0f,  91.0f,  92.0f,  93.0f,  94.0f,  95.0f,  96.0f,  97.0f,
-    98.0f,  99.0f,  100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 108.0f, 109.0f, 110.0f, 111.0f,
-    112.0f, 113.0f, 114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 120.0f, 121.0f, 122.0f, 123.0f, 124.0f, 125.0f,
-    126.0f, 127.0f, 128.0f, 129.0f, 130.0f, 131.0f, 132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f,
-    140.0f, 141.0f, 142.0f, 143.0f, 144.0f, 145.0f, 146.0f, 147.0f, 148.0f, 149.0f, 150.0f, 151.0f, 152.0f, 153.0f,
-    154.0f, 155.0f, 156.0f, 157.0f, 158.0f, 159.0f, 160.0f, 161.0f, 162.0f, 163.0f, 164.0f, 165.0f, 166.0f, 167.0f,
-    168.0f, 169.0f, 170.0f, 171.0f, 172.0f, 173.0f, 174.0f, 175.0f, 176.0f, 177.0f, 178.0f, 179.0f, 180.0f, 181.0f,
-    182.0f, 183.0f, 184.0f, 185.0f, 186.0f, 187.0f, 188.0f, 189.0f, 190.0f, 191.0f, 192.0f, 193.0f, 194.0f, 195.0f,
-    196.0f, 197.0f, 198.0f, 199.0f, 200.0f, 201.0f, 202.0f, 203.0f, 204.0f, 205.0f, 206.0f, 207.0f, 208.0f, 209.0f,
-    210.0f, 211.0f, 212.0f, 213.0f, 214.0f, 215.0f};
-  std::vector<float> output_data = {
-    0.0f,   1.0f,   2.0f,   3.0f,   4.0f,   5.0f,   6.0f,   7.0f,   8.0f,   9.0f,   10.0f,  11.0f,  12.0f,  13.0f,
-    14.0f,  15.0f,  16.0f,  17.0f,  36.0f,  37.0f,  38.0f,  39.0f,  40.0f,  41.0f,  42.0f,  43.0f,  44.0f,  45.0f,
-    46.0f,  47.0f,  48.0f,  49.0f,  50.0f,  51.0f,  52.0f,  53.0f,  72.0f,  73.0f,  74.0f,  75.0f,  76.0f,  77.0f,
-    78.0f,  79.0f,  80.0f,  81.0f,  82.0f,  83.0f,  84.0f,  85.0f,  86.0f,  87.0f,  88.0f,  89.0f,  18.0f,  19.0f,
-    20.0f,  21.0f,  22.0f,  23.0f,  24.0f,  25.0f,  26.0f,  27.0f,  28.0f,  29.0f,  30.0f,  31.0f,  32.0f,  33.0f,
-    34.0f,  35.0f,  54.0f,  55.0f,  56.0f,  57.0f,  58.0f,  59.0f,  60.0f,  61.0f,  62.0f,  63.0f,  64.0f,  65.0f,
-    66.0f,  67.0f,  68.0f,  69.0f,  70.0f,  71.0f,  90.0f,  91.0f,  92.0f,  93.0f,  94.0f,  95.0f,  96.0f,  97.0f,
-    98.0f,  99.0f,  100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 108.0f, 109.0f, 110.0f, 111.0f,
-    112.0f, 113.0f, 114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 120.0f, 121.0f, 122.0f, 123.0f, 124.0f, 125.0f,
-    144.0f, 145.0f, 146.0f, 147.0f, 148.0f, 149.0f, 150.0f, 151.0f, 152.0f, 153.0f, 154.0f, 155.0f, 156.0f, 157.0f,
-    158.0f, 159.0f, 160.0f, 161.0f, 180.0f, 181.0f, 182.0f, 183.0f, 184.0f, 185.0f, 186.0f, 187.0f, 188.0f, 189.0f,
-    190.0f, 191.0f, 192.0f, 193.0f, 194.0f, 195.0f, 196.0f, 197.0f, 126.0f, 127.0f, 128.0f, 129.0f, 130.0f, 131.0f,
-    132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f, 140.0f, 141.0f, 142.0f, 143.0f, 162.0f, 163.0f,
-    164.0f, 165.0f, 166.0f, 167.0f, 168.0f, 169.0f, 170.0f, 171.0f, 172.0f, 173.0f, 174.0f, 175.0f, 176.0f, 177.0f,
-    178.0f, 179.0f, 198.0f, 199.0f, 200.0f, 201.0f, 202.0f, 203.0f, 204.0f, 205.0f, 206.0f, 207.0f, 208.0f, 209.0f,
-    210.0f, 211.0f, 212.0f, 213.0f, 214.0f, 215.0f};
-
-  RunTestCaseSpaceToDepth(shape_in, shape_out, input_data.data(), output_data.data(), false, 3);
+TEST_F(TestOpenCL_SpaceToDepth, NotAlignTest4) {
+  int block_size = 3;
+  std::vector<int> input_shape = {1, 6, 6, 6};
+  std::vector<int> output_shape = {1, 2, 2, 54};
+  float input_data[] = {
+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,
+    22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
+    44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
+    66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
+    88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+    132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+    154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
+    198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215};
+  float output_data[] = {
+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  36,  37,  38,  39,
+    40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  72,  73,  74,  75,  76,  77,  78,  79,
+    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+    30,  31,  32,  33,  34,  35,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
+    70,  71,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 144, 145, 146, 147, 148, 149,
+    150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
+    190, 191, 192, 193, 194, 195, 196, 197, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+    140, 141, 142, 143, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+    198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(block_size);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
-}  // namespace mindspore
+
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/sparse_to_dense_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/sparse_to_dense_tests.cc
index 6b1c1b1fe1..ea8fc94e6b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/sparse_to_dense_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/sparse_to_dense_tests.cc
@@ -1,11 +1,11 @@
 /**
  * Copyright 2020 Huawei Technologies Co., Ltd
  *
- * Licensed under the Apache License, Version 2.0 (the "License");
+ * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,519 +13,175 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h"
-using mindspore::lite::Tensor;
-using mindspore::schema::Format::Format_NHWC;
-namespace mindspore {
-class TestSparseToDenseOpenCLCI : public mindspore::CommonTest {
- public:
-  TestSparseToDenseOpenCLCI() {}
-};
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/sparse_to_dense_parameter.h"
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape3Vector) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
+namespace mindspore::lite::opencl::test {
 
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6, 3};
-  std::vector<int> input_shape2 = {3};
-  std::vector<int> input_shape3 = {6};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0, 0, 6};
-  float input_data2[] = {6, 1, 10};
-  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-  float input_data4[] = {0.0};
-  float correctOutput[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  auto data_type = kNumberTypeFloat32;
-  std::vector<int> output_shape = {6, 1, 10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
-  }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
-  }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
+class TestOpenCL_SparseToDense : public CommonTest {};
 
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
+namespace {
+// PrimitiveType_SparseToDense: src/ops/populate/sparse_to_dense_populate.cc
+OpParameter *CreateParameter() {
+  auto *param = test::CreateParameter<SparseToDenseParameter>(schema::PrimitiveType_SparseToDense);
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Scalar) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6, 2};
-  std::vector<int> input_shape2 = {2};
+TEST_F(TestOpenCL_SparseToDense, Dim2Shape3Vector) {
+  std::vector<int> input_shape0 = {6, 3};
+  std::vector<int> input_shape1 = {3};
+  std::vector<int> input_shape2 = {6};
   std::vector<int> input_shape3 = {1};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {0, 0, 1, 2, 2, 3, 3, 6, 4, 7, 5, 9};
-  float input_data2[] = {6, 10};
-  float input_data3[] = {6.0};
-  float input_data4[] = {0.0};
-  float correctOutput[] = {6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
-                           0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6};
-  auto data_type = kNumberTypeFloat32;
-  std::vector<int> output_shape = {6, 10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
-  }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
+  std::vector<int> output_shape = {6, 1, 10};
+  float input_data0[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0, 0, 6};
+  float input_data1[] = {6, 1, 10};
+  float input_data2[] = {1, 2, 3, 4, 5, 6};
+  float input_data3[] = {0};
+  float output_data[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_TENSOR},
+              {input_shape3, input_data3, CONST_SCALAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Vector) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6, 2};
-  std::vector<int> input_shape2 = {2};
-  std::vector<int> input_shape3 = {6};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {0, 0, 1, 2, 2, 3, 3, 6, 4, 7, 5, 9};
-  float input_data2[] = {6, 10};
-  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-  float input_data4[] = {0.0};
-  float correctOutput[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0,
-                           0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6};
-  auto data_type = kNumberTypeFloat32;
+TEST_F(TestOpenCL_SparseToDense, Dim2Scalar) {
+  std::vector<int> input_shape0 = {6, 2};
+  std::vector<int> input_shape1 = {2};
+  std::vector<int> input_shape2 = {1};
+  std::vector<int> input_shape3 = {1};
   std::vector<int> output_shape = {6, 10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
+  float input_data0[] = {0, 0, 1, 2, 2, 3, 3, 6, 4, 7, 5, 9};
+  float input_data1[] = {6, 10};
+  float input_data2[] = {6};
+  float input_data3[] = {0};
+  float output_data[] = {6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_SCALAR},
+              {input_shape3, input_data3, CONST_SCALAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
+}
 
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
+TEST_F(TestOpenCL_SparseToDense, Dim2Vector) {
+  std::vector<int> input_shape0 = {6, 2};
+  std::vector<int> input_shape1 = {2};
+  std::vector<int> input_shape2 = {6};
+  std::vector<int> input_shape3 = {1};
+  std::vector<int> output_shape = {6, 10};
+  float input_data0[] = {0, 0, 1, 2, 2, 3, 3, 6, 4, 7, 5, 9};
+  float input_data1[] = {6, 10};
+  float input_data2[] = {1, 2, 3, 4, 5, 6};
+  float input_data3[] = {0};
+  float output_data[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_TENSOR},
+              {input_shape3, input_data3, CONST_SCALAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape1Vector) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6, 1};
-  std::vector<int> input_shape2 = {1};
-  std::vector<int> input_shape3 = {6};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {0, 2, 3, 6, 7, 9};
-  float input_data2[] = {10};
-  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-  float input_data4[] = {0.0};
-  float correctOutput[] = {1, 0, 2, 3, 0, 0, 4, 5, 0, 6};
-  auto data_type = kNumberTypeFloat32;
+TEST_F(TestOpenCL_SparseToDense, Dim2Shape1Vector) {
+  std::vector<int> input_shape0 = {6, 1};
+  std::vector<int> input_shape1 = {1};
+  std::vector<int> input_shape2 = {6};
+  std::vector<int> input_shape3 = {1};
   std::vector<int> output_shape = {10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
-  }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
+  float input_data0[] = {0, 2, 3, 6, 7, 9};
+  float input_data1[] = {10};
+  float input_data2[] = {1, 2, 3, 4, 5, 6};
+  float input_data3[] = {0};
+  float output_data[] = {1, 0, 2, 3, 0, 0, 4, 5, 0, 6};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_TENSOR},
+              {input_shape3, input_data3, CONST_SCALAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
-  }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape1Scalar) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {7, 1};  // shape[1] = 1
+TEST_F(TestOpenCL_SparseToDense, Dim2Shape1Scalar) {
+  std::vector<int> input_shape0 = {7, 1};
+  std::vector<int> input_shape1 = {1};
   std::vector<int> input_shape2 = {1};
   std::vector<int> input_shape3 = {1};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {0, 1, 2, 3, 4, 5, 9};
-  float input_data2[] = {10};
-  float input_data3[] = {6.0};
-  float input_data4[] = {0.0};
-  float correctOutput[] = {6, 6, 6, 6, 6, 6, 0, 0, 0, 6};
-  auto data_type = kNumberTypeFloat32;
   std::vector<int> output_shape = {10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
+  float input_data0[] = {0, 1, 2, 3, 4, 5, 9};
+  float input_data1[] = {10};
+  float input_data2[] = {6};
+  float input_data3[] = {0};
+  float output_data[] = {6, 6, 6, 6, 6, 6, 0, 0, 0, 6};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_SCALAR},
+              {input_shape3, input_data3, CONST_SCALAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
-  }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim1Scalar) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6};
+TEST_F(TestOpenCL_SparseToDense, Dim1Scalar) {
+  std::vector<int> input_shape0 = {6};
+  std::vector<int> input_shape1 = {1};
   std::vector<int> input_shape2 = {1};
   std::vector<int> input_shape3 = {1};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {1, 3, 4, 5, 6, 7};
-  float input_data2[] = {10};
-  float input_data3[] = {1.0};
-  float input_data4[] = {2.0};
-  float correctOutput[] = {2, 1, 2, 1, 1, 1, 1, 1, 2, 2};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
   std::vector<int> output_shape = {10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, tensor_type);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, tensor_type);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_SCALAR);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, tensor_type);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, tensor_type);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
+  float input_data0[] = {1, 3, 4, 5, 6, 7};
+  float input_data1[] = {10};
+  float input_data2[] = {1};
+  float input_data3[] = {2};
+  float output_data[] = {2, 1, 2, 1, 1, 1, 1, 1, 2, 2};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_SCALAR},
+              {input_shape3, input_data3, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
-  }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
 
-TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim1Vector) {
-  MS_LOG(INFO) << " begin test ";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->Init();
-  auto allocator = runtime->GetAllocator();
-  MS_LOG(INFO) << " init tensors ";
-  std::vector<int> input_shape1 = {6};
-  std::vector<int> input_shape2 = {1};
-  std::vector<int> input_shape3 = {6};
-  std::vector<int> input_shape4 = {1};
-  float input_data1[] = {1, 3, 4, 5, 6, 7};
-  float input_data2[] = {10};
-  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-  float input_data4[] = {2.0};
-  float correctOutput[] = {2, 1, 2, 2, 3, 4, 5, 6, 2, 2};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
+TEST_F(TestOpenCL_SparseToDense, Dim1Vector) {
+  std::vector<int> input_shape0 = {6};
+  std::vector<int> input_shape1 = {1};
+  std::vector<int> input_shape2 = {6};
+  std::vector<int> input_shape3 = {1};
   std::vector<int> output_shape = {10};
-  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, tensor_type);
-  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, tensor_type);
-  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, tensor_type);
-  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, tensor_type);
-  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, tensor_type);
-  // allocate memory for weights
-  in_tensor2.MallocData();
-  in_tensor3.MallocData();
-  in_tensor4.MallocData();
-  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
-  std::vector<lite::Tensor *> outputs{&output_tensor};
-  // initialize weights
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new ActivationParameter failed ";
-    return;
+  float input_data0[] = {1, 3, 4, 5, 6, 7};
+  float input_data1[] = {10};
+  float input_data2[] = {1, 2, 3, 4, 5, 6};
+  float input_data3[] = {2};
+  float output_data[] = {2, 1, 2, 2, 3, 4, 5, 6, 2, 2};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter();
+    TestMain({{input_shape0, input_data0, VAR},
+              {input_shape1, input_data1, CONST_TENSOR},
+              {input_shape2, input_data2, CONST_TENSOR},
+              {input_shape3, input_data3, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-
-  auto *sparse_to_dense_kernel =
-    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (sparse_to_dense_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
-    delete param;
-    return;
-  }
-  sparse_to_dense_kernel->Init();
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    delete param;
-    delete sparse_to_dense_kernel;
-    return;
-  }
-  // to do allocate memory for inputs
-  in_tensor1.MallocData(allocator);
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
-  delete sub_graph;
 }
-
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
index 9df29e386f..298cab43b2 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
@@ -13,271 +13,51 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "common/common_test.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h"
-namespace mindspore {
-class TestStackOpenCLCI : public mindspore::CommonTest {
- public:
-  TestStackOpenCLCI() {}
-};
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/stack_parameter.h"
 
-class TestStackOpenCLfp16 : public mindspore::CommonTest {
- public:
-  TestStackOpenCLfp16() {}
-};
+namespace mindspore::lite::opencl::test {
 
-TEST_F(TestStackOpenCLCI, StackFp32_8inputforCI) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
+class TestOpenCL_Stack : public CommonTest {};
 
-  MS_LOG(INFO) << " init tensors ";
-  constexpr int INPUT_NUM = 8;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8},
-    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}};
-  std::vector<int> output_shape = {8, 1, 1, 8};
-  auto data_type = kNumberTypeFloat32;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f};
-  float input_data2[] = {0.5f, 0.6f, 0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f};
-  float input_data3[] = {0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f};
-  float input_data4[] = {0.35f, 0.26f, 0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f};
-  float input_data5[] = {0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f, 0.87f};
-  float input_data6[] = {0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f};
-  float input_data7[] = {0.35f, 0.26f, 0.54f, 0.33f, 0.76f, 0.59f, 0.73f, 0.34f};
-  float input_data8[] = {0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
-  float correctOutput[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f,  0.59f, 0.03f, 0.37f, 0.5f,  0.6f,  0.74f, 0.23f, 0.46f,
-                           0.69f, 0.13f, 0.47f, 0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f, 0.35f, 0.26f,
-                           0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f, 0.57f, 0.6f,  0.84f, 0.83f, 0.48f, 0.78f, 0.63f,
-                           0.87f, 0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f, 0.35f, 0.26f, 0.54f, 0.33f,
-                           0.76f, 0.59f, 0.73f, 0.34f, 0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs;
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new StackParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 0;
-  auto *stack_kernel =
-    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (stack_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  stack_kernel->Init();
-  // to do allocate memory for inputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete stack_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
-  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
-  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data1));
-  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data2));
-  memcpy(inputs[4]->data_c(), input_data5, sizeof(input_data1));
-  memcpy(inputs[5]->data_c(), input_data6, sizeof(input_data2));
-  memcpy(inputs[6]->data_c(), input_data7, sizeof(input_data1));
-  memcpy(inputs[7]->data_c(), input_data8, sizeof(input_data2));
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  delete sub_graph;
+namespace {
+// PrimitiveType_Stack: src/ops/populate/stack_populate.cc
+OpParameter *CreateParameter(int axis) {
+  auto *param = test::CreateParameter<StackParameter>(schema::PrimitiveType_Stack);
+  param->axis_ = axis;
+  return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestStackOpenCLfp16, StackFp32_8inputaxis1) {
-  MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->SetFp16Enable(true);
-  ocl_runtime->Init();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // get the input from .bin
-  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size,
-    output_size;
-  std::string input1Ppath = "./test_data/stackfp16_input1.bin";
-  std::string input2Ppath = "./test_data/stackfp16_input2.bin";
-  std::string input3Ppath = "./test_data/stackfp16_input3.bin";
-  std::string input4Ppath = "./test_data/stackfp16_input4.bin";
-  std::string input5Ppath = "./test_data/stackfp16_input5.bin";
-  std::string input6Ppath = "./test_data/stackfp16_input6.bin";
-  std::string input7Ppath = "./test_data/stackfp16_input7.bin";
-  std::string input8Ppath = "./test_data/stackfp16_input8.bin";
-  std::string correctOutputPath = "./test_data/stackfp16_output.bin";
-  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
-  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
-  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
-  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
-  auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
-  auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
-  auto input_data7 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size));
-  auto input_data8 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size));
-  auto correctOutput =
-    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
-  MS_LOG(INFO) << " init tensors ";
+TEST_F(TestOpenCL_Stack, input8_ndim3_axis0) {
   constexpr int INPUT_NUM = 8;
-  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18},
-    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}};
-  std::vector<int> output_shape = {1, 8, 17, 18};
-  auto data_type = kNumberTypeFloat16;
-  auto tensor_type = lite::Tensor::CONST_TENSOR;
-  std::vector<lite::Tensor *> inputs;
-  for (auto &shape : input_shapes) {
-    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
-    inputs.push_back(input_temp);
-    if (input_temp == nullptr) {
-      MS_LOG(INFO) << " new input_tensor failed ";
-      return;
-    }
-  }
-  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
-  if (output_tensor == nullptr) {
-    MS_LOG(INFO) << " new output_tensor failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    return;
-  }
-  std::vector<lite::Tensor *> outputs{output_tensor};
-  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();
-
-  MS_LOG(INFO) << " initialize tensors ";
-  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
-  if (param == nullptr) {
-    MS_LOG(INFO) << " new StackParameter failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    return;
-  }
-  param->axis_ = 1;
-  auto *stack_kernel =
-    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  if (stack_kernel == nullptr) {
-    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    return;
-  }
-  stack_kernel->Init();
-  // to  allocate memory for inputs and outputs
-  for (auto &input_tensor : inputs) {
-    input_tensor->MallocData(allocator);
-  }
-  MS_LOG(INFO) << " initialize sub_graph ";
-  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
-    for (auto tensor : inputs) {
-      delete tensor;
-    }
-    for (auto tensor : outputs) {
-      delete tensor;
-    }
-    delete param;
-    delete stack_kernel;
-    return;
-  }
-  sub_graph->Init();
-  MS_LOG(INFO) << " initialize input data ";
-  if (inputs.size() == 8) {
-    memcpy(inputs[0]->data_c(), input_data1, input1_size);
-    memcpy(inputs[1]->data_c(), input_data2, input2_size);
-    memcpy(inputs[2]->data_c(), input_data3, input3_size);
-    memcpy(inputs[3]->data_c(), input_data4, input4_size);
-    memcpy(inputs[4]->data_c(), input_data5, input5_size);
-    memcpy(inputs[5]->data_c(), input_data6, input6_size);
-    memcpy(inputs[6]->data_c(), input_data7, input7_size);
-    memcpy(inputs[7]->data_c(), input_data8, input8_size);
-  } else {
-    MS_LOG(ERROR) << " input size must be 2 or 3 or 4";
-  }
-
-  std::cout << "==================output data================" << std::endl;
-  sub_graph->Run();
-  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
-  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001));
-  for (auto tensor : inputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
-  }
-  for (auto tensor : outputs) {
-    tensor->set_data(nullptr);
-    delete tensor;
+  int axis = 0;
+  std::vector<int> input_shapes[INPUT_NUM] = {{1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8},
+                                              {1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8}};
+  std::vector<int> output_shape = {8, 1, 1, 8};
+  float input_datas[INPUT_NUM][8] = {
+    {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37},  {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47},
+    {0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57}, {0.35, 0.26, 0.17, 0.33, 0.66, 0.89, 0.93, 0.77},
+    {0.57, 0.6, 0.84, 0.83, 0.48, 0.78, 0.63, 0.87},  {0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37},
+    {0.35, 0.26, 0.54, 0.33, 0.76, 0.59, 0.73, 0.34}, {0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37}};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.03, 0.37, 0.5,  0.6,  0.74, 0.23, 0.46,
+                         0.69, 0.13, 0.47, 0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57, 0.35, 0.26,
+                         0.17, 0.33, 0.66, 0.89, 0.93, 0.77, 0.57, 0.6,  0.84, 0.83, 0.48, 0.78, 0.63,
+                         0.87, 0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37, 0.35, 0.26, 0.54, 0.33,
+                         0.76, 0.59, 0.73, 0.34, 0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37};
+
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_datas[0], VAR},
+              {input_shapes[1], input_datas[1], VAR},
+              {input_shapes[2], input_datas[2], VAR},
+              {input_shapes[3], input_datas[3], VAR},
+              {input_shapes[4], input_datas[4], VAR},
+              {input_shapes[5], input_datas[5], VAR},
+              {input_shapes[6], input_datas[6], VAR},
+              {input_shapes[7], input_datas[7], VAR}},
+             {output_shape, output_data}, param, fp16_enable);
   }
-  delete sub_graph;
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/strided_slice_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/strided_slice_tests.cc
index 4b4410841d..1415cf02a5 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/strided_slice_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/strided_slice_tests.cc
@@ -13,22 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "common/common_test.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
 #include "nnacl/strided_slice.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
 
-namespace mindspore {
+namespace mindspore::lite::opencl::test {
 
-class TestStridedSliceOpenCL : public mindspore::CommonTest {};
+class TestOpenCL_StridedSlice : public CommonTest {};
 
-OpParameter *GetStridedSliceParameter(const std::vector<int> &begins, const std::vector<int> &ends,
-                                      const std::vector<int> &strides) {
-  auto param = static_cast<StridedSliceParameter *>(malloc(sizeof(StridedSliceParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "create StridedSliceParameter error.";
-    return nullptr;
-  }
-  param->op_parameter_.type_ = schema::PrimitiveType_StridedSlice;
+namespace {
+// PrimitiveType_StridedSlice: src/ops/populate/strided_slice_populate.cc
+OpParameter *CreateParameter(const std::vector<int> &begins, const std::vector<int> &ends,
+                             const std::vector<int> &strides) {
+  auto *param = test::CreateParameter<StridedSliceParameter>(schema::PrimitiveType_StridedSlice);
   param->num_axes_ = begins.size();
   for (int i = 0; i < begins.size(); ++i) {
     param->begins_[i] = begins[i];
@@ -37,84 +33,104 @@ OpParameter *GetStridedSliceParameter(const std::vector<int> &begins, const std:
   }
   return reinterpret_cast<OpParameter *>(param);
 }
+}  // namespace
 
-TEST_F(TestStridedSliceOpenCL, 1D) {
+TEST_F(TestOpenCL_StridedSlice, 1D) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  float expect_data[] = {3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33};
-  auto *param = GetStridedSliceParameter({3}, {36}, {3});
-  TestMain({{{36}, input_data, Tensor::Category::VAR}}, {{11}, expect_data}, param, false);
+  float output_data[] = {3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({3}, {36}, {3});
+    TestMain({{{36}, input_data, VAR}}, {{11}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, 2D) {
+TEST_F(TestOpenCL_StridedSlice, 2D) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  float expect_data[] = {11, 14};
-  auto *param = GetStridedSliceParameter({1, 2}, {3, 8}, {2, 3});
-  TestMain({{{4, 9}, input_data, Tensor::Category::VAR}}, {{1, 2}, expect_data}, param, false);
+  float output_data[] = {11, 14};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 2}, {3, 8}, {2, 3});
+    TestMain({{{4, 9}, input_data, VAR}}, {{1, 2}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, 3D) {
+TEST_F(TestOpenCL_StridedSlice, 3D) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  float expect_data[] = {11, 14};
-  auto *param = GetStridedSliceParameter({0, 1, 2}, {1, 3, 8}, {1, 2, 3});
-  TestMain({{{1, 4, 9}, input_data, Tensor::Category::VAR}}, {{1, 1, 2}, expect_data}, param, false);
+  float output_data[] = {11, 14};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 1, 2}, {1, 3, 8}, {1, 2, 3});
+    TestMain({{{1, 4, 9}, input_data, VAR}}, {{1, 1, 2}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, 4D) {
+TEST_F(TestOpenCL_StridedSlice, 4D) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
 
-  float expect_data0[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+  float output_data0[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                           18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  auto *param = GetStridedSliceParameter({0, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 3, 3}, expect_data0}, param, false);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{2, 2, 3, 3}, output_data0}, param, fp16_enable);
+  }
 
-  param = GetStridedSliceParameter({0, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 3, 3}, expect_data0}, param, true);
-
-  float expect_data1[] = {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  param = GetStridedSliceParameter({1, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 2, 3, 3}, expect_data1}, param, false);
+  float output_data1[] = {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{1, 2, 3, 3}, output_data1}, param, fp16_enable);
+  }
 
-  float expect_data2[] = {27, 28, 29, 30, 31, 32, 33, 34, 35};
-  param = GetStridedSliceParameter({1, 1, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 3, 3}, expect_data2}, param, false);
+  float output_data2[] = {27, 28, 29, 30, 31, 32, 33, 34, 35};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 1, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{1, 1, 3, 3}, output_data2}, param, fp16_enable);
+  }
 
-  float expect_data3[] = {33, 34, 35};
-  param = GetStridedSliceParameter({1, 1, 2, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 1, 3}, expect_data3}, param, false);
+  float output_data3[] = {33, 34, 35};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 1, 2, 0}, {2, 2, 3, 3}, {1, 1, 1, 1});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{1, 1, 1, 3}, output_data3}, param, fp16_enable);
+  }
 
-  float expect_data4[] = {34};
-  param = GetStridedSliceParameter({1, 1, 2, 1}, {2, 2, 3, 2}, {1, 1, 1, 1});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 1, 1}, expect_data4}, param, false);
+  float output_data4[] = {34};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 1, 2, 1}, {2, 2, 3, 2}, {1, 1, 1, 1});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{1, 1, 1, 1}, output_data4}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, 4D_stride2) {
+TEST_F(TestOpenCL_StridedSlice, 4D_stride2) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  float expect_data[] = {13, 14, 31, 32};
-  auto *param = GetStridedSliceParameter({0, 1, 1, 1}, {1, 4, 3, 3}, {2, 2, 2, 1});
-  TestMain({{{1, 4, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 2, 1, 2}, expect_data}, param, false);
+  float output_data[] = {13, 14, 31, 32};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 1, 1, 1}, {1, 4, 3, 3}, {2, 2, 2, 1});
+    TestMain({{{1, 4, 3, 3}, input_data, VAR}}, {{1, 2, 1, 2}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, 4D_to_3D) {
+TEST_F(TestOpenCL_StridedSlice, 4D_to_3D) {
   float input_data[] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
                         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
-  float expect_data[] = {18, 20, 21, 23, 27, 29, 30, 32};
-  auto *param = GetStridedSliceParameter({1, 0, 0, 0}, {2, 2, 2, 3}, {1, 1, 1, 2});
-  TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 2}, expect_data}, param, false);
+  float output_data[] = {18, 20, 21, 23, 27, 29, 30, 32};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({1, 0, 0, 0}, {2, 2, 2, 3}, {1, 1, 1, 2});
+    TestMain({{{2, 2, 3, 3}, input_data, VAR}}, {{2, 2, 2}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, In1D_OutOfRangeBeginNegativeStride) {
+TEST_F(TestOpenCL_StridedSlice, In1D_OutOfRangeBeginNegativeStride) {
   float input_data[] = {1, 2, 3, 4};
-  float expect_data[] = {4, 3, 2};
-  auto *param = GetStridedSliceParameter({5}, {0}, {-1});
-  TestMain({{{4}, input_data, Tensor::Category::VAR}}, {{3}, expect_data}, param, false);
+  float output_data[] = {4, 3, 2};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({5}, {0}, {-1});
+    TestMain({{{4}, input_data, VAR}}, {{3}, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestStridedSliceOpenCL, tflite_cpu) {
+TEST_F(TestOpenCL_StridedSlice, test0) {
   std::vector<float> values(32768);
   for (int i = 0; i < values.size(); ++i) {
     values[i] = i % 1000;
@@ -290,28 +306,30 @@ TEST_F(TestStridedSliceOpenCL, tflite_cpu) {
     auto &name = std::get<0>(case_);
     auto &input_shape = std::get<1>(case_);
     auto &output_shape = std::get<2>(case_);
-    auto &input_data = std::get<3>(case_);
-    auto &expect_data = std::get<4>(case_);
+    auto input_data = std::get<3>(case_).data();
+    auto output_data = std::get<4>(case_).data();
     auto &begin = std::get<5>(case_);
     auto &end = std::get<6>(case_);
     auto &stride = std::get<7>(case_);
-
     std::cout << name << std::endl;
-    auto *param = GetStridedSliceParameter(begin, end, stride);
-    TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param,
-             false);
-    param = GetStridedSliceParameter(begin, end, stride);
-    TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param,
-             true);
+
+    for (auto fp16_enable : {false, true}) {
+      auto *param = CreateParameter(begin, end, stride);
+      TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+    }
   }
 }
 
-TEST_F(TestStridedSliceOpenCL, tflite_opencl) {
-  float input_data[] = {0.1f,  0.2f,  0.3f,  0.4,  1.1f,  1.2f,  1.3f,  1.4,  10.1f, 10.2f, 10.3f, 10.4,
-                        11.1f, 11.2f, 11.3f, 11.4, 20.1f, 20.2f, 20.3f, 20.4, 21.1f, 21.2f, 21.3f, 21.4};
-  float expect_data[] = {10.2, 10.4, 20.2, 20.4};
-  auto *param = GetStridedSliceParameter({0, 1, 0, 1}, {1, 3, 2, 4}, {1, 1, 2, 2});
-  TestMain({{{1, 3, 2, 4}, input_data, Tensor::Category::VAR}}, {{1, 2, 1, 2}, expect_data}, param, false);
+TEST_F(TestOpenCL_StridedSlice, test1) {
+  float input_data[] = {0.1,  0.2,  0.3,  0.4,  1.1,  1.2,  1.3,  1.4,  10.1, 10.2, 10.3, 10.4,
+                        11.1, 11.2, 11.3, 11.4, 20.1, 20.2, 20.3, 20.4, 21.1, 21.2, 21.3, 21.4};
+  float output_data[] = {10.2, 10.4, 20.2, 20.4};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter({0, 1, 0, 1}, {1, 3, 2, 4}, {1, 1, 2, 2});
+    TestMain({{{1, 3, 2, 4}, input_data, VAR}}, {{1, 2, 1, 2}, output_data}, param, fp16_enable,
+             fp16_enable ? 1e-2 : 1e-9);
+  }
 }
 
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
index f5f3259860..b61c701afb 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
@@ -22,8 +22,8 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h"
 
-namespace mindspore {
-class TestToFormatOpenCL : public mindspore::CommonTest {
+namespace mindspore::lite::opencl::test {
+class TestToFormatOpenCL : public CommonTest {
  public:
   TestToFormatOpenCL() {}
 };
@@ -103,4 +103,4 @@ TEST_F(TestToFormatOpenCL, ToFormatNHWC2NCHW) {
   ASSERT_EQ(0, CompareOutputData(output_data, correct_data, h * w * c, 0.00001));
   MS_LOG(INFO) << "Test TransposeFp32 passed";
 }
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
index 90719a1e20..104605962f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@@ -13,153 +13,57 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "nnacl/transpose.h"
 
-namespace mindspore {
-class TestTransposeOpenCL : public mindspore::CommonTest {
- public:
-  TestTransposeOpenCL() {}
-};
+namespace mindspore::lite::opencl::test {
 
-void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto param = static_cast<TransposeParameter *>(malloc(sizeof(TransposeParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param_ptr create error.";
-    return;
-  }
-  param->num_axes_ = 4;
-  param->perm_[0] = shape[3];
-  param->perm_[1] = shape[4];
-  param->perm_[2] = shape[5];
-  param->perm_[3] = shape[6];
-  auto allocator = ocl_runtime->GetAllocator();
-  int h = shape[0];
-  int w = shape[1];
-  int c = shape[2];
-  std::vector<int> input_shape = {1, h, w, c};
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> out_shape = {input_shape[param->perm_[0]], input_shape[param->perm_[1]],
-                                input_shape[param->perm_[2]], input_shape[param->perm_[3]]};
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::TransposeOpenCLKernel>(
-    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-
-  inputs[0]->MallocData(allocator);
+class TestOpenCL_Transpose : public CommonTest {};
 
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
+namespace {
+// PrimitiveType_Transpose: src/ops/populate/transpose_populate.cc
+//                          src/ops/populate/nchw2nhwc_populate.cc
+//                          src/ops/populate/nhwc2nchw_populate.cc
+OpParameter *CreateParameter(const std::vector<int> &perm) {
+  auto *param = test::CreateParameter<TransposeParameter>(schema::PrimitiveType_Transpose);
+  param->num_axes_ = perm.size();
+  for (size_t i = 0; i < perm.size(); ++i) {
+    param->perm_[i] = perm[i];
   }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, h * w * c * dtype_size);
-  pGraph->Run();
+  return reinterpret_cast<OpParameter *>(param);
+}
+}  // namespace
 
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, h * w * c, static_cast<float16_t>(1e-3), 2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, h * w * c, static_cast<float>(1e-5));
+TEST_F(TestOpenCL_Transpose, NHWC2NCHW) {
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> perm = {0, 3, 1, 2};
+  std::vector<int> output_shape;
+  for (int axis : perm) {
+    output_shape.push_back(input_shape[axis]);
   }
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11};
 
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(perm);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
   }
-
-  MS_LOG(INFO) << "Test TransposeFp32 passed";
-}
-
-TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp32) {
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  int perm0 = 0;
-  int perm1 = 3;
-  int perm2 = 1;
-  int perm3 = 2;
-  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
-
-  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
-}
-
-TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp16) {
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  int perm0 = 0;
-  int perm1 = 3;
-  int perm2 = 1;
-  int perm3 = 2;
-  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float16_t> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
-
-  RunTestTranspose(shape, input_data.data(), output_data.data(), true);
 }
 
-TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp32) {
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  int perm0 = 0;
-  int perm1 = 2;
-  int perm2 = 3;
-  int perm3 = 1;
-  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
+TEST_F(TestOpenCL_Transpose, NCHW2NHWC) {
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> perm = {0, 2, 3, 1};
+  std::vector<int> output_shape;
+  for (int axis : perm) {
+    output_shape.push_back(input_shape[axis]);
+  }
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  float output_data[] = {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11};
 
-  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(perm);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
+  }
 }
 
-TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp16) {
-  int h = 2;
-  int w = 2;
-  int c = 3;
-  int perm0 = 0;
-  int perm1 = 2;
-  int perm2 = 3;
-  int perm3 = 1;
-  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  std::vector<float16_t> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
-
-  RunTestTranspose(shape, input_data.data(), output_data.data(), true);
-}
-}  // namespace mindspore
+}  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc
deleted file mode 100644
index 1aa68f4363..0000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string>
-#include "common/common_test.h"
-#include "src/kernel_registry.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
-
-using mindspore::kernel::LiteKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
-using mindspore::lite::KernelRegistry;
-using mindspore::lite::Tensor;
-using mindspore::schema::Format::Format_NHWC;
-
-namespace mindspore {
-
-void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
-  if (file_path.empty()) {
-    memset(dst, 0x00, dst_size);
-  } else {
-    auto src_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &dst_size));
-    if (src_data != nullptr) {
-      memcpy(dst, src_data, dst_size);
-    } else {
-      MS_LOG(ERROR) << "read file empty.";
-    }
-  }
-}
-
-void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
-              std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable,
-              float atol, bool print_output) {
-  MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  auto ocl_runtime = runtime_wrapper.GetInstance();
-  EXPECT_TRUE(ocl_runtime->Init() == RET_OK);
-  ocl_runtime->SetFp16Enable(fp16_enable);
-  auto allocator = ocl_runtime->GetAllocator();
-
-  MS_LOG(DEBUG) << "create Tensors & init weight data";
-  std::vector<Tensor> tensors;
-  std::vector<Tensor *> kernel_inputs;
-  std::vector<Tensor *> subgraph_inputs;
-  std::map<Tensor *, float *> subgraph_inputs_data;
-  for (auto input_info : input_infos) {
-    const std::vector<int> &shape = std::get<0>(input_info);
-    auto *input_data = std::get<1>(input_info);
-    const Tensor::Category category = std::get<2>(input_info);
-    tensors.emplace_back(kNumberTypeFloat32, shape, Format_NHWC, category);
-    auto *new_tensor = &tensors.back();
-    kernel_inputs.push_back(new_tensor);
-    if (category != Tensor::Category::VAR) {
-      memcpy(new_tensor->MutableData(), input_data, new_tensor->Size());
-    } else {
-      subgraph_inputs.push_back(new_tensor);
-      subgraph_inputs_data[new_tensor] = input_data;
-    }
-  }
-  const std::vector<int> &output_shape = std::get<0>(output_info);
-  float *expect_data = std::get<1>(output_info);
-  auto output = Tensor(kNumberTypeFloat32, output_shape, Format_NHWC, Tensor::Category::VAR);
-
-  MS_LOG(DEBUG) << "create OpenCL Kernel";
-  auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);
-  kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  if (creator == nullptr) {
-    std::cerr << "get kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
-    free(op_parameter);
-    FAIL();
-  }
-  auto *kernel = creator(kernel_inputs, {&output}, op_parameter, nullptr, key, nullptr);
-  if (kernel == nullptr) {
-    std::cerr << "call kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
-    free(op_parameter);
-    FAIL();
-  }
-
-  MS_LOG(DEBUG) << "create SubGraph & init input data";
-  std::vector<LiteKernel *> kernels{kernel};
-  auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels);
-  if (sub_graph == nullptr) {
-    return;
-  }
-  for (auto input : subgraph_inputs) {
-    EXPECT_TRUE(input->MallocData(allocator) == RET_OK);
-  }
-  EXPECT_TRUE(sub_graph->Init() == RET_OK);
-  for (auto input : subgraph_inputs) {
-    memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
-  }
-
-  MS_LOG(DEBUG) << "run SubGraph & compare result";
-  EXPECT_TRUE(sub_graph->Run() == RET_OK);
-  if (print_output) {
-    for (int i = 0; i < output.ElementsNum(); ++i) {
-      printf("%d: expect=%.3f output=%.3f\n", i, expect_data[i], reinterpret_cast<float *>(output.data_c())[i]);
-    }
-  }
-  CommonTest::CompareOutputData(reinterpret_cast<float *>(output.data_c()), expect_data, output.ElementsNum(), atol);
-
-  MS_LOG(DEBUG) << "release resources";
-  delete sub_graph;
-}
-
-}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h
deleted file mode 100644
index dadcbd00a9..0000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
-#define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
-
-#include <string>
-#include <iostream>
-#include <vector>
-#include <tuple>
-#include <map>
-#include "mindspore/lite/src/tensor.h"
-#include "mindspore/lite/src/common/file_utils.h"
-
-using mindspore::lite::Tensor;
-
-namespace mindspore {
-
-void LoadTestData(void *dst, size_t dst_size, const std::string &file_path);
-
-template <typename T>
-void CompareOutput(void *output, void *expect, size_t elem_num, T atol, float rtol = 1e-5) {
-  T *output_data = reinterpret_cast<T *>(output);
-  T *expect_data = reinterpret_cast<T *>(expect);
-
-  std::cout << std::setprecision(5) << std::setiosflags(std::ios::fixed) << std::setw(7);
-  std::cout << "output[0:12]:";
-  for (int i = 0; i < 12 && i < elem_num; i++) {
-    std::cout << output_data[i] << " ";
-  }
-  std::cout << std::endl;
-  std::cout << "expect[0:12]:";
-  for (int i = 0; i < 12 && i < elem_num; i++) {
-    std::cout << expect_data[i] << " ";
-  }
-  std::cout << std::endl;
-  for (int i = 0; i < elem_num; ++i) {
-    auto left = static_cast<float>(std::fabs(output_data[i] - expect_data[i]));
-    auto right = static_cast<float>(atol + rtol * std::fabs(expect_data[i]));
-    if (left > right) {
-      std::cout << "error at idx[" << i << "] expect=" << expect_data[i] << " output=" << output_data[i] << std::endl;
-    }
-    ASSERT_LE(left, right);
-  }
-  std::cout << "compare success!" << std::endl;
-}
-
-template <typename T>
-void CompareOutput(lite::Tensor *output_tensor, const std::string &file_path, T atol, float rtol = 1e-5) {
-  size_t output_size;
-  auto expect_data = mindspore::lite::ReadFile(file_path.c_str(), &output_size);
-  CompareOutput(output_tensor->data_c(), expect_data, output_tensor->ElementsNum(), atol, rtol);
-}
-
-void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
-              std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable = false,
-              float atol = 10e-9, bool print_output = false);
-
-}  // namespace mindspore
-
-#endif  // TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_