!8781 【MSLITE】optimize opencl test code

From: @wangdongxu6
Reviewed-by: 
Signed-off-by:
pull/8781/MERGE
Committed by mindspore-ci-bot on Gitee, 5 years ago
commit a574cdb4e5

@ -91,3 +91,16 @@ __kernel void HSwish(__read_only image2d_t input, __write_only image2d_t output,
result.w = temp.w * (temp.w <= -3 ? 0 : (temp.w >= 3 ? 1 : temp.w / 6 + 0.5f));
WRITE_IMAGE(output, (int2)(X, Y), result);
}
// HSigmoid activation: hsigmoid(x) = 0 for x <= -3, 1 for x >= 3, otherwise x/6 + 0.5.
// One work-item processes one FLT4 pixel of the 2D image (x axis covers w*c, y axis covers n*h).
__kernel void HSigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
  int gx = get_global_id(0);  // image x: w*c
  int gy = get_global_id(1);  // image y: n*h
  if (gx >= img_shape.x || gy >= img_shape.y) return;
  FLT4 in = READ_IMAGE(input, smp_zero, (int2)(gx, gy));
  FLT4 out = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  // Per-component piecewise-linear sigmoid approximation.
  out.x = in.x <= -3 ? 0 : (in.x >= 3 ? 1 : in.x / 6 + 0.5f);
  out.y = in.y <= -3 ? 0 : (in.y >= 3 ? 1 : in.y / 6 + 0.5f);
  out.z = in.z <= -3 ? 0 : (in.z >= 3 ? 1 : in.z / 6 + 0.5f);
  out.w = in.w <= -3 ? 0 : (in.w >= 3 ? 1 : in.w / 6 + 0.5f);
  WRITE_IMAGE(output, (int2)(gx, gy), out);
}

@ -28,9 +28,9 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
return; \
}
__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
__kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 1;
const int BlockW = 1;
const int BlockC = 1;
@ -84,9 +84,9 @@ __kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image
}
}
__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
__kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 1;
@ -161,9 +161,9 @@ __kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image
}
}
__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
__kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 2;
@ -268,9 +268,9 @@ __kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image
}
}
__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
__kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 2;

@ -1,7 +1,9 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
int4 src_size, int4 dst_size, int indices_num, int axis) {
__kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
int4 src_size, int4 dst_size, int indices_num, int axis) {
int X = get_global_id(0); // w
int Y = get_global_id(1); // n*h
int Z = get_global_id(2); // c
@ -40,48 +42,3 @@ __kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_
}
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);
}
// Gather for NC4HW4 image layout: selects slices of src_data along `axis`
// according to the integer `indices` buffer and writes them to dst_data.
// Image addressing (from the index math below): x = w, y = n*C4*h with
// batch-major, then channel-slice, then height ordering.
// axis: 0 = batch, 1 = height, 2 = width, 3 = channel.
// Fix vs. original: the guarded re-read `if (i >= 1 && offset[i] != offset[i-1])`
// fetched the image at exactly the same coordinates as the unconditional read
// just above it (a no-op duplicate), and rd_data's zero-init was immediately
// overwritten. Both removed; behavior is unchanged.
__kernel void gather_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
                            int4 src_size, int4 dst_size, int indices_num, int axis) {
  int X = get_global_id(0);  // w
  int Y = get_global_id(1);  // n*h
  int Z = get_global_id(2);  // c (C4 slice)
  if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) {
    return;
  }
  FLT4 res_data = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  int batch = Y / dst_size.y;
  int height = Y % dst_size.y;
  if (axis == 0) {
    // Gather along batch: remap the batch index through `indices`.
    int index_y = indices[batch] * src_size.y * src_size.z + Z * src_size.y + height;
    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
  } else if (axis == 1) {
    // Gather along height: remap the height index through `indices`.
    int index_y = batch * src_size.y * src_size.z + Z * src_size.y + indices[height];
    res_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
  } else if (axis == 2) {
    // Gather along width: remap the image x coordinate through `indices`.
    int index_y = batch * src_size.y * src_size.z + Z * src_size.y + height;
    res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X], index_y));
  } else if (axis == 3) {
    // Gather along channel: output lane i comes from source channel indices[Z*4+i].
    // offset[i] is the source C4 slice holding that channel; %4 selects the lane.
    // NOTE(review): res_tmp lanes beyond indices_num stay uninitialized when
    // indices_num < 4 — same as the original; confirm the host guarantees this.
    int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4};
    FLT tmp[4];
    FLT res_tmp[4];
    for (int i = 0; i < indices_num; ++i) {
      int index_y = batch * src_size.y * src_size.z + offset[i] * src_size.y + height;
      FLT4 rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X, index_y));
      tmp[0] = rd_data.x;
      tmp[1] = rd_data.y;
      tmp[2] = rd_data.z;
      tmp[3] = rd_data.w;
      res_tmp[i] = tmp[indices[Z * 4 + i] % 4];
    }
    res_data.x = res_tmp[0];
    res_data.y = res_tmp[1];
    res_data.z = res_tmp[2];
    res_data.w = res_tmp[3];
  }
  WRITE_IMAGE(dst_data, (int2)(X, (batch * dst_size.y * dst_size.z + Z * dst_size.y + height)), res_data);
}

@ -1,19 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// hswish activation: hswish(x) = x * hsigmoid(x) = x * clamp(x/6 + 0.5, 0, 1).
// Fix: the original body computed only the hsigmoid factor and dropped the
// multiplication by the input, i.e. it implemented hsigmoid under the name
// hswish. Each component now multiplies by the input value, matching the
// standard h-swish definition.
__kernel void hswish(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 tensor_shape) {
  int X = get_global_id(0);  // n*h  (n defaults to 1)
  int Y = get_global_id(1);  // w
  int Z = get_global_id(2);  // c
  if (X >= tensor_shape.x * tensor_shape.y || Y >= tensor_shape.z || Z >= tensor_shape.w || tensor_shape.y == 0) {
    return;
  }
  int n = X / tensor_shape.y;
  int h = X % tensor_shape.y;
  FLT4 temp = READ_IMAGE(src_data, smp_none, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h)));
  FLT4 result = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  result.x = temp.x * (temp.x <= -3 ? 0 : (temp.x >= 3 ? 1 : temp.x / 6 + 0.5f));
  result.y = temp.y * (temp.y <= -3 ? 0 : (temp.y >= 3 ? 1 : temp.y / 6 + 0.5f));
  result.z = temp.z * (temp.z <= -3 ? 0 : (temp.z >= 3 ? 1 : temp.z / 6 + 0.5f));
  result.w = temp.w * (temp.w <= -3 ? 0 : (temp.w >= 3 ? 1 : temp.w / 6 + 0.5f));
  WRITE_IMAGE(dst_data, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h)), result);
}

@ -2,35 +2,56 @@
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#define Pad(dataformat, in_x, in_y, out_x, out_y) \
__kernel void Pad_##dataformat(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape, \
int4 output_shape, int2 pad, float constant_value) { \
int oh = get_global_id(0); \
int ow = get_global_id(1); \
int co_slice = get_global_id(2); \
int OH = output_shape.y; \
int OW = output_shape.z; \
int CO_SLICES = output_shape.w; \
\
if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \
return; \
} \
\
int IH = input_shape.y; \
int IW = input_shape.z; \
int CI_SLICES = input_shape.w; \
\
int pad_top = pad.x; \
int pad_left = pad.y; \
int ih = oh - pad_top; \
int iw = ow - pad_left; \
\
FLT4 result = (FLT4)(constant_value); \
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { \
result = READ_IMAGE(input, smp_zero, (int2)(in_x, in_y)); \
} \
WRITE_IMAGE(output, (int2)(out_x, out_y), result); \
// Pad: constant padding on all four dimensions (N, H, W, C) for NHWC4 images.
// input_shape / output_shape: (N, H, W, C); io_slices: (ceil(CI/4), ceil(CO/4));
// pad_before: padding added before each of N, H, W, C; constant_value fills the pad.
// Because channel padding shifts lanes within FLT4 slices, an output slice may
// straddle two input slices; the offset path below stitches them together.
// Fix vs. the scraped diff: two stray macro-invocation lines from the removed
// Pad_##dataformat macro were interleaved into this body and have been dropped.
__kernel void Pad(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape, int4 output_shape,
                  int2 io_slices, int4 pad_before, float constant_value) {
  int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
  int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
  int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
  int on_oh = get_global_id(0);  // flattened n*h of the output
  int ow = get_global_id(1);
  int co_slice = get_global_id(2);
  int on = on_oh / OH;
  int oh = on_oh % OH;
  if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
    return;
  }
  // Map output coordinates back to input coordinates.
  int in = on - pad_before.x;
  int ih = oh - pad_before.y;
  int iw = ow - pad_before.z;
  int ci = co_slice * 4 - pad_before.w;  // first input channel feeding this output slice
  // Entirely inside the padding region: write the fill value.
  if (in < 0 || in >= IN || ih < 0 || ih >= IH || iw < 0 || iw >= IW || ci + 3 < 0 || ci >= CI) {
    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), (FLT4)(constant_value));
    return;
  }
  // Lane offset of ci within its input slice (kept non-negative).
  int offset = ci % 4;
  if (offset < 0) {
    offset += 4;
  }
  FLT4 src0 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
  // Fast path: slice-aligned and fully in range — copy straight through.
  if (offset == 0 && ci >= 0 && ci + 3 < CI) {
    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), src0);
    return;
  }
  // Slow path: the output slice straddles two input slices; shift lanes together.
  FLT4 src1 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + (ci + 4) / 4, in * IH + ih));
  FLT4 src_f4;
  if (offset == 0) {
    src_f4 = (FLT4)(src0.x, src0.y, src0.z, src0.w);
  } else if (offset == 1) {
    src_f4 = (FLT4)(src0.y, src0.z, src0.w, src1.x);
  } else if (offset == 2) {
    src_f4 = (FLT4)(src0.z, src0.w, src1.x, src1.y);
  } else {  // offset == 3
    src_f4 = (FLT4)(src0.w, src1.x, src1.y, src1.z);
  }
  // Replace out-of-range channels with the constant fill value.
  FLT src[4] = {src_f4.x, src_f4.y, src_f4.z, src_f4.w};
  FLT out[4] = {constant_value, constant_value, constant_value, constant_value};
  for (int i = 0; i < 4; ++i) {
    if (ci + i >= 0 && ci + i < CI) {
      out[i] = src[i];
    }
  }
  FLT4 out_f4 = (FLT4)(out[0], out[1], out[2], out[3]);
  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out_f4);
}

@ -31,6 +31,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::ActivationType_HSIGMOID;
using mindspore::schema::ActivationType_HSWISH;
using mindspore::schema::ActivationType_LEAKY_RELU;
using mindspore::schema::ActivationType_RELU;
@ -44,9 +45,9 @@ namespace mindspore::kernel {
std::string ActivationOpenCLKernel::GetActTypeString(int act_type) {
static std::map<int, std::string> supported_act_type = {
{ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"},
{ActivationType_RELU6, "Relu6"}, {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"},
{ActivationType_HSWISH, "HSwish"}};
{ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"},
{ActivationType_RELU6, "Relu6"}, {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"},
{ActivationType_HSWISH, "HSwish"}, {ActivationType_HSIGMOID, "HSigmoid"}};
auto result_iter = supported_act_type.find(act_type);
if (result_iter != supported_act_type.end()) {
return result_iter->second;
@ -63,13 +64,12 @@ int ActivationOpenCLKernel::CheckSpecs() {
}
int ActivationOpenCLKernel::Prepare() {
outShape = Image2DInfo(out_tensors_[0]);
outShape = GpuTensorInfo(out_tensors_[0]);
std::string source = activation_source;
std::set<std::string> build_options;
std::string program_name = "Activation";
ocl_runtime_->LoadSource(program_name, source);
std::string kernel_name = GetActTypeString(type_);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " init Done!";
@ -101,7 +101,7 @@ int ActivationOpenCLKernel::Run() {
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
return RET_ERROR;

@ -45,7 +45,7 @@ class ActivationOpenCLKernel : public OpenCLKernel {
cl::Kernel kernel_;
int type_;
float alpha_;
Image2DInfo outShape = Image2DInfo(nullptr);
GpuTensorInfo outShape = GpuTensorInfo(nullptr);
};
} // namespace mindspore::kernel

@ -76,7 +76,7 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
auto in_shape = in_tensors_[0]->shape();
auto in_shape_align = in_shape;
in_shape_align[3] = UP_ROUND(in_shape[3], C4NUM);
im_in_ = Image2DInfo(in_tensors_[0]);
im_in_ = GpuTensorInfo(in_tensors_[0]);
auto out_shape_align = in_shape_align;
out_shape_align.at(param->axis_) = param->axis_ == 3 ? UP_ROUND(param->topk_, C4NUM) : param->topk_;
int reduce_len = GetUpPow2(in_shape.at(param->axis_));
@ -152,8 +152,7 @@ int ArgMinMaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -43,7 +43,7 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel {
cl::Kernel kernel_;
void *buff_{nullptr};
void *ids_{nullptr};
Image2DInfo im_in_{Image2DInfo(nullptr)};
GpuTensorInfo im_in_{GpuTensorInfo(nullptr)};
cl_int4 src_size_;
cl_int4 cus_size_;
cl_int4 strides_;

@ -245,10 +245,9 @@ int ArithmeticOpenCLKernel::Prepare() {
kernel_name_ += "_BUF";
}
std::string program_name = "Arithmetic";
std::set<std::string> build_options;
std::string source = arithmetic_source;
ocl_runtime_->LoadSource(program_name, source);
error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options);
error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_);
#endif
if (error_code != RET_OK) {
return error_code;
@ -270,7 +269,7 @@ int ArithmeticOpenCLKernel::Run() {
auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -157,8 +157,7 @@ int ArithmeticSelfOpenCLKernel::Run() {
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -87,11 +87,10 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = batch_to_space_nd_source;
std::string program_name = "batch_to_space_nd";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
SetGlobalLocal();
@ -102,9 +101,9 @@ int BatchToSpaceNDOpenCLKernel::Prepare() {
int BatchToSpaceNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -91,8 +91,7 @@ int BatchNormOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -71,12 +71,11 @@ int BiasAddOpenCLKernel::Init() {
return mindspore::lite::RET_ERROR;
}
InitWeights();
std::set<std::string> build_options;
std::string source = biasadd_source;
std::string program_name = "BiasAdd";
std::string kernel_name = "BiasAdd";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
MS_LOG(DEBUG) << program_name << " Init Done!";
return mindspore::lite::RET_OK;
@ -95,7 +94,7 @@ int BiasAddOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
std::vector<size_t> local = {1, 1};
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
return mindspore::lite::RET_ERROR;

@ -100,8 +100,7 @@ int CastOpenCLKernel::Run() {
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -114,11 +114,10 @@ int ConcatOpenCLKernel::Prepare() {
}
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
std::set<std::string> build_options;
std::string source = concat_source;
std::string program_name = "Concat";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetGlobalLocal();
SetConstArgs();
@ -146,7 +145,7 @@ int ConcatOpenCLKernel::Run() {
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_
#include <vector>
#include <string>
@ -27,23 +27,27 @@
namespace mindspore::kernel {
class ConvolutionOpenCLKernel : public OpenCLKernel {
class Conv2DOpenCLKernel : public OpenCLKernel {
public:
ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
Conv2DOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
: OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
~ConvolutionOpenCLKernel() override = default;
~Conv2DOpenCLKernel() override = default;
int Init() override;
int Run() override;
int InitWeights() override;
int CheckSpecs() override;
int Prepare() override;
void SetGlobalLocal() override;
int InitWeights() override;
void SetConstArgs() override;
int Run() override;
private:
void SetBlockSize();
int InitWeight();
int InitFilter();
int InitBias();
int GenerateWinogradWeight();
int GenerateWinogradFilter();
bool UseWinograd4x4To6x6() {
const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
@ -58,8 +62,9 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_;
std::vector<size_t> global_;
std::vector<size_t> local_;
cl::NDRange global_4x4to36_, local_4x4to36_;
cl::NDRange global_conv_, local_conv_;
cl::NDRange global_36to4x4_, local_36to4x4_;
bool use_fp16_{false};
size_t sizeof_FLT_{4};
@ -95,4 +100,4 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_

@ -48,10 +48,9 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::string source = conv2d_transpose_source;
std::set<std::string> build_options;
std::string program_name = "conv2d_transpose";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
SetGlobalLocal();
@ -194,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
int arg_cnt = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return mindspore::lite::RET_OK;
}

@ -69,10 +69,9 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::string program_name = "DepthwiseConv2d";
std::set<std::string> build_options;
std::string source = depthwise_conv2d_source;
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
SetGlobalLocal();
@ -193,7 +192,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return mindspore::lite::RET_OK;
}

@ -71,16 +71,15 @@ int FullConnectionOpenCLKernel::CheckSpecs() {
int FullConnectionOpenCLKernel::Prepare() {
std::string kernel_name = "FullConnection_NHWC4";
inShape = Image2DInfo(in_tensors_[0]);
outShape = Image2DInfo(out_tensors_[0]);
inShape = GpuTensorInfo(in_tensors_[0]);
outShape = GpuTensorInfo(out_tensors_[0]);
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = fullconnection_source;
std::string program_name = "FullConnection";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
SetConstArgs();
@ -203,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() {
int arg_count = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -48,8 +48,8 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
bool transposeB{true};
float activation_min_{-FLT_MAX};
float activation_max_{FLT_MAX};
Image2DInfo inShape = Image2DInfo(nullptr);
Image2DInfo outShape = Image2DInfo(nullptr);
GpuTensorInfo inShape = GpuTensorInfo(nullptr);
GpuTensorInfo outShape = GpuTensorInfo(nullptr);
};
} // namespace mindspore::kernel

@ -30,47 +30,88 @@ using mindspore::schema::PrimitiveType_Gather;
namespace mindspore::kernel {
int GatherOpenCLKernel::CheckSpecs() { return RET_OK; }
int GatherOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 2) {
MS_LOG(ERROR) << "GatherOpenCLKernel only supports 2 input Tensor but get " << in_tensors_.size();
return RET_ERROR;
}
if (out_tensors_.size() != 1) {
MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1 output Tensor but get " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_.at(1)->category() == lite::Tensor::VAR) {
MS_LOG(ERROR) << "GatherOpenCLKernel only supports indices Tensor is weight.";
return RET_ERROR;
}
int input_ndim = in_tensors_.front()->shape().size();
if (input_ndim < 0 || input_ndim > 4) {
MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1-4D input Tensor but get " << input_ndim << "D.";
return RET_ERROR;
}
int indices_ndim = in_tensors_.at(1)->shape().size();
if (indices_ndim != 1) {
MS_LOG(ERROR) << "GatherOpenCLKernel only supports 1D indices Tensor but get " << indices_ndim << "D.";
return RET_ERROR;
}
TypeId data_type = in_tensors_.at(1)->data_type();
if (data_type != kNumberTypeInt32 && data_type != kNumberTypeInt64 && data_type != kNumberTypeFloat32 &&
data_type != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Conv2D only supports Int32/Int64/Float32/Float16 indices Tensor.";
return RET_ERROR;
}
auto *param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
axis_ = param->axis_;
if (axis_ < 0) {
axis_ += input_ndim;
}
if (axis_ < 0 || axis_ >= input_ndim) {
MS_LOG(ERROR) << "axis is invalid: axis=" << axis_ << ".";
return RET_ERROR;
} else {
return RET_OK;
}
}
void GatherOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
param->axis_ = (param->axis_ + in_tensors_[0]->shape().size()) % in_tensors_[0]->shape().size();
auto input_shape = in_tensors_[0]->shape();
auto output_shape = out_tensors_[0]->shape();
int indices_num = in_tensors_[1]->ElementsNum();
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
(cl_int)out_tensors_[0]->Batch()};
auto input = GpuTensorInfo(in_tensors_.front());
auto output = GpuTensorInfo(out_tensors_.front());
int indices_num = in_tensors_.at(1)->ElementsNum();
cl_int4 src_size = {static_cast<cl_int>(input.W), static_cast<cl_int>(input.H), static_cast<cl_int>(input.Slice),
static_cast<cl_int>(input.N)};
cl_int4 dst_size = {static_cast<cl_int>(output.W), static_cast<cl_int>(output.H), static_cast<cl_int>(output.Slice),
static_cast<cl_int>(output.N)};
int arg_cnt = 3;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, param->axis_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt, axis_);
}
void GatherOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
auto output = GpuTensorInfo(out_tensors_.front());
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
(size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
std::vector<size_t> global = {output.W, output.N * output.H, output.Slice};
OpenCLKernel::AlignGlobalLocal(global, local);
}
int GatherOpenCLKernel::Prepare() {
std::string kernel_name = "gather_NHWC4";
std::string kernel_name = "gather";
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = gather_source;
std::string program_name = "gather";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->LoadSource(program_name, gather_source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
int ret = InitWeights();
if (ret != RET_OK) {
return ret;
}
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
@ -79,58 +120,42 @@ int GatherOpenCLKernel::Prepare() {
int GatherOpenCLKernel::InitWeights() {
auto indices_tensor = in_tensors_.at(1);
int indices_num = indices_tensor->ElementsNum();
bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
auto indices_num = indices_tensor->ElementsNum();
auto allocator = ocl_runtime_->GetAllocator();
if (!isIndicesInt32) {
indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
if (indices_data_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
return RET_ERROR;
}
indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
if (indices_data_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
return RET_ERROR;
}
return RET_OK;
}
int GatherOpenCLKernel::UpdateWeights() {
auto indices_tensor = in_tensors_.at(1);
int indices_num = indices_tensor->ElementsNum();
bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
if (!isIndicesInt32) {
if (indices_tensor->data_type() == kNumberTypeInt64) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<int64_t *>(indices_tensor->data_c())[i];
}
} else if (indices_tensor->data_type() == kNumberTypeFloat32) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<float *>(indices_tensor->data_c())[i];
}
} else if (indices_tensor->data_type() == kNumberTypeFloat16) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<float16_t *>(indices_tensor->data_c())[i];
}
} else {
MS_LOG(ERROR) << "Unsupported data type: " << indices_tensor->data_type();
return RET_ERROR;
auto data_type = indices_tensor->data_type();
auto data = indices_tensor->data_c();
if (data_type == kNumberTypeInt32) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<int32_t *>(data)[i];
}
} else if (data_type == kNumberTypeInt64) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<int64_t *>(data)[i];
}
} else if (data_type == kNumberTypeFloat32) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<float *>(data)[i];
}
} else if (data_type == kNumberTypeFloat16) {
for (int i = 0; i < indices_num; i++) {
indices_data_[i] = reinterpret_cast<float16_t *>(data)[i];
}
} else {
indices_data_ = reinterpret_cast<int32_t *>(indices_tensor->data_c());
}
return RET_OK;
}
int GatherOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
if (UpdateWeights() != RET_OK) {
return RET_ERROR;
}
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -45,6 +45,7 @@ class GatherOpenCLKernel : public OpenCLKernel {
private:
cl::Kernel kernel_;
int32_t *indices_data_{nullptr};
int axis_ = {0};
};
} // namespace mindspore::kernel
#endif

@ -1,128 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/opencl/kernel/hswish.h"
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/hswish.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
int HswishOpenCLKernel::Init() {
  // HSwish is element-wise; only tensors of rank <= 4 are supported by the kernel's
  // NHWC4 image layout.
  if (out_tensors_[0]->shape().size() > 4) {
    MS_LOG(ERROR) << " only support dim <= 4";
    return RET_ERROR;
  }
  const std::string program_name = "hswish";
  const std::string kernel_name = "hswish";
  std::string source = hswish_source;
  std::set<std::string> build_options;
  // Compile the embedded CL source and cache the resulting kernel handle.
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
}
void HswishGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 2, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
local->push_back(x);
local->push_back(y);
local->push_back(z);
}
int HswishOpenCLKernel::InferShapeTo4D() {
if (in_tensors_[0]->shape().size() <= 4) {
if (in_tensors_[0]->shape().size() == 1) {
N_ = in_tensors_[0]->shape()[0];
} else if (in_tensors_[0]->shape().size() == 2) {
N_ = in_tensors_[0]->shape()[0];
C_ = in_tensors_[0]->shape()[1];
} else if (in_tensors_[0]->shape().size() == 3) {
N_ = in_tensors_[0]->shape()[0];
W_ = in_tensors_[0]->shape()[1];
C_ = in_tensors_[0]->shape()[2];
} else {
N_ = in_tensors_[0]->shape()[0];
H_ = in_tensors_[0]->shape()[1];
W_ = in_tensors_[0]->shape()[2];
C_ = in_tensors_[0]->shape()[3];
}
} else {
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size();
return RET_ERROR;
}
return RET_OK;
}
int HswishOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto output_shape = out_tensors_[0]->shape();
InferShapeTo4D();
cl_int4 output_shape_ = {static_cast<cl_int>(N_), static_cast<cl_int>(H_), static_cast<cl_int>(W_),
static_cast<cl_int>(UP_DIV(C_, C4NUM))};
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1};
uint32_t OH = N_ * H_;
uint32_t OW = W_;
uint32_t OC = UP_DIV(C_, C4NUM);
std::vector<size_t> global = {OH, OW, OC};
HswishGetWorkGroup(global, &local, max_global[0]);
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
// Registry factory for HswishOpenCLKernel: allocates the kernel, runs Init,
// and cleans up on either failure path (opParameter on alloc failure, the
// kernel object — which owns opParameter — on Init failure).
kernel::LiteKernel *HswishOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                              const mindspore::lite::PrimitiveC *primitive) {
  auto *hswish_kernel = new (std::nothrow) HswishOpenCLKernel(opParameter, inputs, outputs);
  if (hswish_kernel == nullptr) {
    MS_LOG(ERROR) << " new HswishOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  if (hswish_kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: hswish ";
    delete hswish_kernel;
    return nullptr;
  }
  return hswish_kernel;
}
} // namespace mindspore::kernel

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save