opencl_strided_slice

pull/8383/head
wangdongxu 4 years ago
parent c324fec6d4
commit c94563c06b

@@ -1,146 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define INT2 int2
#define INT4 int4
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void slice_NHWC4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
INT4 begin, INT2 sharedNoUpdiv) {
int X = get_global_id(1); // H
int Y = get_global_id(2); // W
if (X >= out_shape.y || Y >= out_shape.z) {
return;
}
FLT4 result;
if (sharedNoUpdiv.x % 4 == 0) {
for (int i = 0; i < out_shape.w; i++) {
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y)));
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i, (X)), result);
}
} else {
int begin_position = sharedNoUpdiv.x % 4;
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y)));
if (begin_position == 1) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.y;
result.y = first.z;
result.z = first.w;
result.w = second.x;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.y = second.y;
first.z = second.z;
first.w = second.w;
}
} else if (begin_position == 2) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.z;
result.y = first.w;
result.z = second.x;
result.w = second.y;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.z = second.z;
first.w = second.w;
}
} else {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.w;
result.y = second.x;
result.z = second.y;
result.w = second.z;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.w = second.w;
}
}
}
// write the last output vector, zero-filling the channel remainder
int size = sharedNoUpdiv.y % 4;
FLT4 result_fill0;
if (size == 1) {
result_fill0.x = result.x;
result_fill0.y = 0;
result_fill0.z = 0;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
} else if (size == 2) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = 0;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
} else if (size == 3) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = result.z;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
}
}
__kernel void slice_NC4HW4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
INT4 begin, INT2 sharedNoUpdiv) {
int X = get_global_id(1); // H
int Y = get_global_id(2); // W
if (X >= out_shape.y || Y >= out_shape.z) {
return;
}
FLT4 result;
if (sharedNoUpdiv.x % 4 == 0) {
for (int i = 0; i < out_shape.w; i++) {
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
WRITE_IMAGE(output, (INT2)((Y), (i * out_shape.y + X)), result);
}
} else {
int begin_position = sharedNoUpdiv.x % 4;
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (begin.w) * input_shape.y + (X + begin.y)));
if (begin_position == 1) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.y;
result.y = first.z;
result.z = first.w;
result.w = second.x;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.y = second.y;
first.z = second.z;
first.w = second.w;
}
} else if (begin_position == 2) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.z;
result.y = first.w;
result.z = second.x;
result.w = second.y;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.z = second.z;
first.w = second.w;
}
} else {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.w;
result.y = second.x;
result.z = second.y;
result.w = second.z;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.w = second.w;
}
}
}
// write the last output vector, zero-filling the channel remainder
int size = sharedNoUpdiv.y % 4;
FLT4 result_fill0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
if (size == 1) {
result_fill0.x = result.x;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
} else if (size == 2) {
result_fill0.x = result.x;
result_fill0.y = result.y;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
} else if (size == 3) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = result.z;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
}
}
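A note on the deleted kernels above: when the slice's channel begin is not a multiple of 4, each output FLT4 must be stitched from two adjacent packed input vectors, shifted by begin % 4 lanes; that is what the first/second shuffles implement. A minimal standalone host-side model of the realignment (C++, values illustrative):

#include <array>
#include <cstdio>

// Host-side model of the deleted kernels' channel realignment: with
// begin_position = begin.c % 4, output lane k comes from packed input
// lane begin_position + k, which may span two adjacent FLT4 vectors.
int main() {
  std::array<float, 8> in = {0, 1, 2, 3, 4, 5, 6, 7};  // "first" and "second" FLT4
  int begin_position = 1;                              // begin.c % 4
  float out[4];
  for (int lane = 0; lane < 4; ++lane) {
    out[lane] = in[begin_position + lane];  // {first.y, first.z, first.w, second.x}
  }
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 1 2 3 4
  return 0;
}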

@@ -0,0 +1,59 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void strided_slice(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape,
int4 output_shape, int2 io_slices, int4 begin, int4 stride, int4 size) {
int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
int on_oh = get_global_id(0);
int ow = get_global_id(1);
int co_slice = get_global_id(2);
int on = on_oh / OH;
int oh = on_oh % OH;
if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
return;
}
FLT tmp[4];
for (int i = 0; i < 4; ++i) {
// map the output_shape index to a size index; valid because squeeze(output_shape) == squeeze(size)
// (a host-side sketch of this decomposition follows the kernel)
// for example:
// python code: B = A[1, 1:16, 2:16, 3:16]
// input_shape = [16, 16, 16, 16]
// begin = [ 1, 1, 2, 3]
// end = [ 2, 16, 16, 16]
// stride = [ 1, 1, 1, 1]
// size = [ 1, 15, 14, 13] = ceil((end - begin) / stride)
// output_shape = [ 15, 14, 13]
int idx = ((on * OH + oh) * OW + ow) * CO + co_slice * 4 + i;
int co_ = idx % size.w;
idx /= size.w;
int ow_ = idx % size.z;
idx /= size.z;
int oh_ = idx % size.y;
idx /= size.y;
int on_ = idx;
int in = begin.x + stride.x * on_;
int ih = begin.y + stride.y * oh_;
int iw = begin.z + stride.z * ow_;
int ci = begin.w + stride.w * co_;
FLT4 src = READ_IMAGE(input, smp_none, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
int offset = ci % 4;
if (offset == 0) {
tmp[i] = src.x;
} else if (offset == 1) {
tmp[i] = src.y;
} else if (offset == 2) {
tmp[i] = src.z;
} else {
tmp[i] = src.w;
}
}
FLT4 out = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);
WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out);
}
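The decomposition in the loop above can be mirrored on the host for verification. A standalone C++ sketch using the example from the kernel comment (the chosen output coordinate is illustrative):

#include <cstdio>

// Mirrors the kernel's unflattening of a flat output index against `size`,
// then maps it back to input coordinates via begin/stride.
int main() {
  const int begin[4] = {1, 1, 2, 3};
  const int stride[4] = {1, 1, 1, 1};
  const int size[4] = {1, 15, 14, 13};  // ceil((end - begin) / stride)

  int idx = (0 * 15 + 5) * 14 * 13 + 7 * 13 + 9;  // output coordinate (0, 5, 7, 9)
  int co = idx % size[3]; idx /= size[3];
  int ow = idx % size[2]; idx /= size[2];
  int oh = idx % size[1]; idx /= size[1];
  int on = idx;

  // The kernel reads the input at these coordinates before repacking.
  printf("in=%d ih=%d iw=%d ci=%d\n",
         begin[0] + stride[0] * on, begin[1] + stride[1] * oh,
         begin[2] + stride[2] * ow, begin[3] + stride[3] * co);
  // prints: in=1 ih=6 iw=9 ci=12
  return 0;
}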

@@ -1,106 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Slice;
namespace mindspore::kernel {
int SliceOpenCLKernel::Init() {
std::set<std::string> build_options;
std::string source = slice_source;
std::string program_name = "slice";
std::string kernel_name = "slice_NHWC4";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void SliceGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
local->push_back(x);
local->push_back(y);
local->push_back(z);
}
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
cl_int4 begin_ = {param->begin_[0], param->begin_[1], param->begin_[2], param->begin_[3] / 4};
cl_int2 sharedNoUpdiv = {param->begin_[3], param->size_[3]};
uint32_t OH = param->size_[1];
uint32_t OW = param->size_[2];
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {1, OH, OW};
SliceGetWorkGroup(global, &local, max_global[0]);
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) SliceOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new SliceOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: Slice ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
} // namespace mindspore::kernel

@@ -0,0 +1,192 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include <cmath>     // std::ceil
#include <deque>
#include <iterator>  // std::back_inserter
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/strided_slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/strided_slice.cl.inc"
#include "nnacl/strided_slice.h"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Slice;
using mindspore::schema::PrimitiveType_StridedSlice;
namespace mindspore::kernel {
int SliceOpenCLKernel::CheckSpecs() {
const std::string kernel_name = op_parameter_->type_ == PrimitiveType_Slice ? "Slice" : "StridedSlice";
if (in_tensors_.size() != 1) {
MS_LOG(ERROR) << kernel_name + " only supports 1 input Tensor.";
return RET_ERROR;
}
if (out_tensors_.size() != 1) {
MS_LOG(ERROR) << kernel_name + " only supports 1 output Tensor.";
return RET_ERROR;
}
auto in_ndim = in_tensors_.front()->shape().size();
if (in_ndim == 0 || in_ndim > 4) {
MS_LOG(ERROR) << kernel_name + " only supports 1D-4D input tensor";
return RET_ERROR;
}
auto out_ndim = out_tensors_.front()->shape().size();
if (out_ndim > 4) {
MS_LOG(ERROR) << kernel_name + " only supports 0D-4D output tensor";
return RET_ERROR;
}
if (InitConstArgs() != RET_OK) {
MS_LOG(ERROR) << "call SliceOpenCLKernel::InitConstArgs() failed";
return RET_ERROR;
}
return RET_OK;
}
int SliceOpenCLKernel::Prepare() {
std::set<std::string> build_options;
std::string program_name = "strided_slice";
ocl_runtime_->LoadSource(program_name, strided_slice_source);
ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options);
SetConstArgs();
SetGlobalLocal();
return RET_OK;
}
int SliceOpenCLKernel::InitConstArgs() {
auto input_info = Image2DInfo(in_tensors_.front());
auto output_info = Image2DInfo(out_tensors_.front());
input_shape_ = {static_cast<cl_int>(input_info.N), static_cast<cl_int>(input_info.H),
static_cast<cl_int>(input_info.W), static_cast<cl_int>(input_info.C)};
output_shape_ = {static_cast<cl_int>(output_info.N), static_cast<cl_int>(output_info.H),
static_cast<cl_int>(output_info.W), static_cast<cl_int>(output_info.C)};
io_slices_ = {static_cast<cl_int>(input_info.Slice), static_cast<cl_int>(output_info.Slice)};
if (op_parameter_->type_ == PrimitiveType_Slice) {
auto param = reinterpret_cast<SliceParameter *>(op_parameter_);
Broadcast2GpuShape(param->begin_, begin_.s, param->param_length_, 0);
Broadcast2GpuShape(param->size_, size_.s, param->param_length_, -1);
for (int i = 0; i < 4; ++i) {
if (begin_.s[i] < 0) {
begin_.s[i] += input_shape_.s[i];
}
if (begin_.s[i] < 0 || begin_.s[i] >= input_shape_.s[i]) {
MS_LOG(ERROR) << "Slice kernel only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i]
<< " input_shape[i]=" << input_shape_.s[i];
return RET_ERROR;
}
if (size_.s[i] < -1 || size_.s[i] == 0) {
MS_LOG(ERROR) << "Slice kernel only supports size=-1 or size>0 but size[i]=" << size_.s[i];
return RET_ERROR;
}
if (size_.s[i] == -1 || begin_.s[i] + size_.s[i] > input_shape_.s[i]) {
size_.s[i] = input_shape_.s[i] - begin_.s[i];
}
}
} else {
auto param = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
cl_int4 end = input_shape_;
Broadcast2GpuShape(param->begins_, begin_.s, param->num_axes_, 0);
Broadcast2GpuShape(param->strides_, stride_.s, param->num_axes_, 1);
Broadcast2GpuShape(param->ends_, end.s, param->num_axes_);
for (int i = 0; i < 4; ++i) {
// wrap a negative begin
if (begin_.s[i] < 0) {
begin_.s[i] += input_shape_.s[i];
}
// clamp begin into the valid index range
begin_.s[i] = std::clamp(begin_.s[i], 0, input_shape_.s[i] - 1);
// wrap a negative end
if (end.s[i] < 0) {
end.s[i] += input_shape_.s[i];
}
// clamp end into range; -1 allows a negative stride to walk past index 0
end.s[i] = std::clamp(end.s[i], -1, input_shape_.s[i]);
// validate stride against begin/end
if (stride_.s[i] > 0) {
if (begin_.s[i] >= end.s[i]) {
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_<end when stride>0";
return RET_ERROR;
}
} else if (stride_.s[i] < 0) {
if (begin_.s[i] <= end.s[i]) {
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_>end when stride<0";
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "StridedSlice kernel only supports stride!=0";
return RET_ERROR;
}
size_.s[i] = std::ceil(static_cast<float>(end.s[i] - begin_.s[i]) / static_cast<float>(stride_.s[i]));
}
}
// check that the inferred size matches the output shape (ignoring 1-sized dims)
std::vector<int> shape_not_1;
std::vector<int> size_not_1;
auto output_shape = out_tensors_.front()->shape();
std::copy_if(output_shape.begin(), output_shape.end(), std::back_inserter(shape_not_1),
[](int x) { return x > 1; });
std::copy_if(size_.s, size_.s + 4, std::back_inserter(size_not_1), [](int x) { return x > 1; });
if (shape_not_1 != size_not_1) {
MS_LOG(ERROR) << "Slice/StridedSlice kernel output shape infer error";
return RET_ERROR;
}
return RET_OK;
}
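A quick standalone check of the size formula used above, size = ceil((end - begin) / stride), after the clamping rules (values illustrative):

#include <cassert>
#include <cmath>

int main() {
  // Positive stride: begin=3, end=16, stride=2 selects 3,5,7,9,11,13,15.
  int size = static_cast<int>(std::ceil((16 - 3) / 2.0f));
  assert(size == 7);

  // Negative stride: end may be clamped to -1 so the walk can pass index 0;
  // begin=15, end=-1, stride=-2 selects 15,13,11,9,7,5,3,1.
  size = static_cast<int>(std::ceil((-1 - 15) / -2.0f));
  assert(size == 8);
  return 0;
}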
void SliceOpenCLKernel::SetConstArgs() {
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
}
void SliceOpenCLKernel::SetGlobalLocal() {
auto output_info = Image2DInfo(out_tensors_.front());
std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
const int max_divider = 8;
auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
size_t local_hw = max_work_group_size / local_c;
size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
size_t local_w = std::min(local_hw / local_h, global[1]);
std::vector<size_t> local = {local_h, local_w, local_c};
AlignGlobalLocal(global, local);
}
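To make the work-group split above concrete, a standalone re-derivation, assuming max_work_group_size = 256 and with max_divisor_leq() standing in for GetMaxDivisorStrategy0 (the helper name and the global sizes are illustrative):

#include <algorithm>
#include <cstdio>

// Largest divisor of n that is <= cap; stands in for GetMaxDivisorStrategy0.
static size_t max_divisor_leq(size_t n, size_t cap) {
  for (size_t d = cap; d > 1; --d) {
    if (n % d == 0) return d;
  }
  return 1;
}

#define UP_DIV(x, y) (((x) + (y)-1) / (y))

int main() {
  size_t global[3] = {28, 28, 16};  // {N * H, W, CO_SLICES}
  size_t max_work_group_size = 256;
  size_t local_c = max_divisor_leq(global[2], 8);             // 8
  size_t local_hw = max_work_group_size / local_c;            // 32
  size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);  // 14
  size_t local_w = std::min(local_hw / local_h, global[1]);   // 2
  printf("local = {%zu, %zu, %zu}\n", local_h, local_w, local_c);  // 14 * 2 * 8 = 224 <= 256
  return 0;
}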
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
} // namespace mindspore::kernel

@@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
@@ -31,12 +31,23 @@ class SliceOpenCLKernel : public OpenCLKernel {
~SliceOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int Run() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
int InitConstArgs();
cl::Kernel kernel_;
cl_int4 input_shape_{};
cl_int4 output_shape_{};
cl_int2 io_slices_{};
cl_int4 begin_{};
cl_int4 stride_{{1, 1, 1, 1}};
cl_int4 size_{};
};
} // namespace mindspore::kernel

@@ -34,33 +34,67 @@ struct OpenCLToFormatParameter {
lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
};
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num) {
auto *N = dst;
auto *H = dst + 1;
auto *W = dst + 2;
auto *C = dst + 3;
if (src_num == 1) {
*N = src[0];
} else if (src_num == 2) {
*N = src[0];
*C = src[1];
} else if (src_num == 3) {
*N = src[0];
*W = src[1];
*C = src[2];
} else if (src_num == 4) {
*N = src[0];
*H = src[1];
*W = src[2];
*C = src[3];
} else if (src_num >= 5) {
MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
}
}
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num, DstT default_value) {
for (int i = 0; i < 4; ++i) {
dst[i] = default_value;
}
Broadcast2GpuShape(src, dst, src_num);
}
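How the broadcast lands in NHWC slots, as a short sketch against the functions above (values illustrative):

// src = {7}          -> dst = {7, d, d, d}   (N)
// src = {7, 3}       -> dst = {7, d, d, 3}   (N, C)
// src = {7, 5, 3}    -> dst = {7, d, 5, 3}   (N, W, C)
// src = {7, 9, 5, 3} -> dst = {7, 9, 5, 3}   (N, H, W, C)
// where d is the prefilled default_value (the strided-slice kernel above
// uses 0 for begin, 1 for stride, and -1 for size).
int begin_src[3] = {1, 2, 3};
int begin_dst[4];
Broadcast2GpuShape(begin_src, begin_dst, 3, 0);  // begin_dst == {1, 0, 2, 3}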
struct Image2DInfo {
explicit Image2DInfo(const lite::Tensor *tensor) {
if (tensor == nullptr) {
return;
}
auto shape = tensor->shape();
if (shape.size() == 1) {
auto ndim = shape.size();
if (ndim == 1) {
N = shape[0];
} else if (shape.size() == 2) {
} else if (ndim == 2) {
N = shape[0];
C = shape[1];
} else if (shape.size() == 3) {
} else if (ndim == 3) {
N = shape[0];
W = shape[1];
C = shape[2];
} else if (shape.size() == 4) {
} else if (ndim == 4) {
N = shape[0];
H = shape[1];
W = shape[2];
C = shape[3];
} else if (shape.size() >= 5) {
MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
} else if (ndim >= 5) {
MS_LOG(ERROR) << "GPU doesn't support Tensor with ndim>=" << ndim;
}
Slice = UP_DIV(C, C4NUM);
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
FLT4_size = FLT_size * 4;
Slice = UP_DIV(C, C4NUM);
if (W * Slice <= MAX_IMAGE2D_SIZE) {
height = N * H;
width = W * Slice;

@@ -15,11 +15,17 @@
*/
#include <string>
#include "src/common/log_adapter.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "common/common_test.h"
#include "src/kernel_registry.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
using mindspore::kernel::LiteKernel;
using mindspore::kernel::SubGraphOpenCLKernel;
using mindspore::lite::KernelRegistry;
using mindspore::lite::Tensor;
using mindspore::schema::Format::Format_NHWC;
namespace mindspore {
void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
@@ -35,4 +41,80 @@ void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
}
}
void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable,
float atol, bool print_output) {
MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
auto ocl_runtime = runtime_wrapper.GetInstance();
EXPECT_TRUE(ocl_runtime->Init() == RET_OK);
ocl_runtime->SetFp16Enable(fp16_enable);
auto allocator = ocl_runtime->GetAllocator();
MS_LOG(DEBUG) << "create Tensors & init weight data";
std::vector<Tensor> tensors;
std::vector<Tensor *> kernel_inputs;
std::vector<Tensor *> subgraph_inputs;
std::map<Tensor *, float *> subgraph_inputs_data;
for (auto input_info : input_infos) {
const std::vector<int> &shape = std::get<0>(input_info);
auto *input_data = std::get<1>(input_info);
const Tensor::Category category = std::get<2>(input_info);
tensors.emplace_back(kNumberTypeFloat32, shape, Format_NHWC, category);
auto *new_tensor = &tensors.back();
kernel_inputs.push_back(new_tensor);
if (category != Tensor::Category::VAR) {
memcpy(new_tensor->MutableData(), input_data, new_tensor->Size());
} else {
subgraph_inputs.push_back(new_tensor);
subgraph_inputs_data[new_tensor] = input_data;
}
}
const std::vector<int> &output_shape = std::get<0>(output_info);
float *expect_data = std::get<1>(output_info);
auto output = Tensor(kNumberTypeFloat32, output_shape, Format_NHWC, Tensor::Category::VAR);
MS_LOG(DEBUG) << "create OpenCL Kernel";
auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);
kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type};
auto creator = KernelRegistry::GetInstance()->GetCreator(key);
if (creator == nullptr) {
std::cerr << "get kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
free(op_parameter);
FAIL();
}
auto *kernel = creator(kernel_inputs, {&output}, op_parameter, nullptr, key, nullptr);
if (kernel == nullptr) {
std::cerr << "call kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
free(op_parameter);
FAIL();
}
MS_LOG(DEBUG) << "create SubGraph & init input data";
std::vector<LiteKernel *> kernels{kernel};
auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels);
if (sub_graph == nullptr) {
return;
}
for (auto input : subgraph_inputs) {
EXPECT_TRUE(input->MallocData(allocator) == RET_OK);
}
EXPECT_TRUE(sub_graph->Init() == RET_OK);
for (auto input : subgraph_inputs) {
memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
}
MS_LOG(DEBUG) << "run SubGraph & compare result";
EXPECT_TRUE(sub_graph->Run() == RET_OK);
if (print_output) {
for (int i = 0; i < output.ElementsNum(); ++i) {
printf("%d: expect=%.3f output=%.3f\n", i, expect_data[i], reinterpret_cast<float *>(output.data_c())[i]);
}
}
CommonTest::CompareOutputData(reinterpret_cast<float *>(output.data_c()), expect_data, output.ElementsNum(), atol);
MS_LOG(DEBUG) << "release resources";
delete sub_graph;
}
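A hypothetical call site for TestMain, sketched under the assumption that SliceParameter embeds OpParameter as its first member (which the reinterpret_casts above imply); the helper name, shapes, and data are illustrative, and slicing an all-zero tensor keeps the expected output trivially all zeros:

// Assumes the test file's includes plus <cstdlib>; all names/values illustrative.
void RunSliceSmokeTest() {
  std::vector<int> in_shape = {1, 2, 2, 8};
  std::vector<int> out_shape = {1, 2, 2, 4};
  static float input[32] = {};   // all zeros
  static float expect[16] = {};  // slicing zeros yields zeros
  auto *param = static_cast<SliceParameter *>(malloc(sizeof(SliceParameter)));
  memset(param, 0, sizeof(SliceParameter));
  param->op_parameter_.type_ = schema::PrimitiveType_Slice;
  param->param_length_ = 4;
  int begin[4] = {0, 0, 0, 4};
  int size[4] = {1, 2, 2, 4};
  std::copy(begin, begin + 4, param->begin_);
  std::copy(size, size + 4, param->size_);
  TestMain({{in_shape, input, Tensor::Category::VAR}}, {out_shape, expect},
           reinterpret_cast<OpParameter *>(param), /*fp16_enable=*/false, /*atol=*/1e-3f);
}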
} // namespace mindspore

@@ -14,16 +14,18 @@
* limitations under the License.
*/
#ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#include <string>
#include <iostream>
#include "tests/ut/cpp/common/common_test.h"
#include "src/common/log_adapter.h"
#include <vector>
#include <tuple>
#include <map>
#include "mindspore/lite/src/tensor.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
using mindspore::lite::Tensor;
namespace mindspore {
@@ -63,6 +65,10 @@ void CompareOutput(lite::Tensor *output_tensor, const std::string &file_path, T
CompareOutput(output_tensor->data_c(), expect_data, output_tensor->ElementsNum(), atol, rtol);
}
void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable = false,
float atol = 10e-9, bool print_output = false);
} // namespace mindspore
#endif // TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
