!13252 [MS][LITE][Develop] fix some problems for gpu ops

From: @pengyongrong Reviewed-by: @ddwsky,@zhanghaibo5 Signed-off-by: @ddwsky
4 years ago · d8a5b2e06a
parent 75faccfcdc 1177c11705
commit d8a5b2e06a
7 changed files with 51 additions and 32 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl
@ -8,7 +8,7 @@ __kernel void Cast_fp32_to_fp16(__read_only image2d_t input, __write_only image2
  if (x >= XY.x || y >= XY.y) {
    return;
  }
-  half4 result = convert_half4(READ_IMAGE(input, smp_none, (int2)(x, y)));
+  half4 result = convert_half4(read_imagef(input, smp_none, (int2)(x, y)));
  write_imageh(output, (int2)(x, y), result);
 }

@ -18,8 +18,8 @@ __kernel void Cast_fp32_to_fp32(__read_only image2d_t input, __write_only image2
  if (x >= XY.x || y >= XY.y) {
    return;
  }
-  float4 result = READ_IMAGE(input, smp_none, (int2)(x, y));
-  write_imageh(output, (int2)(x, y), result);
+  float4 result = read_imagef(input, smp_none, (int2)(x, y));
+  write_imagef(output, (int2)(x, y), result);
 }

 __kernel void Cast_fp16_to_fp16(__read_only image2d_t input, __write_only image2d_t output, int2 XY) {
@ -28,7 +28,7 @@ __kernel void Cast_fp16_to_fp16(__read_only image2d_t input, __write_only image2
  if (x >= XY.x || y >= XY.y) {
    return;
  }
-  half4 result = READ_IMAGE(input, smp_none, (int2)(x, y));
+  half4 result = read_imageh(input, smp_none, (int2)(x, y));
  write_imageh(output, (int2)(x, y), result);
 }

@ -38,6 +38,6 @@ __kernel void Cast_fp16_to_fp32(__read_only image2d_t input, __write_only image2
  if (x >= XY.x || y >= XY.y) {
    return;
  }
-  float4 result = convert_float4(READ_IMAGE(input, smp_none, (int2)(x, y)));
-  write_imageh(output, (int2)(x, y), result);
+  float4 result = convert_float4(read_imageh(input, smp_none, (int2)(x, y)));
+  write_imagef(output, (int2)(x, y), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/sparse_to_dense.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/sparse_to_dense.cl
@ -10,16 +10,17 @@ __kernel void SparseToDenseScalar(__read_only image2d_t input, __global float *o
    return;
  }
  FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
+  int4 index_input_int = *((int4 *)&index_input);
  int index = 0;
  if (inshapeindex1_dim == 1) {
-    index = ((int)index_input.x) * stride_w;
+    index = (index_input_int.x) * stride_w;
  } else if (inshapeindex1_dim == 2) {
-    index = ((int)index_input.x) * stride_w + ((int)index_input.y);
+    index = (index_input_int.x) * stride_w + (index_input_int.y);
  } else if (inshapeindex1_dim == 3) {
-    index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + ((int)index_input.z);
+    index = (index_input_int.x) * stride_w + (index_input_int.y) * outputshape.w * C4NUM + (index_input_int.z);
  } else {
-    index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
-            ((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
+    index = (index_input_int.x) * outputshape.y * stride_w + (index_input_int.y) * stride_w +
+            (index_input_int.z) * outputshape.w * C4NUM + index_input_int.w;
  }
  output[index] = weight;
 }
@ -33,16 +34,17 @@ __kernel void SparseToDenseVector(__read_only image2d_t input, __global float *o
    return;
  }
  FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
+  int4 index_input_int = *((int4 *)&index_input);
  int index = 0;
  if (inshapeindex1_dim == 1) {
-    index = ((int)index_input.x) * stride_w;
+    index = (index_input_int.x) * stride_w;
  } else if (inshapeindex1_dim == 2) {
-    index = ((int)index_input.x) * stride_w + (int)index_input.y;
+    index = (index_input_int.x) * stride_w + index_input_int.y;
  } else if (inshapeindex1_dim == 3) {
-    index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + (int)index_input.z;
+    index = (index_input_int.x) * stride_w + (index_input_int.y) * outputshape.w * C4NUM + index_input_int.z;
  } else {
-    index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
-            ((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
+    index = (index_input_int.x) * outputshape.y * stride_w + (index_input_int.y) * stride_w +
+            (index_input_int.z) * outputshape.w * C4NUM + index_input_int.w;
  }
  output[index] = weight_vector[X];
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
@ -45,16 +45,18 @@ int CastOpenCLKernel::CheckSpecs() {
  auto input_dtype = in_tensors_.front()->data_type();
  if (input_dtype != kNumberTypeFloat32 && input_dtype != kNumberTypeFloat16) {
    MS_LOG(ERROR) << "input dtype must be float32/float16";
+    return RET_ERROR;
  }
  auto output_dtype = out_tensors_.front()->data_type();
  if (output_dtype != kNumberTypeFloat32 && output_dtype != kNumberTypeFloat16) {
    MS_LOG(ERROR) << "output dtype must be float32/float16";
+    return RET_ERROR;
  }
  return RET_OK;
 }

 void CastOpenCLKernel::SetConstArgs() {
-  cl_int4 shape = {static_cast<int>(shape_.width), static_cast<int>(shape_.height)};
+  cl_int2 shape = {static_cast<int>(shape_.width), static_cast<int>(shape_.height)};
  ocl_runtime_->SetKernelArg(kernel_, 2, shape);
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
@ -108,10 +108,6 @@ int SparseToDenseOpenCLKernel::CheckSpecs() {
      return ERROR;
    }
  }
-  if (inshapeindex1_dim > 4) {
-    MS_LOG(ERROR) << "Unsupported input_indices[1] > 4: ";
-    return ERROR;
-  }
  auto param = reinterpret_cast<SparseToDenseParameter *>(op_parameter_);
  if (param->validate_indices_) {
    MS_LOG(ERROR) << "Unsupported unordered for in_tensors_indices";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
@ -59,15 +59,9 @@ int SplitOpenCLKernel::RunAxis0() {

 int SplitOpenCLKernel::CheckSpecs() {
  auto param = reinterpret_cast<SplitParameter *>(this->op_parameter_);
-  if (param->split_dim_) {
-    if (out_tensors_.size() != 2 || in_tensors_.size() != 1) {
-      MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
-      return RET_ERROR;
-    }
-    if (param->num_split_ != 2) {
-      MS_LOG(ERROR) << "num_split_(should be 2): " << param->num_split_;
-      return RET_ERROR;
-    }
+  if ((out_tensors_.size() != 2 || (out_tensors_.size() != 3 && param->split_dim_ == 0)) && in_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
+    return RET_ERROR;
  }
  if (in_tensors_.at(0)->IsConst()) {
    MS_LOG(ERROR) << "in_tensors_ must be tensor";
@ -79,6 +73,11 @@ int SplitOpenCLKernel::CheckSpecs() {
      return RET_ERROR;
    }
  }
+
+  if (param->num_split_ != 2 && (param->num_split_ != 3 && param->split_dim_ == 0)) {
+    MS_LOG(ERROR) << "num_split_ only supported 2 or (3 && split_dim_ = 0) yet";
+    return RET_ERROR;
+  }
  if (param->split_dim_ < 0 || param->split_dim_ > 3) {
    MS_LOG(ERROR) << "split_dim_ must between 0~3";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@ -67,8 +67,8 @@ void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
 int StackOpenCLKernel::CheckSpecs() {
  auto param = reinterpret_cast<StackParameter *>(this->op_parameter_);
  axis_ = param->axis_;
-  if (in_tensors_.size() != 2 && (axis_ != 0)) {
-    MS_LOG(ERROR) << " only support input size = 2 ";
+  if (in_tensors_.size() != 2 && out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << " only support input size = 2 and output size = 1";
    return RET_ERROR;
  }
  if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/split_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/split_tests.cc
@ -54,4 +54,24 @@ TEST_F(TestOpenCL_Split, input2_axis3) {
             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
  }
 }
+
+TEST_F(TestOpenCL_Split, input3_axis0) {
+  std::vector<int> input_shape = {8, 1, 1, 1};
+  std::vector<int> output_shape1 = {2, 1, 1, 1};
+  std::vector<int> output_shape2 = {3, 1, 1, 1};
+  std::vector<int> output_shape3 = {3, 1, 1, 1};
+  int split_dim_ = 0;
+  int num_split_ = 3;  // len of split_sizes_
+  std::vector<int> split_sizes_{2, 3, 3};
+  float input_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37};
+  float output_data1[] = {0.75, 0.06};
+  float output_data2[] = {0.74, 0.30, 0.9};
+  float output_data3[] = {0.59, 0.03, 0.37};
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter(split_dim_, num_split_, split_sizes_);
+    TestMain({{input_shape, input_data, VAR}},
+             {{output_shape1, output_data1}, {output_shape2, output_data2}, {output_shape3, output_data3}}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
 }  // namespace mindspore::lite::opencl::test