!13330 [MS][LITE][GPU]conv2d transpose support n > 1

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @ddwsky
pull/13330/MERGE
Committed-by: mindspore-ci-bot (via Gitee)
commit d1592cee9e

@@ -13,8 +13,10 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
   int rem_w = dst_w % stride.y;
   int ceil_w = dst_w / stride.y;
   dst_w = ceil_w * stride.y * 2 + rem_w;
-  int dst_c = get_global_id(2);
-  if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z) return;
+  int dst_c = get_global_id(2);  // n * c4
+  int n = dst_c / dst_size.z;
+  dst_c = dst_c % dst_size.z;
+  if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z || n >= dst_size.w) return;
   int weight_base = dst_c * src_size.z * kernel_size.x * kernel_size.y;
   FLT4 r0 = (FLT4)(0.f);
   FLT4 r1 = (FLT4)(0.f);
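Note: the added lines fold the batch into the kernel's third NDRange dimension, so get_global_id(2) now enumerates n * c4 slices and is split back into a batch index and a channel-block index. A minimal sketch of the decode (illustrative C++, names not from the patch):

// dst_size.z holds the number of output channel blocks (co4),
// so a flat Z id in [0, n * co4) decodes as below.
inline void DecodeBatchChannel(int gid2, int co4, int *batch, int *c4) {
  *batch = gid2 / co4;  // which image of the batch
  *c4 = gid2 % co4;     // which 4-channel block inside that image
}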
@@ -40,10 +42,18 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
       int kernel_w = kw_start - kw_copy;
       int weight_offset = weight_base + (kernel_h * kernel_size.y + kernel_w) * src_size.z;
       for (int ci = 0; ci < src_size.z; ++ci) {
-        FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out0_src_h));
-        FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out1_src_h));
-        FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out0_src_h));
-        FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out1_src_h));
+        FLT4 x0 = (FLT4)0.f;
+        FLT4 x2 = (FLT4)0.f;
+        if (out0_src_h < src_size.x) {
+          x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
+          x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
+        }
+        FLT4 x1 = (FLT4)0.f;
+        FLT4 x3 = (FLT4)0.f;
+        if (out1_src_h < src_size.x) {
+          x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
+          x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
+        }
         FLT16 weight_cache = weight[weight_offset++];
         r0 += x0.x * weight_cache.s0123;
         r0 += x0.y * weight_cache.s4567;
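Why the reads became guarded: with a single batch, the zero-border sampler smp_zero returned 0 for any out-of-range row, so no check was needed. With batches stacked along the image Y axis, a row index at or beyond src_size.x would land in the next batch's rows instead of the border, so the kernel now zero-fills explicitly. A sketch of the invariant (hypothetical C++ helpers):

// Batch b occupies image rows [b * src_h, (b + 1) * src_h); any logical
// row outside [0, src_h) must read as zero rather than be sampled.
inline bool RowInBatch(int h, int src_h) { return h >= 0 && h < src_h; }
inline int StackedRow(int b, int h, int src_h) { return b * src_h + h; }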
@@ -85,14 +95,14 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
     r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f));
   }
-  WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0);
+  WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h), r0);
   if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1);
+    WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r1);
   }
   if (dst_h < dst_size.x && dst_w + stride.y < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h), r2);
+    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h), r2);
   }
   if (dst_h + stride.x < dst_size.x && dst_w + stride.y < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h + stride.x), r3);
+    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r3);
   }
 }
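The writes mirror the reads: the destination row is offset by n * dst_size.x so each batch occupies its own horizontal band of the output image, while the X coordinate keeps the usual (w, c4) packing. In host-side terms (illustrative C++, not from the patch):

// NHWC4 image coordinates after this change: x packs (w, c4), y packs (n, h).
struct ImgCoord {
  int x;
  int y;
};
inline ImgCoord DstCoord(int b, int h, int w, int c4, int dst_h, int co4) {
  return {w * co4 + c4, b * dst_h + h};
}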

@@ -83,10 +83,12 @@ void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
   int co4 = UP_DIV(co, C4NUM);
   int stride_h = param->stride_h_;
   int stride_w = param->stride_w_;
+  int n = out_tensors_[0]->shape()[0];
   int oh = out_tensors_[0]->shape()[1];
   int ow = out_tensors_[0]->shape()[2];
   local_size_ = {16, 1, 16};
-  global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w), (size_t)co4};
+  global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w),
+                  (size_t)co4 * (size_t)n};
   AlignGlobalLocal(global_size_, local_size_);
 }
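Each work-item produces a 2x2 output tile, so H and W are halved (rounded up) and padded to a multiple of the stride; the Z size becomes co4 * n so a single dispatch covers every batch. A self-contained sketch of the sizing math, with UP_DIV/UP_ROUND spelled out with their usual ceiling semantics:

inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }
inline size_t UpRound(size_t x, size_t y) { return UpDiv(x, y) * y; }

// gx/gy cover the 2x2-tiled output plane; gz fuses channel blocks with batch.
void GlobalSize(size_t oh, size_t ow, size_t co4, size_t n,
                size_t stride_h, size_t stride_w, size_t g[3]) {
  g[0] = UpRound(UpDiv(oh, 2), stride_h);
  g[1] = UpRound(UpDiv(ow, 2), stride_w);
  g[2] = co4 * n;
}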
@@ -103,13 +105,14 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
   int stride_w = param->stride_w_;
   int oh = out_tensors_[0]->shape()[1];
   int ow = out_tensors_[0]->shape()[2];
+  int n = in_tensors_[0]->shape()[0];
   int h = in_tensors_[0]->shape()[1];
   int w = in_tensors_[0]->shape()[2];
   cl_int2 kernel_size = {kh, kw};
   cl_int2 stride = {stride_h, stride_w};
   cl_int2 padding = {pad_h, pad_w};
-  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
-  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
+  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), n};
+  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), n};
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
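The .w component of both shape vectors, previously hard-coded to 1, now carries the batch count; it is the value the kernel's n >= dst_size.w bound checks against. The contract, as a plain C++ stand-in for cl_int4 (illustrative only):

// Shapes as the kernel sees them: {H, W, channel blocks, batch}.
struct Int4 {
  int x, y, z, w;
};
inline Int4 PackShape(int h, int w, int channels, int n) {
  return {h, w, (channels + 3) / 4, n};  // (c + 3) / 4 == UP_DIV(c, C4NUM)
}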

@@ -38,9 +38,10 @@ OpParameter *CreateParameter(int n, int h, int w, int ci, int co, int kh, int kw
   param->dilation_h_ = 1;
   param->dilation_w_ = 1;
   param->act_type_ = ActType_No;
+  param->group_ = 1;
   *input_shape = {n, h, w, ci};
-  *weight_shape = {co, kh, kw, ci};
+  *weight_shape = {ci, kh, kw, co};
   *bias_shape = {co};
-  *output_shape = {1, oh, ow, co};
+  *output_shape = {n, oh, ow, co};
   return reinterpret_cast<OpParameter *>(param);
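The test helper now declares weights in {ci, kh, kw, co} order (input channel outermost), which is what the updated weight_data arrays below assume; the output shape also picks up the batch dimension. An illustrative flat index for the new weight layout (hypothetical helper):

// Element (ci, kh, kw, co) of a {CI, KH, KW, CO} weight tensor.
inline int WeightIdx(int c, int p, int q, int oc, int KH, int KW, int CO) {
  return ((c * KH + p) * KW + q) * CO + oc;
}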
@@ -59,9 +60,9 @@ TEST_F(TestOpenCL_Conv2dTranspose, test0) {
   int kw = 2;
   std::vector<int> pad = {0, 0, 0, 0};
   float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
-  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
   float bias_data[] = {0.5};
-  float output_data[] = {5.5, 6.5, 17.5, 22.5, 7.5, 8.5, 27.5, 32.5, 29.5, 38.5, 41.5, 54.5, 47.5, 56.5, 67.5, 80.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5};
   for (auto fp16_enable : {false, true}) {
     std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
@@ -78,19 +79,18 @@ TEST_F(TestOpenCL_Conv2dTranspose, test1) {
   int n = 1;
   int h = 3;
   int w = 3;
-  int oh = 6;
-  int ow = 6;
+  int oh = 5;
+  int ow = 5;
   int ci = 2;
   int co = 1;
   int kh = 2;
   int kw = 2;
-  std::vector<int> pad = {0, 1, 0, 1};
+  std::vector<int> pad = {0, 0, 0, 0};
   float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
   float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
   float bias_data[] = {0.5};
-  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 23.5, 5.5, 7.5, 23.5, 33.5, 41.5, 59.5,
-                         7.5, 33.5, 9.5, 43.5, 11.5, 53.5, 59.5, 85.5, 77.5, 111.5, 95.5, 137.5,
-                         13.5, 63.5, 15.5, 73.5, 17.5, 83.5, 113.5, 163.5, 131.5, 189.5, 149.5, 215.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5,
+                         43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5};
   for (auto fp16_enable : {false, true}) {
     std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
@@ -130,4 +130,93 @@ TEST_F(TestOpenCL_Conv2dTranspose, test2) {
            {output_shape, output_data}, param, fp16_enable);
   }
 }
+
+TEST_F(TestOpenCL_Conv2dTranspose, test0MultiBatch) {
+  int n = 2;
+  int h = 2;
+  int w = 2;
+  int oh = 4;
+  int ow = 4;
+  int ci = 2;
+  int co = 1;
+  int kh = 2;
+  int kw = 2;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5,
+                         1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
+
+TEST_F(TestOpenCL_Conv2dTranspose, test1MultiBatch) {
+  int n = 2;
+  int h = 3;
+  int w = 3;
+  int oh = 5;
+  int ow = 5;
+  int ci = 2;
+  int co = 1;
+  int kh = 2;
+  int kw = 2;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5,
+                         43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5, 1.5,
+                         3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5, 43.5,
+                         11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
+
+TEST_F(TestOpenCL_Conv2dTranspose, test2MultiBatch) {
+  int n = 2;
+  int h = 2;
+  int w = 2;
+  int oh = 5;
+  int ow = 5;
+  int ci = 2;
+  int co = 1;
+  int kh = 3;
+  int kw = 3;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
+  float weight_data[] = {0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
+                         1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5,
+                         106.5, 142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5, 1.5,
+                         3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5, 106.5,
+                         142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
 }  // namespace mindspore::lite::opencl::test
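For reference, every expected output_data array above (all tests use stride 2 and zero padding) can be reproduced with a naive NHWC deconvolution. This is a sketch for hand-checking the tests, not code from the patch:

#include <vector>

// Naive conv2d_transpose reference: weights in {ci, kh, kw, co} order,
// output positions beyond oh x ow are cropped, bias is added at the end.
std::vector<float> DeconvRef(const float *in, const float *wgt, const float *bias,
                             int n, int h, int w, int ci, int co,
                             int kh, int kw, int stride, int oh, int ow) {
  std::vector<float> out(static_cast<size_t>(n) * oh * ow * co, 0.f);
  for (int b = 0; b < n; ++b)
    for (int ih = 0; ih < h; ++ih)
      for (int iw = 0; iw < w; ++iw)
        for (int c = 0; c < ci; ++c) {
          float x = in[((b * h + ih) * w + iw) * ci + c];
          for (int p = 0; p < kh; ++p)
            for (int q = 0; q < kw; ++q) {
              int oy = ih * stride + p;  // pad is zero in these tests
              int ox = iw * stride + q;
              if (oy >= oh || ox >= ow) continue;  // cropped positions are dropped
              for (int oc = 0; oc < co; ++oc)
                out[((b * oh + oy) * ow + ox) * co + oc] +=
                    x * wgt[((c * kh + p) * kw + q) * co + oc];
            }
        }
  for (size_t i = 0; i < out.size(); ++i) out[i] += bias[i % co];
  return out;
}

For example, test0's first output is in[0,0,0,0] * wgt[0,0,0,0] + in[0,0,0,1] * wgt[1,0,0,0] + bias = 0 * 0 + 1 * 1 + 0.5 = 1.5.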
