!13330 [MS][LITE][GPU]conv2d transpose support n > 1

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @ddwsky
pull/13330/MERGE
Committed-by: mindspore-ci-bot (via Gitee)
commit d1592cee9e

@@ -13,8 +13,10 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
   int rem_w = dst_w % stride.y;
   int ceil_w = dst_w / stride.y;
   dst_w = ceil_w * stride.y * 2 + rem_w;
-  int dst_c = get_global_id(2);
-  if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z) return;
+  int dst_c = get_global_id(2);  // n * c4
+  int n = dst_c / dst_size.z;
+  dst_c = dst_c % dst_size.z;
+  if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z || n >= dst_size.w) return;
   int weight_base = dst_c * src_size.z * kernel_size.x * kernel_size.y;
   FLT4 r0 = (FLT4)(0.f);
   FLT4 r1 = (FLT4)(0.f);
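Note: the added lines fold the batch into the kernel's third NDRange dimension, so get_global_id(2) now enumerates n * c4 slices and is split back into a batch index and a channel-block index. A minimal sketch of the decode (illustrative C++, names not from the patch):

// dst_size.z holds the number of output channel blocks (co4),
// so a flat Z id in [0, n * co4) decodes as below.
inline void DecodeBatchChannel(int gid2, int co4, int *batch, int *c4) {
  *batch = gid2 / co4;  // which image of the batch
  *c4 = gid2 % co4;     // which 4-channel block inside that image
}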
@@ -40,10 +42,18 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
       int kernel_w = kw_start - kw_copy;
       int weight_offset = weight_base + (kernel_h * kernel_size.y + kernel_w) * src_size.z;
       for (int ci = 0; ci < src_size.z; ++ci) {
-        FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out0_src_h));
-        FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out1_src_h));
-        FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out0_src_h));
-        FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out1_src_h));
+        FLT4 x0 = (FLT4)0.f;
+        FLT4 x2 = (FLT4)0.f;
+        if (out0_src_h < src_size.x) {
+          x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
+          x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
+        }
+        FLT4 x1 = (FLT4)0.f;
+        FLT4 x3 = (FLT4)0.f;
+        if (out1_src_h < src_size.x) {
+          x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
+          x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
+        }
         FLT16 weight_cache = weight[weight_offset++];
         r0 += x0.x * weight_cache.s0123;
         r0 += x0.y * weight_cache.s4567;
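Why the reads became guarded: with a single batch, the zero-border sampler smp_zero returned 0 for any out-of-range row, so no check was needed. With batches stacked along the image Y axis, a row index at or beyond src_size.x would land in the next batch's rows instead of the border, so the kernel now zero-fills explicitly. A sketch of the invariant (hypothetical C++ helpers):

// Batch b occupies image rows [b * src_h, (b + 1) * src_h); any logical
// row outside [0, src_h) must read as zero rather than be sampled.
inline bool RowInBatch(int h, int src_h) { return h >= 0 && h < src_h; }
inline int StackedRow(int b, int h, int src_h) { return b * src_h + h; }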
@@ -85,14 +95,14 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
     r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f));
   }
-  WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0);
+  WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h), r0);
   if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1);
+    WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r1);
   }
   if (dst_h < dst_size.x && dst_w + stride.y < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h), r2);
+    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h), r2);
   }
   if (dst_h + stride.x < dst_size.x && dst_w + stride.y < dst_size.y) {
-    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h + stride.x), r3);
+    WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r3);
   }
 }
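The writes mirror the reads: the destination row is offset by n * dst_size.x so each batch occupies its own horizontal band of the output image, while the X coordinate keeps the usual (w, c4) packing. In host-side terms (illustrative C++, not from the patch):

// NHWC4 image coordinates after this change: x packs (w, c4), y packs (n, h).
struct ImgCoord {
  int x;
  int y;
};
inline ImgCoord DstCoord(int b, int h, int w, int c4, int dst_h, int co4) {
  return {w * co4 + c4, b * dst_h + h};
}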

@@ -83,10 +83,12 @@ void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
   int co4 = UP_DIV(co, C4NUM);
   int stride_h = param->stride_h_;
   int stride_w = param->stride_w_;
+  int n = out_tensors_[0]->shape()[0];
   int oh = out_tensors_[0]->shape()[1];
   int ow = out_tensors_[0]->shape()[2];
   local_size_ = {16, 1, 16};
-  global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w), (size_t)co4};
+  global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w),
+                  (size_t)co4 * (size_t)n};
   AlignGlobalLocal(global_size_, local_size_);
 }
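Each work-item produces a 2x2 output tile, so H and W are halved (rounded up) and padded to a multiple of the stride; the Z size becomes co4 * n so a single dispatch covers every batch. A self-contained sketch of the sizing math, with UP_DIV/UP_ROUND spelled out with their usual ceiling semantics:

inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }
inline size_t UpRound(size_t x, size_t y) { return UpDiv(x, y) * y; }

// gx/gy cover the 2x2-tiled output plane; gz fuses channel blocks with batch.
void GlobalSize(size_t oh, size_t ow, size_t co4, size_t n,
                size_t stride_h, size_t stride_w, size_t g[3]) {
  g[0] = UpRound(UpDiv(oh, 2), stride_h);
  g[1] = UpRound(UpDiv(ow, 2), stride_w);
  g[2] = co4 * n;
}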
@@ -103,13 +105,14 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
   int stride_w = param->stride_w_;
   int oh = out_tensors_[0]->shape()[1];
   int ow = out_tensors_[0]->shape()[2];
+  int n = in_tensors_[0]->shape()[0];
   int h = in_tensors_[0]->shape()[1];
   int w = in_tensors_[0]->shape()[2];
   cl_int2 kernel_size = {kh, kw};
   cl_int2 stride = {stride_h, stride_w};
   cl_int2 padding = {pad_h, pad_w};
-  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
-  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
+  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), n};
+  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), n};
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
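The .w component of both shape vectors, previously hard-coded to 1, now carries the batch count; it is the value the kernel's n >= dst_size.w bound checks against. The contract, as a plain C++ stand-in for cl_int4 (illustrative only):

// Shapes as the kernel sees them: {H, W, channel blocks, batch}.
struct Int4 {
  int x, y, z, w;
};
inline Int4 PackShape(int h, int w, int channels, int n) {
  return {h, w, (channels + 3) / 4, n};  // (c + 3) / 4 == UP_DIV(c, C4NUM)
}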

@@ -38,9 +38,10 @@ OpParameter *CreateParameter(int n, int h, int w, int ci, int co, int kh, int kw
   param->dilation_h_ = 1;
   param->dilation_w_ = 1;
   param->act_type_ = ActType_No;
+  param->group_ = 1;
   *input_shape = {n, h, w, ci};
-  *weight_shape = {co, kh, kw, ci};
+  *weight_shape = {ci, kh, kw, co};
   *bias_shape = {co};
-  *output_shape = {1, oh, ow, co};
+  *output_shape = {n, oh, ow, co};
   return reinterpret_cast<OpParameter *>(param);
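The test helper now declares weights in {ci, kh, kw, co} order (input channel outermost), which is what the updated weight_data arrays below assume; the output shape also picks up the batch dimension. An illustrative flat index for the new weight layout (hypothetical helper):

// Element (ci, kh, kw, co) of a {CI, KH, KW, CO} weight tensor.
inline int WeightIdx(int c, int p, int q, int oc, int KH, int KW, int CO) {
  return ((c * KH + p) * KW + q) * CO + oc;
}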
@@ -59,9 +60,9 @@ TEST_F(TestOpenCL_Conv2dTranspose, test0) {
   int kw = 2;
   std::vector<int> pad = {0, 0, 0, 0};
   float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
-  float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
   float bias_data[] = {0.5};
-  float output_data[] = {5.5, 6.5, 17.5, 22.5, 7.5, 8.5, 27.5, 32.5, 29.5, 38.5, 41.5, 54.5, 47.5, 56.5, 67.5, 80.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5};
   for (auto fp16_enable : {false, true}) {
     std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
@@ -78,19 +79,18 @@ TEST_F(TestOpenCL_Conv2dTranspose, test1) {
   int n = 1;
   int h = 3;
   int w = 3;
-  int oh = 6;
-  int ow = 6;
+  int oh = 5;
+  int ow = 5;
   int ci = 2;
   int co = 1;
   int kh = 2;
   int kw = 2;
-  std::vector<int> pad = {0, 1, 0, 1};
+  std::vector<int> pad = {0, 0, 0, 0};
   float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
   float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
   float bias_data[] = {0.5};
-  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 23.5, 5.5, 7.5, 23.5, 33.5, 41.5, 59.5,
-                         7.5, 33.5, 9.5, 43.5, 11.5, 53.5, 59.5, 85.5, 77.5, 111.5, 95.5, 137.5,
-                         13.5, 63.5, 15.5, 73.5, 17.5, 83.5, 113.5, 163.5, 131.5, 189.5, 149.5, 215.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5,
+                         43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5};
   for (auto fp16_enable : {false, true}) {
     std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
@@ -130,4 +130,93 @@ TEST_F(TestOpenCL_Conv2dTranspose, test2) {
            {output_shape, output_data}, param, fp16_enable);
   }
 }
+
+TEST_F(TestOpenCL_Conv2dTranspose, test0MultiBatch) {
+  int n = 2;
+  int h = 2;
+  int w = 2;
+  int oh = 4;
+  int ow = 4;
+  int ci = 2;
+  int co = 1;
+  int kh = 2;
+  int kw = 2;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5,
+                         1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
+
+TEST_F(TestOpenCL_Conv2dTranspose, test1MultiBatch) {
+  int n = 2;
+  int h = 3;
+  int w = 3;
+  int oh = 5;
+  int ow = 5;
+  int ci = 2;
+  int co = 1;
+  int kh = 2;
+  int kw = 2;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5,
+                         43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5, 1.5,
+                         3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5, 43.5,
+                         11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
+
+TEST_F(TestOpenCL_Conv2dTranspose, test2MultiBatch) {
+  int n = 2;
+  int h = 2;
+  int w = 2;
+  int oh = 5;
+  int ow = 5;
+  int ci = 2;
+  int co = 1;
+  int kh = 3;
+  int kw = 3;
+  std::vector<int> pad = {0, 0, 0, 0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
+  float weight_data[] = {0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
+                         1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0};
+  float bias_data[] = {0.5};
+  float output_data[] = {1.5, 3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5,
+                         106.5, 142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5, 1.5,
+                         3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5, 106.5,
+                         142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5};
+  for (auto fp16_enable : {false, true}) {
+    std::vector<int> input_shape, weight_shape, bias_shape, output_shape;
+    auto *param =
+      CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape);
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable);
+  }
+}
 }  // namespace mindspore::lite::opencl::test
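For reference, every expected output_data array above (all tests use stride 2 and zero padding) can be reproduced with a naive NHWC deconvolution. This is a sketch for hand-checking the tests, not code from the patch:

#include <vector>

// Naive conv2d_transpose reference: weights in {ci, kh, kw, co} order,
// output positions beyond oh x ow are cropped, bias is added at the end.
std::vector<float> DeconvRef(const float *in, const float *wgt, const float *bias,
                             int n, int h, int w, int ci, int co,
                             int kh, int kw, int stride, int oh, int ow) {
  std::vector<float> out(static_cast<size_t>(n) * oh * ow * co, 0.f);
  for (int b = 0; b < n; ++b)
    for (int ih = 0; ih < h; ++ih)
      for (int iw = 0; iw < w; ++iw)
        for (int c = 0; c < ci; ++c) {
          float x = in[((b * h + ih) * w + iw) * ci + c];
          for (int p = 0; p < kh; ++p)
            for (int q = 0; q < kw; ++q) {
              int oy = ih * stride + p;  // pad is zero in these tests
              int ox = iw * stride + q;
              if (oy >= oh || ox >= ow) continue;  // cropped positions are dropped
              for (int oc = 0; oc < co; ++oc)
                out[((b * oh + oy) * ow + ox) * co + oc] +=
                    x * wgt[((c * kh + p) * kw + q) * co + oc];
            }
        }
  for (size_t i = 0; i < out.size(); ++i) out[i] += bias[i % co];
  return out;
}

For example, test0's first output is in[0,0,0,0] * wgt[0,0,0,0] + in[0,0,0,1] * wgt[1,0,0,0] + bias = 0 * 0 + 1 * 1 + 0.5 = 1.5.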
