!7357 support unaligned shape for opencl reshape op
Merge pull request !7357 from wandongdong/master pull/7357/MERGE
commit
56b7562bf5
@@ -1,24 +1,70 @@
|
||||
// Turn on the cl_khr_fp16 extension for this program.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
// Group width used together with UP_DIV by the kernels in this file.
#define C4NUM 4
|
||||
// Integer division of x by y rounded up (for non-negative x and positive y).
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
|
||||
// Sampler for image reads: unnormalized integer coordinates, clamped addressing, nearest-texel filtering.
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
/*
 * Copies src_data into dst_data under a new shape; one work-item produces the
 * output pixel (X, Y) = (get_global_id(0), get_global_id(1)).
 *
 * src_data  image read through smp_zero.
 * dst_data  image written at (X, Y).
 * src_size  .x is split into groups of C4NUM (CI4 groups, CI4_rem leftover lanes);
 *           CI4 * .y is the width used to turn a flat source pixel index into (x, y).
 *           NOTE(review): .x looks like the source channel count and .y the source
 *           width - confirm against the host-side argument setup.
 * dst_size  .x / .y bound X / Y; .z is split into groups of C4NUM; .w is the
 *           per-row element stride applied to Y on the unaligned path.
 *
 * FLT, FLT4, READ_IMAGE and WRITE_IMAGE are not defined in this file and must be
 * supplied by the build.
 */
__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size,
                            int4 dst_size) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int CO4 = UP_DIV(dst_size.z, C4NUM);
  int CO4_rem = dst_size.z % C4NUM;
  // Reject work-items outside the output range. Y uses >= like X so that the
  // work-item at Y == dst_size.y does not write past the last row.
  // NOTE(review): assumes dst_size.y is the number of output rows - confirm with the host code.
  if (X >= dst_size.x || Y >= dst_size.y) {
    return;
  }
  int CI4 = UP_DIV(src_size.x, C4NUM);
  int CI4_rem = src_size.x % C4NUM;
  int in_img_x = CI4 * src_size.y;
  FLT4 res = (FLT4)(0.0f);
  FLT tmp[4];
  FLT res_tmp[4];
  int gcnt = 0;
  int start = 0;
  int i = 0;
  int j = 0;
  int n = 0;
  // Bit 1: dst_size.z is not a multiple of C4NUM. Bit 0: src_size.x is not a multiple of C4NUM.
  // Only cond == 1 has a dedicated path; every other combination takes the plain copy in default.
  int cond = (((int)(CO4_rem > 0)) << 1) | (CI4_rem > 0);
  switch (cond) {
    case 1:
      // Flat element offset of this output pixel, derived from X, Y and the dst_size strides.
      start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y);
      // Index of the source pixel that holds that element.
      gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM;
      // Lane inside the first source pixel at which gathering begins.
      start = (CI4 > 1 && gcnt < CI4) ? 0 : ((X + Y * dst_size.x) * C4NUM) % src_size.x % C4NUM;
      // Gather four values from consecutive source pixels. i counts values collected,
      // n counts source pixels consumed, j is the lane inside the current source pixel
      // (start for the first pixel, 0 afterwards).
      for (i = 0, n = 0, j = start; i < 4; ++n, j = 0) {
        int X_src = (gcnt + n) % in_img_x;
        res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x));
        tmp[0] = res.x;
        tmp[1] = res.y;
        tmp[2] = res.z;
        tmp[3] = res.w;
        // The last pixel of each CI4 group contributes only its first CI4_rem lanes.
        int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : 4;
        for (; j < k && i < 4; ++j, ++i) {
          res_tmp[i] = tmp[j];
        }
      }
      res.x = res_tmp[0];
      res.y = res_tmp[1];
      res.z = res_tmp[2];
      res.w = res_tmp[3];
      WRITE_IMAGE(dst_data, (int2)(X, Y), res);
      break;
    default:
      // Pixel-for-pixel copy: the flat output pixel index is re-split using the source width.
      gcnt = X + dst_size.x * Y;
      res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x));
      WRITE_IMAGE(dst_data, (int2)(X, Y), res);
      break;
  }
}
|
||||
|
||||
/*
 * Copies src_data into dst_data under a new shape, pixel for pixel; one work-item
 * produces the output pixel (X, Y) = (get_global_id(0), get_global_id(1)).
 *
 * src_data  image read through smp_zero.
 * dst_data  image written at (X, Y).
 * src_size  UP_DIV(.x, C4NUM) * .y is the width used to turn a flat source pixel
 *           index into (x, y).
 * dst_size  .x / .y bound X / Y; .x is also the output row width used to flatten (X, Y).
 *
 * READ_IMAGE and WRITE_IMAGE are not defined in this file and must be supplied by the build.
 */
__kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size,
                             int4 dst_size) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  // Reject work-items outside the output range. Y uses >= like X so that the
  // work-item at Y == dst_size.y does not write past the last row.
  // NOTE(review): assumes dst_size.y is the number of output rows - confirm with the host code.
  if (X >= dst_size.x || Y >= dst_size.y) {
    return;
  }
  int CI4 = UP_DIV(src_size.x, C4NUM);
  int in_img_x = CI4 * src_size.y;
  // Flat index of this output pixel, re-split below using the source width.
  int gcnt = X + dst_size.x * Y;
  WRITE_IMAGE(dst_data, (int2)(X, Y), READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x)));
}
|
||||
|
Loading…
Reference in new issue