parent
147f563dfe
commit
357c156220
@ -1,36 +0,0 @@
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
#define divide_no_check(a, b) (a / b)
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 r = (FLT4)(0.0f);
|
||||
FLT window_size = 0.0f;
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
bool outside_y = y_c < 0 || y_c >= input_shape.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
|
||||
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
|
||||
window_size += !outside ? 1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.y) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.x) continue;
|
||||
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
|
||||
}
|
||||
|
||||
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
|
||||
const int4 input_shape, const int4 output_shape, const int2 stride,
|
||||
const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.y) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.x) continue;
|
||||
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
|
||||
}
|
@ -0,0 +1,126 @@
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
#define divide_no_check(a, b) (a / b)
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 r = (FLT4)(0.0f);
|
||||
FLT window_size = 0.0f;
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
bool outside_y = y_c < 0 || y_c >= input_shape.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
|
||||
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
|
||||
window_size += !outside ? 1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
|
||||
}
|
||||
|
||||
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
|
||||
const int4 input_shape, const int4 output_shape, const int2 stride,
|
||||
const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 r = (FLT4)(0.0f);
|
||||
FLT window_size = 0.0f;
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
bool outside_y = y_c < 0 || y_c >= input_shape.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
|
||||
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
|
||||
window_size += !outside ? 1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f)));
|
||||
}
|
||||
|
||||
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.y) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.x) continue;
|
||||
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
|
||||
}
|
||||
|
||||
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
|
||||
const int4 input_shape, const int4 output_shape, const int2 stride,
|
||||
const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.y) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.x) continue;
|
||||
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue