parent
98565d8b54
commit
605c2b0b16
@ -0,0 +1,101 @@
|
||||
#define divide_no_check(a, b) (a / b)
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a + b);
|
||||
}
|
||||
|
||||
__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a - b);
|
||||
}
|
||||
|
||||
__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a * b);
|
||||
}
|
||||
|
||||
__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
|
||||
}
|
||||
|
||||
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = weight * input_a[idx] + bias;
|
||||
}
|
@ -1,4 +1,3 @@
|
||||
#define FLT4 float4
|
||||
#define INT4 int4
|
||||
#define INT2 int2
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
@ -1,5 +1,4 @@
|
||||
// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define FLT4 float4
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
|
@ -1,8 +1,3 @@
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define FLT16 float16
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
@ -1,12 +1,3 @@
|
||||
#ifdef ENABLE_FP16
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define TO_FLT4 convert_half4
|
||||
#else
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define TO_FLT4 convert_float4
|
||||
#endif
|
||||
__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
|
||||
float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
|
@ -1,61 +0,0 @@
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define FLT16 half16
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
||||
int4 src_size, int4 dst_size) {
|
||||
int h = get_global_id(0);
|
||||
int kh = h % 2;
|
||||
int src_h = h / 2;
|
||||
src_h = src_h * 2;
|
||||
int w = get_global_id(1);
|
||||
int kw = w % 2;
|
||||
int src_w = w / 2;
|
||||
src_w = src_w * 2;
|
||||
int co = get_global_id(2);
|
||||
if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
|
||||
FLT4 r0 = (FLT4)(0.f);
|
||||
FLT4 r1 = (FLT4)(0.f);
|
||||
FLT4 r2 = (FLT4)(0.f);
|
||||
FLT4 r3 = (FLT4)(0.f);
|
||||
int base_w = (co * 4 + kh + kw * 2) * src_size.z;
|
||||
for (int ci = 0; ci < src_size.z; ++ci) {
|
||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
|
||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
|
||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
|
||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
|
||||
FLT16 weight_cache = weight[base_w++];
|
||||
r0 += x0.x * weight_cache.s0123;
|
||||
r0 += x0.y * weight_cache.s4567;
|
||||
r0 += x0.z * weight_cache.s89ab;
|
||||
r0 += x0.w * weight_cache.scdef;
|
||||
|
||||
r1 += x1.x * weight_cache.s0123;
|
||||
r1 += x1.y * weight_cache.s4567;
|
||||
r1 += x1.z * weight_cache.s89ab;
|
||||
r1 += x1.w * weight_cache.scdef;
|
||||
|
||||
r2 += x2.x * weight_cache.s0123;
|
||||
r2 += x2.y * weight_cache.s4567;
|
||||
r2 += x2.z * weight_cache.s89ab;
|
||||
r2 += x2.w * weight_cache.scdef;
|
||||
|
||||
r3 += x3.x * weight_cache.s0123;
|
||||
r3 += x3.y * weight_cache.s4567;
|
||||
r3 += x3.z * weight_cache.s89ab;
|
||||
r3 += x3.w * weight_cache.scdef;
|
||||
}
|
||||
FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
|
||||
r0 += bias_val;
|
||||
r1 += bias_val;
|
||||
r2 += bias_val;
|
||||
r3 += bias_val;
|
||||
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define ACCUM_FLT4 half4
|
||||
#define FLT half
|
||||
#define FLT2 half2
|
||||
#define FLT3 half3
|
||||
#define FLT4 half4
|
||||
#define TO_FLT4 convert_half4
|
||||
#define TO_ACCUM_TYPE convert_half4
|
||||
#define TO_ACCUM_FLT convert_half
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
|
||||
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
|
||||
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offseted = X * stride.x + padding.x;
|
||||
int y_offseted = Y * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offseted + ky * dilation.y;
|
||||
bool outside_y = y_c < 0 || y_c >= src_size.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offseted + kx * dilation.x;
|
||||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 f = filters[fx_c];
|
||||
FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
|
||||
r += TO_ACCUM_TYPE(src_final * f);
|
||||
}
|
||||
fx_c++;
|
||||
}
|
||||
}
|
||||
FLT4 bias_val = biases[Z];
|
||||
FLT4 res0 = TO_FLT4(r) + bias_val;
|
||||
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
|
||||
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
|
||||
}
|
||||
|
||||
__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
|
||||
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
|
||||
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offseted = X * stride.x + padding.x;
|
||||
int y_offseted = Y * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offseted + ky * dilation.y;
|
||||
bool outside_y = y_c < 0 || y_c >= src_size.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offseted + kx * dilation.x;
|
||||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 f = filters[fx_c];
|
||||
FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
|
||||
r += TO_ACCUM_TYPE(src_final * f);
|
||||
}
|
||||
fx_c++;
|
||||
}
|
||||
}
|
||||
FLT4 bias_val = biases[Z];
|
||||
FLT4 res0 = TO_FLT4(r) + bias_val;
|
||||
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
|
||||
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
|
||||
}
|
@ -1,45 +0,0 @@
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= HW.y || Y >= C.y) {
|
||||
return;
|
||||
}
|
||||
FLT4 result[4];
|
||||
result[0] = (FLT4)(0.0f);
|
||||
result[1] = (FLT4)(0.0f);
|
||||
result[2] = (FLT4)(0.0f);
|
||||
result[3] = (FLT4)(0.0f);
|
||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
|
||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
|
||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
|
||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
|
||||
result[0].x = x0.x;
|
||||
result[0].y = x1.x;
|
||||
result[0].z = x2.x;
|
||||
result[0].w = x3.x;
|
||||
|
||||
result[1].x = x0.y;
|
||||
result[1].y = x1.y;
|
||||
result[1].z = x2.y;
|
||||
result[1].w = x3.y;
|
||||
|
||||
result[2].x = x0.z;
|
||||
result[2].y = x1.z;
|
||||
result[2].z = x2.z;
|
||||
result[2].w = x3.z;
|
||||
|
||||
result[3].x = x0.w;
|
||||
result[3].y = x1.w;
|
||||
result[3].z = x2.w;
|
||||
result[3].w = x3.w;
|
||||
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] / input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = weight * input_a[idx] + bias;
|
||||
}
|
@ -1,66 +0,0 @@
|
||||
#define divide_no_check(a, b) (a/b)
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a + b);
|
||||
}
|
||||
|
||||
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a - b);
|
||||
}
|
||||
|
||||
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a * b);
|
||||
}
|
||||
|
||||
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), weight * a + bias);
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
#define FLT4 float4
|
||||
#define FLT16 float16
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
|
||||
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
|
||||
int2 gid = (int2)(get_global_id(0), get_global_id(1));
|
||||
int2 lid = (int2)(get_local_id(0), get_local_id(1));
|
||||
FLT4 result = (FLT4)(0.0f);
|
||||
bool inside = gid.x < offset_co.y;
|
||||
for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
|
||||
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
|
||||
FLT16 w = weight[gid.x + i * offset_co.y];
|
||||
result.x += dot(v, w.s0123);
|
||||
result.y += dot(v, w.s4567);
|
||||
result.z += dot(v, w.s89ab);
|
||||
result.w += dot(v, w.scdef);
|
||||
}
|
||||
__local FLT4 temp[64][4];
|
||||
temp[lid.x][lid.y] = result;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid.y == 0 && inside) {
|
||||
result += temp[lid.x][1];
|
||||
result += temp[lid.x][2];
|
||||
result += temp[lid.x][3];
|
||||
if (has_bias != 0) {
|
||||
result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(gid.x, 0), result);
|
||||
}
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= size.x || Y >= size.y || Z >= size.z) {
|
||||
return;
|
||||
}
|
||||
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
}
|
@ -1,7 +1,3 @@
|
||||
#define FLT4 half4
|
||||
#define FLT16 half16
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
|
||||
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
|
@ -1,7 +1,3 @@
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
|
||||
int X = get_global_id(0);
|
@ -1,22 +1,3 @@
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
// enum Format {
|
||||
// Format_NCHW = 0,
|
||||
// Format_NHWC = 1,
|
||||
// Format_NHWC4 = 2,
|
||||
// Format_HWKC = 3,
|
||||
// Format_HWCK = 4,
|
||||
// Format_KCHW = 5,
|
||||
// Format_CKHW = 6,
|
||||
// Format_KHWC = 7,
|
||||
// Format_CHWK = 8,
|
||||
// Format_NC4HW4 = 100,
|
||||
// Format_NUM_OF_FORMAT = 101,
|
||||
// Format_MIN = Format_NCHW,
|
||||
// Format_MAX = Format_NUM_OF_FORMAT
|
||||
//};
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
|
||||
int4 shape) {
|
@ -1,7 +1,3 @@
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||
int X = get_global_id(0);
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue