change opencl code path

pull/4815/head
chenzupeng 5 years ago
parent 98565d8b54
commit 605c2b0b16

@ -460,25 +460,21 @@ build_gtest() {
gene_clhpp() {
CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl"
for sub_dir in "${CL_SRC_DIR}"/*
do
data_type="$(basename ${sub_dir})"
if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then
continue
if [ ! -d ${CL_SRC_DIR} ]; then
return
fi
cd ${CL_SRC_DIR}/${data_type}
cd ${CL_SRC_DIR}/
rm -rf *.inc
echo "$(cd "$(dirname $0)"; pwd)"
for file_path in "${CL_SRC_DIR}/${data_type}"/*
for file_path in "${CL_SRC_DIR}"/*
do
file="$(basename ${file_path})"
inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'`
sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file}
inc_file=`echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/'`
sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file}
sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
sed -i '$a\;' ${inc_file}
done
done
}
gene_ocl_program() {

@ -2,10 +2,7 @@
#define SLICES 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define FLT4 float4
#define MIN(X, Y) (X < Y ? X : Y)
#define READ_FLT4 read_imagef
#define WRITE_FLT4 write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
int Y = get_global_id(0); // height id
int X = get_global_id(1); // weight id
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 tmp;
tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
}
}
@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
int Y = get_global_id(0); // height id
int X = get_global_id(1); // weight id
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 tmp;
tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
}
}
@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
int Y = get_global_id(0); // height id
int X = get_global_id(1); // weight id
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 tmp;
tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
}
}
@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
int Y = get_global_id(0); // height id
int X = get_global_id(1); // weight id
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 tmp;
tmp.x = 1 / (1 + exp(-in_c4.x));
tmp.y = 1 / (1 + exp(-in_c4.y));
tmp.z = 1 / (1 + exp(-in_c4.z));
tmp.w = 1 / (1 + exp(-in_c4.w));
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
}
}

@ -0,0 +1,101 @@
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
}
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] + input_b[idx];
}
__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] - input_b[idx];
}
__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] * input_b[idx];
}
__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] * input_b[idx];
}
__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = weight * input_a[idx] + bias;
}

@ -1,4 +1,3 @@
#define FLT4 float4
#define INT4 int4
#define INT2 int2
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

@ -2,9 +2,6 @@
#define SLICES 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define FLT4 float4
#define READ_FLT4 read_imagef
#define WRITE_FLT4 write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
int Y = get_global_id(0); // height id
int X = get_global_id(1); // weight id
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
FLT4 tmp;
int index = num * 4;
tmp.x = in_c4.x * alpha[index];
tmp.y = in_c4.y * alpha[index + 1];
tmp.z = in_c4.z * alpha[index + 2];
tmp.w = in_c4.w * alpha[index + 3];
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
}
}

@ -1,5 +1,4 @@
// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define FLT4 float4
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,

@ -1,8 +1,3 @@
#define FLT float
#define FLT4 float4
#define FLT16 float16
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,

@ -1,12 +1,3 @@
#ifdef ENABLE_FP16
#define FLT half
#define FLT4 half4
#define TO_FLT4 convert_half4
#else
#define FLT float
#define FLT4 float4
#define TO_FLT4 convert_float4
#endif
__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,

@ -1,61 +0,0 @@
#define FLT half
#define FLT4 half4
#define FLT16 half16
#define READ_IMAGE read_imageh
#define WRITE_IMAGE write_imageh
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
int4 src_size, int4 dst_size) {
int h = get_global_id(0);
int kh = h % 2;
int src_h = h / 2;
src_h = src_h * 2;
int w = get_global_id(1);
int kw = w % 2;
int src_w = w / 2;
src_w = src_w * 2;
int co = get_global_id(2);
if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
FLT4 r0 = (FLT4)(0.f);
FLT4 r1 = (FLT4)(0.f);
FLT4 r2 = (FLT4)(0.f);
FLT4 r3 = (FLT4)(0.f);
int base_w = (co * 4 + kh + kw * 2) * src_size.z;
for (int ci = 0; ci < src_size.z; ++ci) {
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
FLT16 weight_cache = weight[base_w++];
r0 += x0.x * weight_cache.s0123;
r0 += x0.y * weight_cache.s4567;
r0 += x0.z * weight_cache.s89ab;
r0 += x0.w * weight_cache.scdef;
r1 += x1.x * weight_cache.s0123;
r1 += x1.y * weight_cache.s4567;
r1 += x1.z * weight_cache.s89ab;
r1 += x1.w * weight_cache.scdef;
r2 += x2.x * weight_cache.s0123;
r2 += x2.y * weight_cache.s4567;
r2 += x2.z * weight_cache.s89ab;
r2 += x2.w * weight_cache.scdef;
r3 += x3.x * weight_cache.s0123;
r3 += x3.y * weight_cache.s4567;
r3 += x3.z * weight_cache.s89ab;
r3 += x3.w * weight_cache.scdef;
}
FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
r0 += bias_val;
r1 += bias_val;
r2 += bias_val;
r3 += bias_val;
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
}

@ -1,76 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define ACCUM_FLT4 half4
#define FLT half
#define FLT2 half2
#define FLT3 half3
#define FLT4 half4
#define TO_FLT4 convert_half4
#define TO_ACCUM_TYPE convert_half4
#define TO_ACCUM_FLT convert_half
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offseted = X * stride.x + padding.x;
int y_offseted = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offseted + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offseted + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c];
FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_ACCUM_TYPE(src_final * f);
}
fx_c++;
}
}
FLT4 bias_val = biases[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
}
__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offseted = X * stride.x + padding.x;
int y_offseted = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offseted + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offseted + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c];
FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_ACCUM_TYPE(src_final * f);
}
fx_c++;
}
}
FLT4 bias_val = biases[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
}

@ -1,45 +0,0 @@
#define FLT half
#define FLT4 half4
#define READ_IMAGE read_imageh
#define WRITE_IMAGE write_imageh
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;
result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;
result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;
result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
}

@ -1,34 +0,0 @@
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] + input_b[idx];
}
__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] - input_b[idx];
}
__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] * input_b[idx];
}
__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = input_a[idx] / input_b[idx];
}
__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
const unsigned int n) {
int idx = get_global_id(0);
if (idx >= n) return;
output[idx] = weight * input_a[idx] + bias;
}

@ -1,66 +0,0 @@
#define divide_no_check(a, b) (a/b)
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), weight * a + bias);
}

@ -1,32 +0,0 @@
#define FLT4 float4
#define FLT16 float16
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
int2 gid = (int2)(get_global_id(0), get_global_id(1));
int2 lid = (int2)(get_local_id(0), get_local_id(1));
FLT4 result = (FLT4)(0.0f);
bool inside = gid.x < offset_co.y;
for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
FLT16 w = weight[gid.x + i * offset_co.y];
result.x += dot(v, w.s0123);
result.y += dot(v, w.s4567);
result.z += dot(v, w.s89ab);
result.w += dot(v, w.scdef);
}
__local FLT4 temp[64][4];
temp[lid.x][lid.y] = result;
barrier(CLK_LOCAL_MEM_FENCE);
if (lid.y == 0 && inside) {
result += temp[lid.x][1];
result += temp[lid.x][2];
result += temp[lid.x][3];
if (has_bias != 0) {
result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
}
WRITE_IMAGE(output, (int2)(gid.x, 0), result);
}
}

@ -1,14 +0,0 @@
#define FLT float
#define FLT4 float4
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}

@ -1,7 +1,3 @@
#define FLT4 half4
#define FLT16 half16
#define READ_IMAGE read_imageh
#define WRITE_IMAGE write_imageh
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {

@ -1,7 +1,3 @@
#define FLT half
#define FLT4 half4
#define READ_IMAGE read_imageh
#define WRITE_IMAGE write_imageh
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0);

@ -1,5 +1,5 @@
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// what is mask and args.slices_x32
__kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
const int slices, const int slices_x32) {
@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
int tid = get_local_id(0);
float sum = 0.0f;
for (size_t i = tid; i < slices - 1; i += 32) {
float4 src = read_imagef(input, smp_none, (int2)(i, 0));
float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
sum += dot((float4)(1.0f), exp(src));
}
if ((slices - 1) % 32 == tid) {
float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
sum += dot(mask, exp(src));
}
@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];
for (size_t i = tid; i < slices - 1; i += 32) {
float4 result = read_imagef(input, smp_none, (int2)(i, 0));
float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
result = exp(result) * sum;
output[i] = result;
}
if ((slices - 1) % 32 == tid) {
float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
result = exp(result) * sum;
__global float4 *remain_ptr4 = output;
remain_ptr4 += slices - 1;

@ -1,22 +1,3 @@
#define FLT float
#define FLT4 float4
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
// enum Format {
// Format_NCHW = 0,
// Format_NHWC = 1,
// Format_NHWC4 = 2,
// Format_HWKC = 3,
// Format_HWCK = 4,
// Format_KCHW = 5,
// Format_CKHW = 6,
// Format_KHWC = 7,
// Format_CHWK = 8,
// Format_NC4HW4 = 100,
// Format_NUM_OF_FORMAT = 101,
// Format_MIN = Format_NCHW,
// Format_MAX = Format_NUM_OF_FORMAT
//};
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {

@ -1,7 +1,3 @@
#define FLT float
#define FLT4 float4
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
int X = get_global_id(0);

@ -24,7 +24,7 @@
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
}
std::string program_name = "";
std::string kernel_name = "";
std::string source = activation_source_fp32;
std::string source = activation_source;
if (type_ == ActivationType_RELU) {
program_name = "RELU";
kernel_name = "Relu";

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save