refactor op code

pull/8360/head
chenzupeng 4 years ago
parent 147f563dfe
commit 357c156220

@@ -1,36 +0,0 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
}

@@ -1,119 +1,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void conv2d_transpose2x2_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight,
__read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size,
int2 stride, int2 padding, int4 src_size, int4 dst_size) {
int h = get_global_id(0);
int kh = h % 2;
int src_h = h / 2;
src_h = src_h * 2;
int w = get_global_id(1);
int kw = w % 2;
int src_w = w / 2;
src_w = src_w * 2;
int co = get_global_id(2);
if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
FLT4 r0 = (FLT4)(0.f);
FLT4 r1 = (FLT4)(0.f);
FLT4 r2 = (FLT4)(0.f);
FLT4 r3 = (FLT4)(0.f);
int base_w = (co * 4 + kh * 2 + kw) * src_size.z;
for (int ci = 0; ci < src_size.z; ++ci) {
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
FLT16 weight_cache = weight[base_w++];
r0 += x0.x * weight_cache.s0123;
r0 += x0.y * weight_cache.s4567;
r0 += x0.z * weight_cache.s89ab;
r0 += x0.w * weight_cache.scdef;
r1 += x1.x * weight_cache.s0123;
r1 += x1.y * weight_cache.s4567;
r1 += x1.z * weight_cache.s89ab;
r1 += x1.w * weight_cache.scdef;
r2 += x2.x * weight_cache.s0123;
r2 += x2.y * weight_cache.s4567;
r2 += x2.z * weight_cache.s89ab;
r2 += x2.w * weight_cache.scdef;
r3 += x3.x * weight_cache.s0123;
r3 += x3.y * weight_cache.s4567;
r3 += x3.z * weight_cache.s89ab;
r3 += x3.w * weight_cache.scdef;
}
FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
r0 += bias_val;
r1 += bias_val;
r2 += bias_val;
r3 += bias_val;
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
}
__kernel void conv2d_transpose2x2_NC4HW4(__read_only image2d_t src_data, __global FLT16 *weight,
__read_only image2d_t biases, __write_only image2d_t dst_data,
int2 kernel_size, int2 stride, int2 padding, int4 src_size, int4 dst_size) {
int h = get_global_id(0);
int kh = h % 2;
int src_h = h / 2;
src_h = src_h * 2;
int w = get_global_id(1);
int kw = w % 2;
int src_w = w / 2;
src_w = src_w * 2;
int co = get_global_id(2);
if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
FLT4 r0 = (FLT4)(0.f);
FLT4 r1 = (FLT4)(0.f);
FLT4 r2 = (FLT4)(0.f);
FLT4 r3 = (FLT4)(0.f);
int base_w = (co * 4 + kh * 2 + kw) * src_size.z;
for (int ci = 0; ci < src_size.z; ++ci) {
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h + 1));
FLT16 weight_cache = weight[base_w++];
r0 += x0.x * weight_cache.s0123;
r0 += x0.y * weight_cache.s4567;
r0 += x0.z * weight_cache.s89ab;
r0 += x0.w * weight_cache.scdef;
r1 += x1.x * weight_cache.s0123;
r1 += x1.y * weight_cache.s4567;
r1 += x1.z * weight_cache.s89ab;
r1 += x1.w * weight_cache.scdef;
r2 += x2.x * weight_cache.s0123;
r2 += x2.y * weight_cache.s4567;
r2 += x2.z * weight_cache.s89ab;
r2 += x2.w * weight_cache.scdef;
r3 += x3.x * weight_cache.s0123;
r3 += x3.y * weight_cache.s4567;
r3 += x3.z * weight_cache.s89ab;
r3 += x3.w * weight_cache.scdef;
}
FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
r0 += bias_val;
r1 += bias_val;
r2 += bias_val;
r3 += bias_val;
WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh), r0);
WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh + 2), r1);
WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh), r2);
WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh + 2), r3);
}
__kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight,
__read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size,
__kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
__global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size,
int2 stride, int2 padding, int4 src_size, int4 dst_size) {
int dst_h = get_global_id(0);
int rem_h = dst_h % stride.x;
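The 2x2 stride-2 kernels above split the transposed convolution by output parity: each work-item decodes (src_h, kh) and (src_w, kw) from its global ID and writes four outputs spaced two pixels apart, reusing one FLT16 weight block per input slice. A minimal sketch of the weight indexing implied by `base_w = (co * 4 + kh * 2 + kw) * src_size.z` (helper name hypothetical, not part of this diff):

// Hypothetical helper mirroring the kernels' weight walk: for output slice co,
// parity (kh, kw) selects one of four 4x4 blocks, then ci walks input slices.
inline int WeightBlockIndex(int co, int kh, int kw, int ci, int ci4) {
  return (co * 4 + kh * 2 + kw) * ci4 + ci;  // == base_w + ci in the kernel loop
}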

@@ -2,8 +2,8 @@
#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int4 in_shape, int2 out_shape, float act_min,
__kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
__read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min,
float act_max) {
int gidx = get_global_id(0); // CO4
int gidz = get_global_id(2); // N

@@ -2,7 +2,7 @@
#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidz = get_global_id(2); // N
@@ -32,37 +32,7 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
}
}
__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidz = get_global_id(2); // N
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.z;
bool inside = gidx < co4 && gidz < n;
FLT4 result = (FLT4)(0.0f);
for (uint i = lidy; i < ci4 && inside; i += 4) {
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0));
FLT16 w = weight[i * co4 + gidx];
result.x += dot(v, w.s0123);
result.y += dot(v, w.s4567);
result.z += dot(v, w.s89ab);
result.w += dot(v, w.scdef);
}
__local FLT4 temp[32][4];
temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidy == 0 && inside) {
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
}
}
__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
@@ -95,39 +65,3 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh
WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
}
}
__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.x;
int h = out_shape.y;
int w = out_shape.z;
int nh_index = gidy / 4;
bool inside = gidx < co4 && gidz < w && nh_index < n * h;
int n_index = nh_index / h;
int h_index = nh_index % h;
FLT4 result = (FLT4)(0.0f);
for (uint i = lidy; i < ci4 && inside; i += 4) {
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index));
FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
result.x += dot(v, weight_value.s0123);
result.y += dot(v, weight_value.s4567);
result.z += dot(v, weight_value.s89ab);
result.w += dot(v, weight_value.scdef);
}
__local FLT4 temp[32][4];
temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidy == 0 && inside) {
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
}
}
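In the NC4HW4 variants removed above, each of the four local-y lanes accumulates a stride-4 slice of the ci4 reduction, and the partial sums meet in `temp[lidx][lidy]` after the barrier. A CPU analogue of that split, assuming a plain float array in place of the FLT4 image reads (names hypothetical):

// Lane `lane` (0..3) sums elements lane, lane+4, lane+8, ... of v[0..ci4),
// mirroring `for (uint i = lidy; i < ci4; i += 4)` in the kernel.
float LanePartialSum(const float *v, int ci4, int lane) {
  float acc = 0.0f;
  for (int i = lane; i < ci4; i += 4) acc += v[i];
  return acc;
}
// The lidy == 0 branch then plays the role of:
//   total = LanePartialSum(v, ci4, 0) + LanePartialSum(v, ci4, 1)
//         + LanePartialSum(v, ci4, 2) + LanePartialSum(v, ci4, 3);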

@@ -1,61 +0,0 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
}
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
}

@@ -0,0 +1,126 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
}
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f)));
}
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
}
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
return;
}
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
}
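Note that the average kernels in the merged pooling2d.cl divide by `window_size`, the count of in-bounds taps, rather than by `kernel_size.x * kernel_size.y`, so windows overhanging the border are not diluted by zero padding. A host-side sketch of the same rule for a single-channel row-major input (helper hypothetical):

// Boundary-aware average: only in-bounds taps enter both the sum and the divisor.
float AvgPoolAt(const float *in, int H, int W, int oy, int ox,
                int stride, int ksize, int pad) {
  float sum = 0.0f, count = 0.0f;
  int ys = oy * stride - pad, xs = ox * stride - pad;
  for (int ky = 0; ky < ksize; ++ky) {
    int y = ys + ky;
    if (y < 0 || y >= H) continue;
    for (int kx = 0; kx < ksize; ++kx) {
      int x = xs + kx;
      if (x < 0 || x >= W) continue;
      sum += in[y * W + x];
      count += 1.0f;
    }
  }
  return count > 0.0f ? sum / count : 0.0f;  // the kernel itself divides unchecked
}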

@@ -42,30 +42,41 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
int ActivationOpenClKernel::Init() {
std::map<int, std::string> kernel_names{
std::string ActivationOpenCLKernel::GetActTypeString(int act_type) {
static std::map<int, std::string> supported_act_type = {
{ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"},
{ActivationType_RELU6, "Relu6"}, {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"},
{ActivationType_HSWISH, "HSwish"}};
if (kernel_names.count(type_) == 0) {
auto result_iter = supported_act_type.find(act_type);
if (result_iter != supported_act_type.end()) {
return result_iter->second;
}
return "";
}
int ActivationOpenCLKernel::CheckSpecs() {
if (GetActTypeString(type_).empty()) {
MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found";
return mindspore::lite::RET_ERROR;
return RET_ERROR;
}
return RET_OK;
}
int ActivationOpenCLKernel::Prepare() {
outShape = Image2DInfo(out_tensors_[0]);
local_size_ = {};
global_size_ = {outShape.width, outShape.height};
std::string source = activation_source;
std::set<std::string> build_options;
std::string program_name = "Activation";
ocl_runtime_->LoadSource(program_name, source);
std::string kernel_name = kernel_names[type_];
std::string kernel_name = GetActTypeString(type_);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
SetArgs();
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " init Done!";
return mindspore::lite::RET_OK;
}
int ActivationOpenClKernel::SetArgs() {
void ActivationOpenCLKernel::SetConstArgs() {
int arg_idx = 2;
cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size);
@@ -78,50 +89,26 @@ int ActivationOpenClKernel::SetArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4);
}
return RET_OK;
}
int ActivationOpenClKernel::Run() {
MS_LOG(DEBUG) << this->name() << " begin running!";
void ActivationOpenCLKernel::SetGlobalLocal() {
local_range_ = cl::NullRange;
global_range_ = {outShape.width, outShape.height};
}
int ActivationOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
if (ret != mindspore::lite::RET_OK) {
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
return mindspore::lite::RET_ERROR;
return RET_ERROR;
}
return mindspore::lite::RET_OK;
return RET_OK;
}
kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
if (inputs.empty()) {
MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
return nullptr;
}
if (inputs[0]->shape().size() > 2 && inputs[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Activation kernel:" << opParameter->name_ << " failed: Unsupported multi-batch.";
free(opParameter);
return nullptr;
}
auto *kernel =
new (std::nothrow) ActivationOpenClKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "New kernel:" << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Init activation kernel:" << opParameter->name_ << " failed!";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenClActivationKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenClActivationKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>)
} // namespace mindspore::kernel
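The hand-rolled `OpenClActivationKernelCreator` above is replaced by the `OpenCLKernelCreator<T>` template used in the new REG_KERNEL lines. Its body is not part of this diff; a sketch under the assumption that it factors out exactly the boilerplate deleted here:

template <typename T>
kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                        const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                        const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) T(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "New kernel: " << opParameter->name_ << " is nullptr.";
    free(opParameter);
    return nullptr;
  }
  // Assumed call order: validate specs first, then the (soon to be removed) Init.
  if (kernel->CheckSpecs() != RET_OK || kernel->Init() != RET_OK) {
    delete kernel;
    return nullptr;
  }
  return kernel;
}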

@@ -18,26 +18,30 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ACTIVATION_H_
#include <vector>
#include <string>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/fp32/activation.h"
namespace mindspore::kernel {
class ActivationOpenClKernel : public OpenCLKernel {
class ActivationOpenCLKernel : public OpenCLKernel {
public:
ActivationOpenClKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
ActivationOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
: OpenCLKernel(parameter, inputs, outputs),
type_(reinterpret_cast<ActivationParameter *>(parameter)->type_),
alpha_(reinterpret_cast<ActivationParameter *>(parameter)->alpha_) {}
~ActivationOpenClKernel() override = default;
~ActivationOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
int SetArgs();
static std::string GetActTypeString(int act_type);
cl::Kernel kernel_;
int type_;
float alpha_;

@@ -18,6 +18,7 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_
#include <vector>
#include <string>
#include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"
@@ -30,24 +31,21 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~ArithmeticOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
int SetArgs();
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
std::vector<size_t> InitGlobalSize() const;
void Image2dGetWorkGroupSize();
cl::Kernel kernel_;
bool element_flag_{true};
float activation_min_{-FLT_MAX};
float activation_max_{FLT_MAX};
std::vector<std::vector<int>> inputs_nhwc_shapes_;
std::vector<void *> inputs_weight_ptrs_;
std::vector<size_t> local_size_;
std::vector<size_t> global_size_;
std::string kernel_name_;
};
} // namespace mindspore::kernel
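Across every kernel in this commit the header gains the same entry points, splitting the old Init()/Run() pair: CheckSpecs() validates shapes and parameters with no GPU work, Prepare() builds the program and uploads weights, SetConstArgs() binds everything that never changes (starting at argument index 2, since the reordered kernel signatures now put the input and output images first), SetGlobalLocal() fixes the NDRange once, and Run() only rebinds the two tensor arguments. A sketch of the implied driver sequence (the actual OpenCLKernel base plumbing is outside this diff):

// Assumed orchestration of the new lifecycle; method names are from the diff.
int SetUpAndRun(OpenCLKernel *k) {
  if (k->CheckSpecs() != RET_OK) return RET_ERROR;  // cheap validation first
  if (k->Prepare() != RET_OK) return RET_ERROR;     // build kernel, InitWeights,
                                                    // SetConstArgs, SetGlobalLocal
  return k->Run();                                  // binds args 0/1, enqueues NDRange
}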

@@ -31,13 +31,17 @@ using mindspore::schema::PrimitiveType_DeConv2D;
namespace mindspore::kernel {
int Conv2dTransposeOpenCLKernel::Init() {
int Conv2dTransposeOpenCLKernel::CheckSpecs() {
ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
if (param->pad_l_ != param->pad_r_ || param->kernel_h_ - param->stride_h_ != 2 * param->pad_l_ ||
param->pad_u_ != param->pad_d_ || param->kernel_w_ - param->stride_w_ != 2 * param->pad_u_) {
MS_LOG(ERROR) << "only support kernel - stride == 2 * pad";
return RET_ERROR;
}
return RET_OK;
}
int Conv2dTransposeOpenCLKernel::Prepare() {
std::string kernel_name = "conv2d_transpose_NHWC4";
enable_fp16_ = ocl_runtime_->GetFp16Enable();
#ifdef PROGRAM_WITH_IL
@@ -49,12 +53,56 @@ int Conv2dTransposeOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
PadWeight();
InitWeights();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return mindspore::lite::RET_OK;
}
void Conv2dTransposeOpenCLKernel::PadWeight() {
void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int co = out_tensors_[0]->shape()[3];
int co4 = UP_DIV(co, C4NUM);
int stride_h = param->stride_h_;
int stride_w = param->stride_w_;
int oh = out_tensors_[0]->shape()[1];
int ow = out_tensors_[0]->shape()[2];
local_size_ = {16, 1, 16};
global_size_ = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4};
AlignGlobalLocal(global_size_, local_size_);
}
void Conv2dTransposeOpenCLKernel::SetConstArgs() {
int arg_cnt = 2;
ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int ci = in_tensors_[0]->shape()[3];
int co = out_tensors_[0]->shape()[3];
int kh = param->kernel_h_;
int kw = param->kernel_w_;
int pad_h = param->pad_l_;
int pad_w = param->pad_u_;
int stride_h = param->stride_h_;
int stride_w = param->stride_w_;
int oh = out_tensors_[0]->shape()[1];
int ow = out_tensors_[0]->shape()[2];
int h = in_tensors_[0]->shape()[1];
int w = in_tensors_[0]->shape()[2];
cl_int2 kernel_size = {kh, kw};
cl_int2 stride = {stride_h, stride_w};
cl_int2 padding = {pad_h, pad_w};
cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
}
int Conv2dTransposeOpenCLKernel::InitWeights() {
ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int ci = in_tensors_[0]->shape()[3];
int co = out_tensors_[0]->shape()[3];
@@ -138,67 +186,18 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
}
}
allocator->UnmapBuffer(bias_);
return RET_OK;
}
int Conv2dTransposeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int ci = in_tensors_[0]->shape()[3];
int co = out_tensors_[0]->shape()[3];
int co4 = UP_DIV(co, C4NUM);
int kh = param->kernel_h_;
int kw = param->kernel_w_;
int pad_h = param->pad_l_;
int pad_w = param->pad_u_;
int stride_h = param->stride_h_;
int stride_w = param->stride_w_;
int oh = out_tensors_[0]->shape()[1];
int ow = out_tensors_[0]->shape()[2];
int h = in_tensors_[0]->shape()[1];
int w = in_tensors_[0]->shape()[2];
// local size should be less than MAX_GROUP_SIZE
std::vector<size_t> local = {16, 1, 16};
std::vector<size_t> global = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4};
cl_int2 kernel_size = {kh, kw};
cl_int2 stride = {stride_h, stride_w};
cl_int2 padding = {pad_h, pad_w};
cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
int arg_cnt = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return mindspore::lite::RET_OK;
}
kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel =
new (std::nothrow) Conv2dTransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>)
} // namespace mindspore::kernel
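SetGlobalLocal() now runs once at Prepare() time instead of on every Run(). AlignGlobalLocal is presumably what rounds each global dimension up to a multiple of the local size so the work-group tiling divides evenly; a sketch of that assumed behaviour (helper name hypothetical):

// Assumed semantics of AlignGlobalLocal: pad global up to local multiples,
// e.g. global {30, 14, 7} with local {16, 1, 16} -> {32, 14, 16}.
std::vector<size_t> AlignUp(const std::vector<size_t> &global, const std::vector<size_t> &local) {
  std::vector<size_t> aligned(global.size());
  for (size_t i = 0; i < global.size(); ++i) {
    size_t l = i < local.size() ? local[i] : 1;
    aligned[i] = (global[i] + l - 1) / l * l;  // UP_ROUND(global[i], l)
  }
  return aligned;
}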

@@ -32,12 +32,14 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~Conv2dTransposeOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
void PadWeight();
cl::Kernel kernel_;
void *padWeight_{nullptr};
void *bias_{nullptr};

@@ -34,7 +34,11 @@ using mindspore::schema::PrimitiveType_FullConnection;
namespace mindspore::kernel {
int FullConnectionOpenCLKernel::Init() {
std::string kernel_name = "FullConnection_NHWC4";
// to be deleted soon
return CheckSpecs();
}
int FullConnectionOpenCLKernel::CheckSpecs() {
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
transposeA = param->a_transpose_;
if (transposeA) {
@@ -48,9 +52,6 @@ int FullConnectionOpenCLKernel::Init() {
MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4";
return RET_ERROR;
}
// call default move constructor (element-wise move)
inShape = Image2DInfo(in_tensors_[0]);
outShape = Image2DInfo(out_tensors_[0]);
switch (param->act_type_) {
case ActType_No:
break;
@@ -65,6 +66,13 @@ int FullConnectionOpenCLKernel::Init() {
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR;
}
return RET_OK;
}
int FullConnectionOpenCLKernel::Prepare() {
std::string kernel_name = "FullConnection_NHWC4";
inShape = Image2DInfo(in_tensors_[0]);
outShape = Image2DInfo(out_tensors_[0]);
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
@@ -74,13 +82,14 @@ int FullConnectionOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
PadWeight();
InitWeights();
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void FullConnectionOpenCLKernel::PadWeight() {
int FullConnectionOpenCLKernel::InitWeights() {
auto allocator = ocl_runtime_->GetAllocator();
int ci = inShape.C;
int ci4 = UP_DIV(ci, C4NUM);
@@ -167,48 +176,37 @@ void FullConnectionOpenCLKernel::PadWeight() {
}
}
allocator->UnmapBuffer(bias_);
return RET_OK;
}
int FullConnectionOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void FullConnectionOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> local = {32, 4, 1};
std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N};
int arg_count = 0;
AlignGlobalLocal(global, local);
}
void FullConnectionOpenCLKernel::SetConstArgs() {
int arg_count = 2;
cl_int4 in_shape = {static_cast<int>(inShape.N), static_cast<int>(inShape.H), static_cast<int>(inShape.W),
static_cast<int>(inShape.C)};
cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)};
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel =
new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
int FullConnectionOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_count = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>)
} // namespace mindspore::kernel

@@ -31,12 +31,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~FullConnectionOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Init() override;
private:
void PadWeight();
cl::Kernel kernel_;
void *padWeight_{nullptr};
void *bias_{nullptr};

@@ -30,8 +30,7 @@ using mindspore::schema::PrimitiveType_MatMul;
namespace mindspore::kernel {
int MatMulOpenCLKernel::Init() {
std::string kernel_name = "MatMul_NHWC4";
int MatMulOpenCLKernel::CheckSpecs() {
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
transposeA = param->a_transpose_;
if (transposeA) {
@@ -45,6 +44,11 @@ int MatMulOpenCLKernel::Init() {
MS_LOG(ERROR) << "matmul only support input shape size=2 or 4.";
return mindspore::lite::RET_ERROR;
}
return RET_OK;
}
int MatMulOpenCLKernel::Prepare() {
std::string kernel_name = "MatMul_NHWC4";
dims = in_tensors_[0]->shape().size();
for (int i = 0; i < dims; i++) {
inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i];
@@ -61,13 +65,14 @@ int MatMulOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
PadWeight();
InitWeights();
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return mindspore::lite::RET_OK;
}
void MatMulOpenCLKernel::PadWeight() {
int MatMulOpenCLKernel::InitWeights() {
// ABMCI @ ABCICO = ABMCO
auto allocator = ocl_runtime_->GetAllocator();
int ci = inShape[3];
@@ -128,45 +133,36 @@ void MatMulOpenCLKernel::PadWeight() {
}
}
allocator->UnmapBuffer(padWeight_);
return RET_OK;
}
int MatMulOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void MatMulOpenCLKernel::SetGlobalLocal() {
// local size should be less than MAX_GROUP_SIZE
std::vector<size_t> local = {32, 4, 1};
std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
static_cast<size_t>(outShape[2])};
int arg_count = 0;
AlignGlobalLocal(global, local);
}
void MatMulOpenCLKernel::SetConstArgs() {
int arg_count = 2;
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return mindspore::lite::RET_OK;
}
kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
int MatMulOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_count = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return mindspore::lite::RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
} // namespace mindspore::kernel

@@ -31,12 +31,14 @@ class MatMulOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~MatMulOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
void PadWeight();
cl::Kernel kernel_;
void *padWeight_{nullptr};
bool enable_fp16_{false};

@@ -21,8 +21,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/pooling2d.cl.inc"
#endif
using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -36,27 +35,25 @@ using mindspore::schema::PrimitiveType_Pooling;
namespace mindspore {
namespace kernel {
int PoolingOpenCLKernel::Init() {
int PoolingOpenCLKernel::CheckSpecs() {
if (parameter_->pool_mode_ != PoolMode_MaxPool && parameter_->pool_mode_ != PoolMode_AvgPool) {
MS_LOG(ERROR) << "Init `Pooling2d` kernel failed, unsupported pool mode!";
return RET_ERROR;
}
if (parameter_->act_type_ != ActType_No && parameter_->act_type_ != ActType_Relu) {
MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
return RET_ERROR;
}
return RET_OK;
}
int PoolingOpenCLKernel::Prepare() {
std::string kernel_name;
#ifndef PROGRAM_WITH_IL
std::string source;
std::string program_name;
#endif
if (parameter_->pool_mode_ == PoolMode_MaxPool) {
kernel_name = "MaxPooling2d";
#ifndef PROGRAM_WITH_IL
source = max_pool2d_source;
program_name = "MaxPooling2d";
#endif
} else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
kernel_name = "AvgPooling2d";
#ifndef PROGRAM_WITH_IL
source = avg_pool2d_source;
program_name = "AvgPooling2d";
#endif
} else {
MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!";
return RET_INVALID_OP_NAME;
}
switch (parameter_->act_type_) {
case ActType_No:
@@ -66,42 +63,35 @@ int PoolingOpenCLKernel::Init() {
break;
default:
MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
return RET_ERROR;
break;
}
enable_fp16_ = ocl_runtime_->GetFp16Enable();
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
kernel_name += "_NHWC4";
if (out_mem_type_ == MemType::BUF) {
MS_LOG(ERROR) << "buffer output not support yet.";
return mindspore::lite::RET_ERROR;
} else {
kernel_name += "_IMG";
}
kernel_name += "_IMG";
std::set<std::string> build_options;
std::string source = pooling2d_source;
std::string program_name = "Pooling2d";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
InitGlobalSize();
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return mindspore::lite::RET_OK;
}
void PoolingOpenCLKernel::InitGlobalSize() {
void PoolingOpenCLKernel::SetGlobalLocal() {
const size_t global_x = out_tensors_[0]->shape()[1];
const size_t global_y = out_tensors_[0]->shape()[2];
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
global_size_ = {global_z, global_y, global_x};
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
global_size_ = GetCommonGlobalSize(local_size_, global_size_);
global_range_ = {global_z, global_y, global_x};
local_range_ = {};
}
int PoolingOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void PoolingOpenCLKernel::SetConstArgs() {
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3],
@@ -109,40 +99,24 @@ int PoolingOpenCLKernel::Run() {
cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};
cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_};
cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
return mindspore::lite::RET_OK;
}
kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (RET_OK != ret) {
MS_LOG(ERROR) << "Init OpenCL Pooling kernel failed!";
delete kernel;
return nullptr;
}
return kernel;
int PoolingOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return mindspore::lite::RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLKernelCreator<PoolingOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLKernelCreator<PoolingOpenCLKernel>)
} // namespace kernel
} // namespace mindspore
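With the two .cl files merged into pooling2d.cl, Prepare() selects among the four kernels in one "Pooling2d" program by composing the kernel name. A sketch of the assumed composition (the "_ReLU" append happens in the elided activation switch; helper hypothetical):

// Assumed name assembly: "MaxPooling2d" / "AvgPooling2d" [+ "_ReLU"] + "_NHWC4_IMG".
std::string BuildPoolingKernelName(bool max_pool, bool relu) {
  std::string name = max_pool ? "MaxPooling2d" : "AvgPooling2d";
  if (relu) name += "_ReLU";
  return name + "_NHWC4_IMG";
}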

@@ -31,14 +31,15 @@ class PoolingOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs), parameter_(reinterpret_cast<PoolingParameter *>(parameter)) {}
~PoolingOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
void InitGlobalSize();
PoolingParameter *parameter_;
cl::Kernel kernel_;
bool enable_fp16_{false};
std::vector<size_t> local_size_;
std::vector<size_t> global_size_;
};

@@ -40,14 +40,19 @@ using mindspore::schema::ReduceMode_ReduceSumSquare;
namespace mindspore::kernel {
int ReduceOpenCLKernel::Init() {
InitNHWCShape();
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (reduce_param == nullptr) {
return RET_NULL_PTR;
std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
static const std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"},
{ReduceMode_ReduceSum, "sum"}};
auto result_iter = reduce_type2str.find(type);
if (result_iter != reduce_type2str.end()) {
return result_iter->second;
}
std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}};
if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) {
return "";
}
int ReduceOpenCLKernel::CheckSpecs() {
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (GetReduceTypeStr(reduce_param->mode_).empty()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
@@ -67,7 +72,17 @@ int ReduceOpenCLKernel::Init() {
MS_LOG(ERROR) << "reduce axis (2,3) should keep dims";
return RET_PARAM_INVALID;
}
std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
return RET_OK;
}
int ReduceOpenCLKernel::Prepare() {
outShape = Image2DInfo(out_tensors_[0]);
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (reduce_param == nullptr) {
return RET_NULL_PTR;
}
std::string kernel_name = GetReduceTypeStr(reduce_param->mode_);
if (wc_reduce_) {
kernel_name += "_WC";
}
@@ -77,7 +92,6 @@ int ReduceOpenCLKernel::Init() {
kernel_name += "_local";
}
kernel_name += "_NHWC4";
enable_fp16_ = ocl_runtime_->GetFp16Enable();
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -88,32 +102,26 @@ int ReduceOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return mindspore::lite::RET_OK;
}
void ReduceOpenCLKernel::InitNHWCShape() {
std::vector<int> shapex = out_tensors_[0]->shape();
size_t n = 1, h = 1, w = 1, c = 1;
if (shapex.size() == 2) {
n = shapex[0];
c = shapex[1];
} else if (shapex.size() == 4) {
n = shapex[0];
h = shapex[1];
w = shapex[2];
c = shapex[3];
}
nhwc_shape_ = {n, h, w, c};
}
int ReduceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void ReduceOpenCLKernel::SetConstArgs() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
cl_int4 size = {h, w, c4, c};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
}
void ReduceOpenCLKernel::SetGlobalLocal() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
std::vector<size_t> local = {};
if (use_local_) {
local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
@@ -122,35 +130,20 @@ int ReduceOpenCLKernel::Run() {
if (wc_reduce_) {
global = {static_cast<size_t>(h), 1, 1};
}
cl_int4 size = {h, w, c4, c};
AlignGlobalLocal(global, local);
}
int ReduceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return mindspore::lite::RET_OK;
}
kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLKernelCreator<ReduceOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLKernelCreator<ReduceOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLKernelCreator<ReduceOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLKernelCreator<ReduceOpenCLKernel>)
} // namespace mindspore::kernel

@@ -18,7 +18,7 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
#include <vector>
#include <string>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/reduce_parameter.h"
@@ -31,14 +31,16 @@ class ReduceOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~ReduceOpenCLKernel() override = default;
int Init() override;
int Run() override;
void InitNHWCShape();
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
static std::string GetReduceTypeStr(int type);
cl::Kernel kernel_;
bool enable_fp16_{false};
std::vector<size_t> nhwc_shape_;
Image2DInfo outShape = Image2DInfo(nullptr);
bool use_local_{false};
bool wc_reduce_{false};
static const size_t LOCAL_CACHE_THREAD{16};

@@ -32,7 +32,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;

@@ -32,27 +32,32 @@ using mindspore::schema::PrimitiveType_Resize;
namespace mindspore::kernel {
int ResizeOpenCLKernel::Init() {
auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_);
if (resize_param == nullptr) {
return RET_NULL_PTR;
}
alignCorner = resize_param->align_corners_;
preserveAspectRatio = resize_param->preserve_aspect_ratio_;
int ResizeOpenCLKernel::CheckSpecs() {
auto in_shape = in_tensors_[0]->shape();
auto out_shape = out_tensors_[0]->shape();
if (in_shape.size() != 4 || out_shape.size() != 4 || in_shape[0] != out_shape[0] || in_shape[3] != out_shape[3]) {
MS_LOG(ERROR) << "resize op only support 4D and axes HW";
return RET_PARAM_INVALID;
}
auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_);
if (resize_param->method_ != schema::ResizeMethod_LINEAR && resize_param->method_ != schema::ResizeMethod_NEAREST) {
MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_;
return RET_PARAM_INVALID;
}
return RET_OK;
}
int ResizeOpenCLKernel::Prepare() {
auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_);
alignCorner = resize_param->align_corners_;
preserveAspectRatio = resize_param->preserve_aspect_ratio_;
auto in_shape = in_tensors_[0]->shape();
auto out_shape = out_tensors_[0]->shape();
std::string kernel_name = "resize";
if (resize_param->method_ == schema::ResizeMethod_LINEAR) {
kernel_name += "_bilinear";
} else if (resize_param->method_ == schema::ResizeMethod_NEAREST) {
kernel_name += "_nearest_neighbor";
} else {
MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_;
return RET_PARAM_INVALID;
}
kernel_name += "_NHWC4";
#ifdef PROGRAM_WITH_IL
@@ -64,6 +69,8 @@ int ResizeOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@@ -74,8 +81,7 @@ float ResizeOpenCLKernel::getResizeScaleFactor(int input_size, int output_size)
: static_cast<float>(input_size) / static_cast<float>(output_size);
}
int ResizeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void ResizeOpenCLKernel::SetConstArgs() {
auto in_shape = in_tensors_[0]->shape();
auto out_shape = out_tensors_[0]->shape();
int n = out_shape[0];
@@ -85,39 +91,30 @@ int ResizeOpenCLKernel::Run() {
int c4 = UP_DIV(c, C4NUM);
float scale_h = getResizeScaleFactor(in_tensors_[0]->shape()[1], out_tensors_[0]->shape()[1]);
float scale_w = getResizeScaleFactor(in_tensors_[0]->shape()[2], out_tensors_[0]->shape()[2]);
std::vector<size_t> local = {};
std::vector<size_t> global = {static_cast<size_t>(c4), static_cast<size_t>(w), static_cast<size_t>(h)};
cl_int4 in_size = {in_shape[0], in_shape[1], in_shape[2], UP_DIV(in_shape[3], C4NUM)};
cl_int4 out_size = {n, h, w, c4};
cl_float2 scale = {scale_h, scale_w};
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLResizeKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ResizeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
void ResizeOpenCLKernel::SetGlobalLocal() {
local_range_ = {};
auto out_shape = Image2DInfo(out_tensors_[0]);
global_range_ = {out_shape.Slice, out_shape.W, out_shape.H};
}
int ResizeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLResizeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLResizeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
} // namespace mindspore::kernel
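Only the non-align-corners arm of getResizeScaleFactor is visible above; the usual convention for the other arm maps corner samples exactly onto each other. A sketch of the assumed full rule (helper hypothetical):

// Assumed full form: the align-corners branch uses (in-1)/(out-1) so corner
// pixels coincide; otherwise the scale is the plain size ratio shown in the diff.
float ResizeScale(int input_size, int output_size, bool align_corners) {
  return (align_corners && output_size > 1)
           ? static_cast<float>(input_size - 1) / static_cast<float>(output_size - 1)
           : static_cast<float>(input_size) / static_cast<float>(output_size);
}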

@@ -31,8 +31,11 @@ class ResizeOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~ResizeOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
float getResizeScaleFactor(int input_size, int output_size);

@@ -42,51 +42,17 @@ std::vector<float> SoftmaxOpenCLKernel::GetMaskForLastChannel(int channels) {
return mask;
}
int SoftmaxOpenCLKernel::InitGlobalSize() {
size_t global_x, global_y;
const size_t global_z = 1;
if (axis_ == 1) {
global_x = UP_DIV(nhwc_shape_[3], C4NUM);
global_y = nhwc_shape_[2];
} else if (axis_ == 2) {
global_x = UP_DIV(nhwc_shape_[3], C4NUM);
global_y = nhwc_shape_[1];
} else if (axis_ == 3) {
global_x = nhwc_shape_[2];
global_y = nhwc_shape_[1];
} else {
global_x = 1;
global_y = 1;
}
global_size_ = {global_x, global_y, global_z};
return lite::RET_OK;
}
int SoftmaxOpenCLKernel::SetWorkGroupSize() {
// set work group size
InitGlobalSize();
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
global_size_ = GetCommonGlobalSize(local_size_, global_size_);
return lite::RET_OK;
}
int SoftmaxOpenCLKernel::SetWorkGroupSize1x1() {
local_size_ = {32, 1, 1};
global_size_ = {32, 1, 1};
return lite::RET_OK;
}
int SoftmaxOpenCLKernel::Init() {
std::string kernel_name = "SoftMax";
std::string program_name = "SoftMax";
auto softmax_param = reinterpret_cast<SoftmaxParameter *>(op_parameter_);
axis_ = softmax_param->axis_;
int SoftmaxOpenCLKernel::CheckSpecs() {
axis_ = parameter_->axis_;
auto in_shape = in_tensors_[0]->shape();
if (in_shape.size() > 4) {
MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_shape.size();
return RET_ERROR;
}
if (in_shape[0] > 1) {
MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch.";
return RET_ERROR;
}
if (axis_ < 0) {
axis_ = in_shape.size() + axis_;
}
@@ -95,11 +61,15 @@ int SoftmaxOpenCLKernel::Init() {
MS_LOG(ERROR) << "Init `Softmax` kernel failed: softmax axis should be H W or C";
return RET_ERROR;
}
nhwc_shape_ = GetNHWCShape(in_shape);
return RET_OK;
}
int SoftmaxOpenCLKernel::Prepare() {
std::string kernel_name = "SoftMax";
out_shape = Image2DInfo(out_tensors_[0]);
std::string source = softmax_source;
enable_fp16_ = ocl_runtime_->GetFp16Enable();
// framework has not set this param yet, just use the default.
if (nhwc_shape_[1] == 1 && nhwc_shape_[2] == 1 && axis_ == 3) {
if (out_shape.H == 1 && out_shape.W == 1 && axis_ == 3) {
// support 4d tensor
onexone_flag_ = true;
kernel_name += "1x1";
@@ -112,62 +82,63 @@ int SoftmaxOpenCLKernel::Init() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string program_name = "SoftMax";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
SetConstArgs();
SetGlobalLocal();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return lite::RET_OK;
}
int SoftmaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void SoftmaxOpenCLKernel::SetGlobalLocal() {
if (onexone_flag_) {
local_size_ = {32};
global_size_ = {32};
} else {
size_t global_x, global_y;
if (axis_ == 1) {
global_x = out_shape.Slice;
global_y = out_shape.W;
} else if (axis_ == 2) {
global_x = out_shape.Slice;
global_y = out_shape.H;
} else if (axis_ == 3) {
global_x = out_shape.W;
global_y = out_shape.H;
} else {
global_x = 1;
global_y = 1;
}
global_size_ = {global_x, global_y};
local_size_ = {};
}
AlignGlobalLocal(global_size_, local_size_);
}
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
int channel = nhwc_shape_[3];
int c4 = UP_DIV(channel, C4NUM);
void SoftmaxOpenCLKernel::SetConstArgs() {
int arg_idx = 2;
int channel = out_shape.C;
int c4 = out_shape.Slice;
auto mask_ = GetMaskForLastChannel(channel);
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask);
cl_int4 input_shape = {nhwc_shape_[0], nhwc_shape_[1], nhwc_shape_[2], c4};
cl_int4 input_shape = {static_cast<int>(out_shape.N), static_cast<int>(out_shape.H), static_cast<int>(out_shape.W),
c4};
ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
if (onexone_flag_) {
SetWorkGroupSize1x1();
} else {
SetWorkGroupSize();
}
}
int SoftmaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
// run opencl kernel
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return lite::RET_OK;
}
kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
delete kernel;
return nullptr;
}
if (inputs[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch.";
delete kernel;
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Init `Softmax` kernel failed!";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLKernelCreator<SoftmaxOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLKernelCreator<SoftmaxOpenCLKernel>)
} // namespace mindspore::kernel
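GetMaskForLastChannel (whose tail appears at the top of this file's diff) produces the cl_float4 `mask` bound in SetConstArgs: it keeps the real channels of the final FLT4 slice and zeroes the C4 padding so the padded lanes drop out of the softmax sum. A sketch of the assumed body (name suffixed to mark it hypothetical):

// Assumed body: 1.0f for real channels in the final FLT4 slice, 0.0f for padding.
std::vector<float> GetMaskForLastChannelSketch(int channels) {
  std::vector<float> mask(4, 0.0f);
  int rem = channels % 4;
  if (rem == 0) rem = 4;  // a fully occupied last slice keeps all four lanes
  for (int i = 0; i < rem; ++i) mask[i] = 1.0f;
  return mask;
}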

@@ -33,8 +33,11 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
}
~SoftmaxOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
int InitGlobalSize();
@@ -47,9 +50,8 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
bool onexone_flag_{false};
std::vector<size_t> local_size_;
std::vector<size_t> global_size_;
bool enable_fp16_{false};
int axis_{0};
std::vector<int> nhwc_shape_;
Image2DInfo out_shape = Image2DInfo(nullptr);
};
} // namespace mindspore::kernel
