From 0c9e9e5d82dcfb3d4346bedc3f93404cc0680f2f Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Tue, 13 Oct 2020 09:31:02 +0800 Subject: [PATCH] arithmetic support 2tensor --- .../runtime/kernel/opencl/cl/arithmetic.cl | 351 ++++++++++++------ .../kernel/opencl/kernel/arithmetic.cc | 228 ++++++------ .../runtime/kernel/opencl/kernel/arithmetic.h | 5 +- .../src/runtime/kernel/opencl/kernel/scale.cc | 58 ++- .../src/runtime/kernel/opencl/kernel/scale.h | 1 + .../runtime/kernel/opencl/kernel/transpose.cc | 6 + .../lite/src/runtime/kernel/opencl/utils.cc | 37 ++ .../lite/src/runtime/kernel/opencl/utils.h | 4 + mindspore/lite/test/run_test.sh | 5 +- 9 files changed, 421 insertions(+), 274 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index 13247d95c2..8f4cbccd41 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -3,7 +3,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -12,24 +12,13 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a + b); -} - -__kernel void ElementAddReLU_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - 
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a + b, (FLT4)(0.f))); + FLT4 result = a + b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -38,11 +27,13 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a - b); + FLT4 result = a - b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -51,11 +42,13 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a * b); + FLT4 result = a * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = 
get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -64,11 +57,13 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b)); + FLT4 result = divide_no_check(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -77,11 +72,13 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & AS_UINT4(b))); + FLT4 result = AS_FLT4(AS_UINT4(a) & AS_UINT4(b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -90,11 +87,13 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | AS_UINT4(b))); + FLT4 result = AS_FLT4(AS_UINT4(a) | AS_UINT4(b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + 
WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -103,11 +102,13 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a, b)); + FLT4 result = max(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -116,11 +117,14 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), min(a, b)); + FLT4 result = min(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -129,11 +133,14 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, 
smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b))); + FLT4 result = floor(divide_no_check(a, b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -142,11 +149,14 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b); + FLT4 result = floor(divide_no_check(a, b)) * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -155,11 +165,13 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2)); + FLT4 result = pown((a - b), (int4)2); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 
output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -168,11 +180,15 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a == b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a == b ? (FLT4)1.f : (FLT4).0f; + // NOTE(review): clamping a boolean 0/1 comparison result with act_min/act_max looks suspicious -- confirm an activation should apply to Equal's output + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -181,11 +197,13 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a != b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a != b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -194,11 +212,14 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a < b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a < b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -207,11 +228,13 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a <= b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a <= b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -220,11 +243,14 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a > b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a > b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -233,58 +259,117 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a >= b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a >= b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); +} + +__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a + b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? 
H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a - b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? 
N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? 
N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a / b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); } -__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b); -} - -__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b); -} - -__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b); -} - -__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b)); -} __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = 
get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -292,11 +377,13 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b))); + FLT4 result = AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -304,11 +391,13 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b))); + FLT4 result = AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -316,11 +405,13 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b)); + FLT4 result = max(a, (FLT4)b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = 
get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -328,11 +419,13 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b)); + FLT4 result = min(a, (FLT4)b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -340,11 +433,13 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b))); + FLT4 result = floor(divide_no_check(a, (FLT4)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -352,11 +447,13 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b); + FLT4 result = floor(divide_no_check(a, (FLT4)b)) * (FLT)b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 
output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -364,11 +461,13 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2)); + FLT4 result = pown((a - (FLT4)b), (int4)2); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -376,11 +475,13 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a == (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a == (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -388,11 +489,13 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a != (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a != (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -400,11 +503,13 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a < (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a < (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -412,11 +517,13 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a <= (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -424,11 +531,13 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a > (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a > (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -436,7 +545,9 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a >= (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 40e9bc80a3..bc8e30bf49 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -32,13 +32,7 @@ using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { -ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() { - if (weight_ptr_ != nullptr) { - auto allocator = ocl_runtime_->GetAllocator(); - allocator->Free(weight_ptr_); - weight_ptr_ = nullptr; - } -} +ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {} std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { const size_t global_x = out_tensors_[0]->Width(); @@ -114,90 +108,83 @@ int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector *img_si } int ArithmeticOpenCLKernel::InitBuffer() { - const ArithmeticParameter *arithmetic_parameter = reinterpret_cast(op_parameter_); - if (!arithmetic_parameter->broadcasting_) { - if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->data_c() != nullptr) { + auto fp16_enable = ocl_runtime_->GetFp16Enable(); + auto data_size = fp16_enable ? 
sizeof(float16_t) : sizeof(float); + for (auto in_tensor_ : in_tensors_) { + auto nhwc_shape = GetNHWCShape(in_tensor_->shape()); + inputs_nhwc_shapes_.push_back(nhwc_shape); + if (in_tensor_->category() != lite::Tensor::Category::CONST || in_tensor_->data_c() == nullptr) { + inputs_weight_ptrs_.push_back(nullptr); + } else { auto allocator = ocl_runtime_->GetAllocator(); - std::vector img_size; - GetImageSize(0, &img_size); - int pack_weight_size = in_tensors_[1]->ElementsC4Num(); - int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width(); - int channel = in_tensors_[1]->Channel(); - int batch = in_tensors_[1]->Batch(); - - if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) { - if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) { - weight_ptr_ = - allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size); - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); + std::vector img_size = GetImage2dShapeFromNHWC(nhwc_shape, op_format_); + int pack_weight_size = img_size[0] * img_size[1] * C4NUM; + int plane = nhwc_shape[1] * nhwc_shape[2]; + int channel = nhwc_shape[3]; + int batch = nhwc_shape[0]; + img_size.push_back(fp16_enable ? 
CL_HALF_FLOAT : CL_FLOAT); + if (!fp16_enable) { + float *weight = new (std::nothrow) float[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { - if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { - float *weight = new (std::nothrow) float[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } + memset(weight, 0x00, pack_weight_size * data_size); + if (op_format_ == schema::Format_NHWC4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { std::function to_dtype = [](float x) -> float { return x; }; - PackNHWCToNC4HW4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } - std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; - PackNHWCToNC4HW4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); - return RET_ERROR; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + } else if (op_format_ == schema::Format_NC4HW4) 
{ + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float { return x; }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); } - } else { - MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " - << in_tensors_[0]->GetFormat(); + } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size); + inputs_weight_ptrs_.push_back(weight_ptr_); + delete[] weight; + } else { + float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { - if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { - float *weight = new (std::nothrow) float[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } - std::function to_dtype = [](float x) -> float { return x; }; - PackNHWCToNHWC4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } + memset(weight, 0x00, pack_weight_size * data_size); + if (op_format_ == schema::Format_NHWC4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { 
std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; - PackNHWCToNHWC4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); - return RET_ERROR; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float16_t { return x; }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + } else if (op_format_ == schema::Format_NC4HW4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float16_t { return x; }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); } - } else { - MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " - << in_tensors_[0]->GetFormat(); - return RET_ERROR; } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size); + inputs_weight_ptrs_.push_back(weight_ptr_); + delete[] weight; } } } @@ -211,7 +198,13 @@ int ArithmeticOpenCLKernel::Init() { if (arithmetic_parameter->broadcasting_) { element_flag_ = false; - kernel_name = "Broadcast"; + if (op_format_ == schema::Format_NHWC4) { + kernel_name = "BroadcastNHWC4"; + } else { + kernel_name = "BroadcastNC4HW4"; + MS_LOG(ERROR) << "Don't support BroadcastNC4HW4 yet"; + return 
RET_ERROR; + } } else { kernel_name = "Element"; } @@ -277,15 +270,14 @@ int ArithmeticOpenCLKernel::Init() { case schema::ActivationType_NO_ACTIVATION: break; case schema::ActivationType_RELU: - if (op_parameter_->type_ == PrimitiveType_Add && element_flag_) { - kernel_name += "ReLU"; - } else { - MS_LOG(ERROR) << "Only support ElementAdd + ReLU"; - return RET_ERROR; - } + activation_min_ = 0.f; + break; + case schema::ActivationType_RELU6: + activation_min_ = 0.f; + activation_max_ = 6.f; break; default: - MS_LOG(ERROR) << "Error activation type " << arithmetic_parameter->activation_type_; + MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_; return RET_ERROR; } @@ -328,47 +320,41 @@ int ArithmeticOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; int arg_idx = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - if (element_flag_) { - void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->data_c() : weight_ptr_; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight); + auto input_0_ptr = inputs_weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : inputs_weight_ptrs_[0]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr); + auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? 
in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + if (!element_flag_) { + cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], + UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); + cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], + UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); + auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); + cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); + ocl_runtime_->RunKernel(kernel_, + {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), + static_cast(out_shape[1] * out_shape[0])}, + {}, nullptr); } else { - float weight = 0.f; - if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { - weight = static_cast(in_tensors_[1]->data_c())[0]; - } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { - weight = static_cast(static_cast(in_tensors_[1]->data_c())[0]); - } else { - MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type(); - return RET_ERROR; - } - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight); + cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); + ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); } - 
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - - cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return RET_OK; } -kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const lite::PrimitiveC *primitive); - kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc, const mindspore::lite::PrimitiveC *primitive) { - const ArithmeticParameter *arithmetic_parameter = reinterpret_cast(opParameter); - if (arithmetic_parameter->broadcasting_) { - for (size_t i = 0; i < arithmetic_parameter->ndim_; i++) { - if (arithmetic_parameter->in_shape1_[i] != 0 && arithmetic_parameter->in_shape1_[i] != 1) { - return OpenCLBiasAddKernelCreator(inputs, outputs, opParameter, ctx, desc, primitive); - } - } - } auto *kernel = new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs, ctx); if (kernel == nullptr) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 650c67d88f..2bd469d914 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -42,7 +42,10 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; bool element_flag_{true}; - void *weight_ptr_{nullptr}; + float activation_min_{-FLT_MAX}; + float activation_max_{FLT_MAX}; + std::vector> inputs_nhwc_shapes_; + std::vector inputs_weight_ptrs_; std::vector local_size_; std::vector global_size_; diff --git 
a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index efe3e4fcf8..bfc9ad3d40 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -54,6 +54,12 @@ std::vector ScaleOpenCLKernel::InitGlobalSize() const { void ScaleOpenCLKernel::Image2dGetWorkGroupSize() { local_size_ = {16, 16}; + if (out_tensors_[0]->shape().size() == 2) { + size_t H = out_tensors_[0]->shape()[0]; + size_t W = UP_DIV(out_tensors_[0]->shape()[1], C4NUM); + global_size_ = {W, H}; + return; + } if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t W = out_tensors_[0]->Width(); @@ -78,18 +84,23 @@ void ScaleOpenCLKernel::BufferGetWorkGroupSize() { int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - im_dst_x = out_tensors_[0]->Width(); - im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { - im_dst_y = out_tensors_[0]->Batch(); - im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + if (out_tensors_[0]->shape().size() == 2) { + im_dst_x = UP_DIV(out_tensors_[0]->shape()[1], C4NUM); + im_dst_y = out_tensors_[0]->shape()[0]; } else { - MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); - return RET_ERROR; + if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + im_dst_x = out_tensors_[0]->Width(); + im_dst_y = out_tensors_[0]->Batch() * 
out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + im_dst_y = out_tensors_[0]->Batch(); + im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else { + MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); + return RET_ERROR; + } } size_t img_dtype = CL_FLOAT; @@ -114,7 +125,7 @@ int ScaleOpenCLKernel::InitBuffer() { auto allocator = ocl_runtime_->GetAllocator(); std::vector img_size; GetImageSize(0, &img_size); - if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) { + if (scale_C_flag_) { img_size[1] = 1; img_size[0] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM); scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size); @@ -256,8 +267,10 @@ int ScaleOpenCLKernel::Init() { if (scale_tensor->ElementsNum() == 1) { element_flag_ = false; kernel_name = "BoardcastScale"; - } else if (axis_ == 3 && scale_shape.size() == 1) { + } else if (((in_shape.size() == 4 && axis_ == 3) || (in_shape.size() == 2 && axis_ == 1)) && + scale_shape.size() == 1) { element_flag_ = true; + scale_C_flag_ = true; kernel_name = "Scale_C"; } } else { @@ -327,24 +340,9 @@ int ScaleOpenCLKernel::Run() { } } ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - int H = 0; - int W = 0; - if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - W = out_tensors_[0]->Width(); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - H = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); - W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else if 
(out_tensors_[0]->GetFormat() == schema::Format_NC4) { - H = out_tensors_[0]->Batch(); - W = UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else { - MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat(); - return RET_ERROR; - } - cl_int2 output_shape{W, H}; + cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - if (element_flag_ && axis_ == 3) { + if (element_flag_ && scale_C_flag_) { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); } ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h index 8359396dd6..8f8c92adbc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -42,6 +42,7 @@ class ScaleOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; bool element_flag_{true}; + bool scale_C_flag_{false}; void *scale_ptr_{nullptr}; void *offset_ptr_{nullptr}; int axis_{0}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 9904a60d16..f79f10c513 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -27,6 +27,8 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Nchw2Nhwc; +using mindspore::schema::PrimitiveType_Nhwc2Nchw; using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { @@ -141,4 +143,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vectorUnmapBuffer(origin_data); } + +std::vector GetNHWCShape(const std::vector &tensor_shape) { + int n, 
h, w, c; + n = h = w = c = 1; + if (tensor_shape.size() == 1) { + c = tensor_shape[0]; + } else if (tensor_shape.size() == 2) { + n = tensor_shape[0]; + c = tensor_shape[1]; + } else if (tensor_shape.size() == 3) { + n = tensor_shape[0]; + h = tensor_shape[1]; + c = tensor_shape[2]; + } else if (tensor_shape.size() == 4) { + n = tensor_shape[0]; + h = tensor_shape[1]; + w = tensor_shape[2]; + c = tensor_shape[3]; + } + return {n, h, w, c}; +} + +std::vector GetImage2dShapeFromNHWC(const std::vector &tensor_shape, schema::Format format) { + if (tensor_shape.size() != 4) { + return {1, 1}; + } + size_t image_x, image_y; + image_x = image_y = 1; + if (format == schema::Format_NHWC4) { + image_x = tensor_shape[2] * UP_DIV(tensor_shape[3], C4NUM); + image_y = tensor_shape[0] * tensor_shape[1]; + } else if (format == schema::Format_NC4HW4) { + image_x = tensor_shape[2]; + image_y = tensor_shape[0] * tensor_shape[1] * UP_DIV(tensor_shape[3], C4NUM); + } + return {image_x, image_y}; +} } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 92cddb32ed..03931ffb56 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -48,6 +48,10 @@ void Write2File(void *mem, const std::string &file_name, int size); void PrintTensor(lite::Tensor *tensor, int num = 10, const std::string &out_file = ""); +std::vector GetNHWCShape(const std::vector &tensor_shape); + +std::vector GetImage2dShapeFromNHWC(const std::vector &tensor_shape, schema::Format format); + template void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function &to_dtype) { int c4 = UP_DIV(channel, C4NUM); diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh index 44ade02040..fdf6c642a1 100755 --- a/mindspore/lite/test/run_test.sh +++ b/mindspore/lite/test/run_test.sh @@ -37,8 +37,7 @@ cp -fr 
$TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*" ./lite-test --gtest_filter="TestAvgPoolingOpenCL*" ./lite-test --gtest_filter="TestConv2dTransposeOpenCL*" -./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*" -./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*" +./lite-test --gtest_filter="TestMatMulOpenCL*" ./lite-test --gtest_filter="TestMaxPoolingOpenCL*" ./lite-test --gtest_filter="TestReduceOpenCL*" ./lite-test --gtest_filter="TestReshapeOpenCL*" @@ -46,3 +45,5 @@ cp -fr $TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestTransposeOpenCL*" ./lite-test --gtest_filter="TestArithmeticOpenCL*" ./lite-test --gtest_filter="TestScaleOpenCL*" +./lite-test --gtest_filter="TestFullConnectionOpenCL*" +./lite-test --gtest_filter="TestResizeOpenCL*"