From 605c2b0b16f0ef973e904471ef4644d9793adc6f Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Thu, 20 Aug 2020 14:30:20 +0800 Subject: [PATCH] change opencl code path --- build.sh | 30 +++--- .../kernel/opencl/cl/{fp32 => }/activation.cl | 19 ++-- .../runtime/kernel/opencl/cl/arithmetic.cl | 101 ++++++++++++++++++ .../kernel/opencl/cl/{fp32 => }/avg_pool2d.cl | 0 .../kernel/opencl/cl/{fp32 => }/batchnorm.cl | 1 - .../opencl/cl/{fp32 => }/caffe_prelu.cl | 7 +- .../kernel/opencl/cl/{fp32 => }/concat.cl | 1 - .../cl/{fp32 => }/conv2d_transpose2x2.cl | 5 - .../opencl/cl/{fp32 => }/convolution.cl | 0 .../opencl/cl/{fp32 => }/depthwise_conv2d.cl | 9 -- .../opencl/cl/fp16/conv2d_transpose2x2.cl | 61 ----------- .../kernel/opencl/cl/fp16/depthwise_conv2d.cl | 76 ------------- .../kernel/opencl/cl/fp16/transpose.cl | 45 -------- .../opencl/cl/fp32/arithmetic_buffer.cl | 34 ------ .../opencl/cl/fp32/arithmetic_image2d.cl | 66 ------------ .../runtime/kernel/opencl/cl/fp32/matmul.cl | 32 ------ .../runtime/kernel/opencl/cl/fp32/reshape.cl | 14 --- .../kernel/opencl/cl/{fp16 => }/matmul.cl | 4 - .../kernel/opencl/cl/{fp32 => }/max_pool2d.cl | 0 .../kernel/opencl/cl/{fp16 => }/reshape.cl | 4 - .../kernel/opencl/cl/{fp32 => }/softmax.cl | 0 .../kernel/opencl/cl/{fp32 => }/softmax1x1.cl | 10 +- .../kernel/opencl/cl/{fp32 => }/to_format.cl | 19 ---- .../kernel/opencl/cl/{fp32 => }/transpose.cl | 4 - .../kernel/opencl/kernel/activation.cc | 4 +- .../kernel/opencl/kernel/arithmetic.cc | 10 +- .../runtime/kernel/opencl/kernel/batchnorm.cc | 4 +- .../kernel/opencl/kernel/caffe_prelu.cc | 4 +- .../runtime/kernel/opencl/kernel/concat.cc | 6 +- .../kernel/opencl/kernel/conv2d_transpose.cc | 9 +- .../kernel/opencl/kernel/depthwise_conv2d.cc | 9 +- .../runtime/kernel/opencl/kernel/matmul.cc | 9 +- .../runtime/kernel/opencl/kernel/pooling2d.cc | 8 +- .../src/runtime/kernel/opencl/kernel/prelu.cc | 4 +- .../runtime/kernel/opencl/kernel/reshape.cc | 9 +- .../runtime/kernel/opencl/kernel/softmax.cc | 8 +- .../runtime/kernel/opencl/kernel/to_format.cc | 82 +++++++------- .../runtime/kernel/opencl/kernel/to_format.h | 2 + .../runtime/kernel/opencl/kernel/transpose.cc | 9 +- .../lite/src/runtime/opencl/opencl_runtime.cc | 8 +- 40 files changed, 216 insertions(+), 511 deletions(-) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/activation.cl (73%) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/avg_pool2d.cl (100%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/batchnorm.cl (98%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/caffe_prelu.cl (73%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/concat.cl (98%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/conv2d_transpose2x2.cl (95%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/convolution.cl (100%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/depthwise_conv2d.cl (98%) delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp16 => }/matmul.cl (91%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/max_pool2d.cl (100%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp16 => }/reshape.cl (82%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/softmax.cl (100%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/softmax1x1.cl (89%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/to_format.cl (95%) rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/transpose.cl (95%) diff --git a/build.sh b/build.sh index 094f0d1de1..911135906a 100755 --- a/build.sh +++ b/build.sh @@ -460,24 +460,20 @@ build_gtest() { gene_clhpp() { CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl" - for sub_dir in "${CL_SRC_DIR}"/* + if [ ! -d ${CL_SRC_DIR} ]; then + return + fi + cd ${CL_SRC_DIR}/ + rm -rf *.inc + echo "$(cd "$(dirname $0)"; pwd)" + for file_path in "${CL_SRC_DIR}"/* do - data_type="$(basename ${sub_dir})" - if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then - continue - fi - cd ${CL_SRC_DIR}/${data_type} - rm -rf *.inc - echo "$(cd "$(dirname $0)"; pwd)" - for file_path in "${CL_SRC_DIR}/${data_type}"/* - do - file="$(basename ${file_path})" - inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'` - sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file} - kernel_name=`echo ${file} | sed s'/.\{3\}$//'` - sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file} - sed -i '$a\;' ${inc_file} - done + file="$(basename ${file_path})" + inc_file=`echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/'` + sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file} + kernel_name=`echo ${file} | sed s'/.\{3\}$//'` + sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file} + sed -i '$a\;' ${inc_file} done } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl similarity index 73% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl index e9c1f2519f..20287b25f7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl @@ -2,10 +2,7 @@ #define SLICES 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define FLT4 float4 #define MIN(X, Y) (X < Y ? X : Y) -#define READ_FLT4 read_imagef -#define WRITE_FLT4 write_imagef __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, @@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out int Y = get_global_id(0); // height id int X = get_global_id(1); // weight id for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC FLT4 tmp; tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha; tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha; tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha; tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha; - WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC } } @@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c int Y = get_global_id(0); // height id int X = get_global_id(1); // weight id for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC FLT4 tmp; tmp.x = in_c4.x >= 0 ? in_c4.x : 0; tmp.y = in_c4.y >= 0 ? in_c4.y : 0; tmp.z = in_c4.z >= 0 ? in_c4.z : 0; tmp.w = in_c4.w >= 0 ? in_c4.w : 0; - WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC } } @@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, int Y = get_global_id(0); // height id int X = get_global_id(1); // weight id for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC FLT4 tmp; tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0; tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0; tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0; tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0; - WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC } } @@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output int Y = get_global_id(0); // height id int X = get_global_id(1); // weight id for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC FLT4 tmp; tmp.x = 1 / (1 + exp(-in_c4.x)); tmp.y = 1 / (1 + exp(-in_c4.y)); tmp.z = 1 / (1 + exp(-in_c4.z)); tmp.w = 1 / (1 + exp(-in_c4.w)); - WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl new file mode 100644 index 0000000000..497d789a64 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -0,0 +1,101 @@ +#define divide_no_check(a, b) (a / b) +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + +__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a + b); +} + +__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a - b); +} + +__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a * b); +} + +__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b)); +} + +__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias); +} + +__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] + input_b[idx]; +} + +__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] - input_b[idx]; +} + +__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b[idx]; +} + +__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b[idx]; +} + +__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = weight * input_a[idx] + bias; +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl similarity index 100% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl similarity index 98% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl index a68141329b..554674da6a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl @@ -1,4 +1,3 @@ -#define FLT4 float4 #define INT4 int4 #define INT2 int2 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl similarity index 73% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl index d162be0839..bfaec1dfe1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl @@ -2,9 +2,6 @@ #define SLICES 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define FLT4 float4 -#define READ_FLT4 read_imagef -#define WRITE_FLT4 write_imagef __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, @@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out int Y = get_global_id(0); // height id int X = get_global_id(1); // weight id for (int num = 0; num < UP_DIV(C, SLICES); ++num) { - FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC + FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC FLT4 tmp; int index = num * 4; tmp.x = in_c4.x * alpha[index]; tmp.y = in_c4.y * alpha[index + 1]; tmp.z = in_c4.z * alpha[index + 2]; tmp.w = in_c4.w * alpha[index + 3]; - WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC + WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl similarity index 98% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl index 6758707012..c16daf9c81 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl @@ -1,5 +1,4 @@ // #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define FLT4 float4 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl similarity index 95% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl index 7a22a10efe..2014bb589d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl @@ -1,8 +1,3 @@ -#define FLT float -#define FLT4 float4 -#define FLT16 float16 -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl similarity index 100% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl similarity index 98% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl index f7944e9a2b..a6e1c322e1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl @@ -1,12 +1,3 @@ -#ifdef ENABLE_FP16 -#define FLT half -#define FLT4 half4 -#define TO_FLT4 convert_half4 -#else -#define FLT float -#define FLT4 float4 -#define TO_FLT4 convert_float4 -#endif __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl deleted file mode 100644 index e166e699c5..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl +++ /dev/null @@ -1,61 +0,0 @@ -#define FLT half -#define FLT4 half4 -#define FLT16 half16 -#define READ_IMAGE read_imageh -#define WRITE_IMAGE write_imageh -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, - int4 src_size, int4 dst_size) { - int h = get_global_id(0); - int kh = h % 2; - int src_h = h / 2; - src_h = src_h * 2; - int w = get_global_id(1); - int kw = w % 2; - int src_w = w / 2; - src_w = src_w * 2; - int co = get_global_id(2); - if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return; - FLT4 r0 = (FLT4)(0.f); - FLT4 r1 = (FLT4)(0.f); - FLT4 r2 = (FLT4)(0.f); - FLT4 r3 = (FLT4)(0.f); - int base_w = (co * 4 + kh + kw * 2) * src_size.z; - for (int ci = 0; ci < src_size.z; ++ci) { - FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h)); - FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1)); - FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h)); - FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1)); - FLT16 weight_cache = weight[base_w++]; - r0 += x0.x * weight_cache.s0123; - r0 += x0.y * weight_cache.s4567; - r0 += x0.z * weight_cache.s89ab; - r0 += x0.w * weight_cache.scdef; - - r1 += x1.x * weight_cache.s0123; - r1 += x1.y * weight_cache.s4567; - r1 += x1.z * weight_cache.s89ab; - r1 += x1.w * weight_cache.scdef; - - r2 += x2.x * weight_cache.s0123; - r2 += x2.y * weight_cache.s4567; - r2 += x2.z * weight_cache.s89ab; - r2 += x2.w * weight_cache.scdef; - - r3 += x3.x * weight_cache.s0123; - r3 += x3.y * weight_cache.s4567; - r3 += x3.z * weight_cache.s89ab; - r3 += x3.w * weight_cache.scdef; - } - FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0)); - r0 += bias_val; - r1 += bias_val; - r2 += bias_val; - r3 += bias_val; - - WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0); - WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1); - WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2); - WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl deleted file mode 100644 index 2725ca9261..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl +++ /dev/null @@ -1,76 +0,0 @@ -#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define ACCUM_FLT4 half4 -#define FLT half -#define FLT2 half2 -#define FLT3 half3 -#define FLT4 half4 -#define TO_FLT4 convert_half4 -#define TO_ACCUM_TYPE convert_half4 -#define TO_ACCUM_FLT convert_half -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef -__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; -__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases, - float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, - int2 padding, int2 dilation, int4 src_size, int4 dst_size) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; - ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; - int fx_c = Z * kernel_size.x * kernel_size.y; - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; - bool outside_y = y_c < 0 || y_c >= src_size.y; - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; - bool outside_x = x_c < 0 || x_c >= src_size.x; - if (!outside_x && !outside_y) { - FLT4 f = filters[fx_c]; - FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; - r += TO_ACCUM_TYPE(src_final * f); - } - fx_c++; - } - } - FLT4 bias_val = biases[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0; -} - -__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases, - float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, - int2 padding, int2 dilation, int4 src_size, int4 dst_size) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; - ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - int x_offseted = X * stride.x + padding.x; - int y_offseted = Y * stride.y + padding.y; - int fx_c = Z * kernel_size.x * kernel_size.y; - for (int ky = 0; ky < kernel_size.y; ++ky) { - int y_c = y_offseted + ky * dilation.y; - bool outside_y = y_c < 0 || y_c >= src_size.y; - for (int kx = 0; kx < kernel_size.x; ++kx) { - int x_c = x_offseted + kx * dilation.x; - bool outside_x = x_c < 0 || x_c >= src_size.x; - if (!outside_x && !outside_y) { - FLT4 f = filters[fx_c]; - FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; - r += TO_ACCUM_TYPE(src_final * f); - } - fx_c++; - } - } - FLT4 bias_val = biases[Z]; - FLT4 res0 = TO_FLT4(r) + bias_val; - res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl deleted file mode 100644 index 73c4c076e7..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl +++ /dev/null @@ -1,45 +0,0 @@ -#define FLT half -#define FLT4 half4 -#define READ_IMAGE read_imageh -#define WRITE_IMAGE write_imageh -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= HW.y || Y >= C.y) { - return; - } - FLT4 result[4]; - result[0] = (FLT4)(0.0f); - result[1] = (FLT4)(0.0f); - result[2] = (FLT4)(0.0f); - result[3] = (FLT4)(0.0f); - FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X)); - FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1)); - FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2)); - FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3)); - result[0].x = x0.x; - result[0].y = x1.x; - result[0].z = x2.x; - result[0].w = x3.x; - - result[1].x = x0.y; - result[1].y = x1.y; - result[1].z = x2.y; - result[1].w = x3.y; - - result[2].x = x0.z; - result[2].y = x1.z; - result[2].z = x2.z; - result[2].w = x3.z; - - result[3].x = x0.w; - result[3].y = x1.w; - result[3].z = x2.w; - result[3].w = x3.w; - - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl deleted file mode 100644 index 8b3f2c7ffd..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl +++ /dev/null @@ -1,34 +0,0 @@ -__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] + input_b[idx]; -} - -__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - input_b[idx]; -} - -__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b[idx]; -} - -__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] / input_b[idx]; -} - -__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = weight * input_a[idx] + bias; -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl deleted file mode 100644 index 45b8f90b53..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl +++ /dev/null @@ -1,66 +0,0 @@ -#define divide_no_check(a, b) (a/b) -__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; - -__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); - float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - write_imagef(output, (int2)(X, Y), a + b); -} - -__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); - float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - write_imagef(output, (int2)(X, Y), a - b); -} - -__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); - float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - write_imagef(output, (int2)(X, Y), a * b); -} - -__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); - float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - write_imagef(output, (int2)(X, Y), divide_no_check(a, b)); -} - -__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); - write_imagef(output, (int2)(X, Y), weight * a + bias); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl deleted file mode 100644 index 1dcc884e0e..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl +++ /dev/null @@ -1,32 +0,0 @@ -#define FLT4 float4 -#define FLT16 float16 -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { - int2 gid = (int2)(get_global_id(0), get_global_id(1)); - int2 lid = (int2)(get_local_id(0), get_local_id(1)); - FLT4 result = (FLT4)(0.0f); - bool inside = gid.x < offset_co.y; - for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { - FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0)); - FLT16 w = weight[gid.x + i * offset_co.y]; - result.x += dot(v, w.s0123); - result.y += dot(v, w.s4567); - result.z += dot(v, w.s89ab); - result.w += dot(v, w.scdef); - } - __local FLT4 temp[64][4]; - temp[lid.x][lid.y] = result; - barrier(CLK_LOCAL_MEM_FENCE); - if (lid.y == 0 && inside) { - result += temp[lid.x][1]; - result += temp[lid.x][2]; - result += temp[lid.x][3]; - if (has_bias != 0) { - result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); - } - WRITE_IMAGE(output, (int2)(gid.x, 0), result); - } -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl deleted file mode 100644 index e752c4b3f5..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl +++ /dev/null @@ -1,14 +0,0 @@ -#define FLT float -#define FLT4 float4 -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef -__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl similarity index 91% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl index c121f824bd..480be10107 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl @@ -1,7 +1,3 @@ -#define FLT4 half4 -#define FLT16 half16 -#define READ_IMAGE read_imageh -#define WRITE_IMAGE write_imageh __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl similarity index 100% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl similarity index 82% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl index 867d1e3d18..bb9892b575 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl @@ -1,7 +1,3 @@ -#define FLT half -#define FLT4 half4 -#define READ_IMAGE read_imageh -#define WRITE_IMAGE write_imageh __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { int X = get_global_id(0); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl similarity index 100% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl similarity index 89% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl index 672d56c08c..68cdad1f70 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl @@ -1,5 +1,5 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; - +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; // what is mask and args.slices_x32 __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, const int slices, const int slices_x32) { @@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu int tid = get_local_id(0); float sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { - float4 src = read_imagef(input, smp_none, (int2)(i, 0)); + float4 src = read_imagef(input, smp_zero, (int2)(i, 0)); sum += dot((float4)(1.0f), exp(src)); } if ((slices - 1) % 32 == tid) { - float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0)); + float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0)); sum += dot(mask, exp(src)); } @@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; for (size_t i = tid; i < slices - 1; i += 32) { - float4 result = read_imagef(input, smp_none, (int2)(i, 0)); + float4 result = read_imagef(input, smp_zero, (int2)(i, 0)); result = exp(result) * sum; output[i] = result; } if ((slices - 1) % 32 == tid) { - float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0)); + float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0)); result = exp(result) * sum; __global float4 *remain_ptr4 = output; remain_ptr4 += slices - 1; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl similarity index 95% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl index 0811d7ce52..31b6e02b55 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl @@ -1,22 +1,3 @@ -#define FLT float -#define FLT4 float4 -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef -// enum Format { -// Format_NCHW = 0, -// Format_NHWC = 1, -// Format_NHWC4 = 2, -// Format_HWKC = 3, -// Format_HWCK = 4, -// Format_KCHW = 5, -// Format_CKHW = 6, -// Format_KHWC = 7, -// Format_CHWK = 8, -// Format_NC4HW4 = 100, -// Format_NUM_OF_FORMAT = 101, -// Format_MIN = Format_NCHW, -// Format_MAX = Format_NUM_OF_FORMAT -//}; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, int4 shape) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl similarity index 95% rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl rename to mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl index 9a71aa4c0d..05f903602e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl @@ -1,7 +1,3 @@ -#define FLT float -#define FLT4 float4 -#define READ_IMAGE read_imagef -#define WRITE_IMAGE write_imagef __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { int X = get_global_id(0); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index 94c86eb4c2..6124803173 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -24,7 +24,7 @@ #include "src/runtime/runtime_api.h" #include "include/errorcode.h" -#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc" +#include "src/runtime/kernel/opencl/cl/activation.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() { } std::string program_name = ""; std::string kernel_name = ""; - std::string source = activation_source_fp32; + std::string source = activation_source; if (type_ == ActivationType_RELU) { program_name = "RELU"; kernel_name = "Relu"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 84aad52780..d52b68ffc4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -22,8 +22,7 @@ #include "src/kernel_registry.h" #include "src/runtime/kernel/opencl/utils.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc" +#include "src/runtime/kernel/opencl/cl/arithmetic.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -109,9 +108,14 @@ int ArithmeticOpenCLKernel::Init() { error_code = RET_ERROR; } #else + if (out_mem_type_ == OpenCLMemType::IMG) { + kernel_name += "_IMG"; + } else { + kernel_name += "_BUF"; + } std::string program_name = "Arithmetic"; std::set build_options; - std::string source = arithmetic_image2d_source_fp32; + std::string source = arithmetic_source; runtime_->LoadSource(program_name, source); error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index 88c752ebba..b1b5cbb367 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -20,7 +20,7 @@ #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/batchnorm.h" -#include "src/runtime/kernel/opencl/cl/fp32/batchnorm.cl.inc" +#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -50,7 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector *img_siz } int BatchNormOpenCLKernel::Init() { std::set build_options; - std::string source = batchnorm_source_fp32; + std::string source = batchnorm_source; std::string program_name = "batch_normalization"; std::string kernel_name = "batch_normalization"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc index 1ffa18e5e7..9db594a30d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc @@ -24,7 +24,7 @@ #include "include/errorcode.h" #include "src/runtime/kernel/opencl/kernel/caffe_prelu.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl.inc" +#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc" #include "src/runtime/kernel/arm/nnacl/caffeprelu.h" using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -57,7 +57,7 @@ int CaffePReluOpenCLKernel::Init() { } CaffeWeight(); std::set build_options; - std::string source = caffe_prelu_source_fp32; + std::string source = caffe_prelu_source; std::string program_name = "CaffePRelu"; std::string kernel_name = "CaffePRelu"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index af5e54c19b..d2b4fadce7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -20,7 +20,7 @@ #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/concat.h" -#include "src/runtime/kernel/opencl/cl/fp32/concat.cl.inc" +#include "src/runtime/kernel/opencl/cl/concat.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() { } if (in_tensors_.size() == 2) { std::set build_options; - std::string source = concat_source_fp32; + std::string source = concat_source; std::string program_name = "Concat"; std::string kernel_name = "Concat"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); @@ -76,7 +76,7 @@ int ConcatOpenCLKernel::Init() { if (in_tensors_.size() == 3) { std::set build_options; - std::string source = concat_source_fp32; + std::string source = concat_source; std::string program_name = "Concat3input"; std::string kernel_name = "Concat3input"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index bb9276e671..027bcd664e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -20,8 +20,7 @@ #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl.inc" +#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -45,11 +44,7 @@ int Conv2dTransposeOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else -#ifdef ENABLE_FP16 - std::string source = conv2d_transpose2x2_source_fp16; -#else - std::string source = conv2d_transpose2x2_source_fp32; -#endif + std::string source = conv2d_transpose2x2_source; std::set build_options; std::string program_name = "conv2d_transpose2x2"; ocl_runtime->LoadSource(program_name, source); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 07ea195514..421a1dce00 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -26,8 +26,7 @@ #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl.inc" +#include "src/runtime/kernel/opencl/cl/depthwise_conv2d.cl.inc" #endif @@ -68,11 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() { #else std::string program_name = "DepthwiseConv2d"; std::set build_options; -#ifdef ENABLE_FP16 - std::string source = depthwise_conv2d_source_fp16; -#else - std::string source = depthwise_conv2d_source_fp32; -#endif + std::string source = depthwise_conv2d_source; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index 8b8dac11b7..b724e5f7e1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -21,8 +21,7 @@ #include "src/runtime/kernel/arm/nnacl/fp32/matmul.h" #include "src/runtime/kernel/opencl/kernel/matmul.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp16/matmul.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/matmul.cl.inc" +#include "src/runtime/kernel/opencl/cl/matmul.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -40,11 +39,7 @@ int MatMulOpenCLKernel::Init() { ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else std::set build_options; -#ifdef ENABLE_FP16 - std::string source = matmul_source_fp16; -#else - std::string source = matmul_source_fp32; -#endif + std::string source = matmul_source; std::string program_name = "MatMul"; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 7c1daf618a..276cd699bd 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -24,8 +24,8 @@ #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/image_format.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl.inc" +#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc" +#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -46,13 +46,13 @@ int PoolingOpenCLKernel::Init() { if (parameter_->max_pooling_) { kernel_name = "MaxPooling2d"; #ifndef PROGRAM_WITH_IL - source = max_pool2d_source_fp32; + source = max_pool2d_source; program_name = "MaxPooling2d"; #endif } else if (parameter_->avg_pooling_) { kernel_name = "AvgPooling2d"; #ifndef PROGRAM_WITH_IL - source = avg_pool2d_source_fp32; + source = avg_pool2d_source; program_name = "AvgPooling2d"; #endif } else { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc index ed5c0803bd..7ff7993035 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc @@ -23,7 +23,7 @@ #include "include/errorcode.h" #include "src/runtime/kernel/opencl/kernel/prelu.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc" +#include "src/runtime/kernel/opencl/cl/activation.cl.inc" #include "src/runtime/kernel/arm/nnacl/prelu_parameter.h" using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -40,7 +40,7 @@ int PReluOpenCLKernel::Init() { return RET_ERROR; } std::set build_options; - std::string source = activation_source_fp32; + std::string source = activation_source; std::string program_name = "PRelu"; std::string kernel_name = "ReluScalar"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index 064caffaf6..e58013e493 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -20,8 +20,7 @@ #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/reshape.h" -#include "src/runtime/kernel/opencl/cl/fp16/reshape.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/reshape.cl.inc" +#include "src/runtime/kernel/opencl/cl/reshape.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -39,11 +38,7 @@ int ReshapeOpenCLKernel::Init() { ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else std::set build_options; -#ifdef ENABLE_FP16 - std::string source = reshape_source_fp16; -#else - std::string source = reshape_source_fp32; -#endif + std::string source = reshape_source; std::string program_name = "reshape"; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 1fc02fc03f..9d55ba5d53 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -22,8 +22,8 @@ #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/utils.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp32/softmax.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl.inc" +#include "src/runtime/kernel/opencl/cl/softmax.cl.inc" +#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -88,7 +88,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) int SoftmaxOpenCLKernel::Init() { std::string kernel_name = "SoftMax"; std::string program_name = "SoftMax"; - std::string source = softmax_source_fp32; + std::string source = softmax_source; runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); // framework not set this param yet! just use default. if (parameter_->axis_ == -1) { @@ -101,7 +101,7 @@ int SoftmaxOpenCLKernel::Init() { // support 2d tensor kernel_name += "1x1"; program_name += "1x1"; - source = softmax1x1_source_fp32; + source = softmax1x1_source; onexone_flag_ = true; } else { MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 323bb5b296..92c2c03479 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -22,7 +22,7 @@ #include "include/errorcode.h" #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc" +#include "src/runtime/kernel/opencl/cl/to_format.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -37,10 +37,9 @@ int ToFormatOpenCLKernel::Init() { auto parameter = reinterpret_cast(op_parameter_); out_mem_type_ = parameter->out_mem_type; std::string program_name = "to_format"; - std::map format_str{{schema::Format_NCHW, "NCHW"}, - {schema::Format_NHWC, "NHWC"}, - {schema::Format_NC4HW4, "NC4HW4"}, - {schema::Format_NHWC4, "NHWC4"}}; + std::map format_str{{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"}, + {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NC4, "NHWC4"}, + {schema::Format_NC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"}}; std::string kernel_name = "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()]; if (out_mem_type_ == OpenCLMemType::IMG) { @@ -49,49 +48,54 @@ int ToFormatOpenCLKernel::Init() { kernel_name += "_BUF"; } + this->set_name(kernel_name); #ifdef PROGRAM_WITH_IL ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else std::set build_options; -#ifdef ENABLE_FP16 - std::string source = to_format_source_fp16; -#else - std::string source = to_format_source_fp32; -#endif + std::string source = to_format_source; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif + InitNHWCShape(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } -int ToFormatOpenCLKernel::ReSize() { return RET_OK; } - -int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { +int ToFormatOpenCLKernel::InitNHWCShape() { std::vector shapex = out_tensors_[0]->shape(); + size_t n, h, w, c; if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { - int h = shapex[1]; - int w = shapex[2]; - int c = shapex[3]; - int c4 = UP_DIV(c, C4NUM); - std::vector vec = {(size_t)h, (size_t)w, (size_t)c4}; - *global_size = std::move(vec); + n = shapex[0]; + h = shapex[1]; + w = shapex[2]; + c = shapex[3]; } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 || out_tensors_[0]->GetFormat() == schema::Format_NCHW) { - int h = shapex[2]; - int w = shapex[3]; - int c = shapex[1]; - int c4 = UP_DIV(c, C4NUM); - std::vector vec = {(size_t)c4, (size_t)h, (size_t)w}; - *global_size = std::move(vec); - } else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) { - int h = shapex[2]; - int w = shapex[3]; - int c = shapex[1]; - int w4 = UP_DIV(w, C4NUM); - std::vector vec = {(size_t)w4, (size_t)h, (size_t)c}; - *global_size = std::move(vec); + n = shapex[0]; + h = shapex[2]; + w = shapex[3]; + c = shapex[1]; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4 || out_tensors_[0]->GetFormat() == schema::Format_NC) { + n = shapex[0]; + h = 1; + w = 1; + c = shapex[1]; + } else { + n = shapex[0]; + h = shapex[1]; + w = shapex[2]; + c = shapex[3]; } + nhwc_shape_ = {n, h, w, c}; + return RET_OK; +} + +int ToFormatOpenCLKernel::ReSize() { return RET_OK; } + +int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { + std::vector vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; + *global_size = std::move(vec); return RET_OK; } int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, @@ -114,6 +118,12 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size int c = shapex[3]; im_dst_x = w * UP_DIV(c, C4NUM); im_dst_y = h; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + int h = 1; + int w = 1; + int c = shapex[1]; + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = h; } else { MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); } @@ -128,15 +138,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size return RET_OK; } int ToFormatOpenCLKernel::Run() { - MS_LOG(DEBUG) << "ToFormat" - << " Running!"; + MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); std::vector local = {}; std::vector global; GetGlobalSize(0, &global); - auto shapex = in_tensors_[0]->shape(); - cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1, - shapex.size() > 3 ? shapex[3] : 1}; + + cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]}; cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h index 09d31b7454..febf17cd4d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h @@ -37,9 +37,11 @@ class ToFormatOpenCLKernel : public OpenCLKernel { int GetImageSize(size_t idx, std::vector *img_size) override; int GetGlobalSize(size_t idx, std::vector *global_size) override; int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; + int InitNHWCShape(); private: cl::Kernel kernel_; + std::vector nhwc_shape_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index a42f39195a..c57dcf72ac 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -21,8 +21,7 @@ #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/opencl/kernel/transpose.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp16/transpose.cl.inc" -#include "src/runtime/kernel/opencl/cl/fp32/transpose.cl.inc" +#include "src/runtime/kernel/opencl/cl/transpose.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -45,11 +44,7 @@ int TransposeOpenCLKernel::Init() { ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else std::set build_options; -#ifdef ENABLE_FP16 - std::string source = transpose_source_fp16; -#else - std::string source = transpose_source_fp32; -#endif + std::string source = transpose_source; std::string program_name = "transpose"; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index dccbbdae47..96229e8ada 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -288,13 +288,13 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na if (fp16_enable_) { // fp16 enable, kernel will use half and read_imageh and write_imageh. build_options_str = - "-DFLOAT=half -DFLOAT4=half4 -DRI_F=read_imageh " - "-DWI_F=write_imageh"; + "-DFLT=half -DFLT4=half4 -DFLT16=half16 " + "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4"; } else { // fp16 not enable, kernel will use float and read_imagef and write_imagef. build_options_str = - "-DFLOAT=float -DFLOAT4=float4 -DRI_F=read_imagef " - "-DWI_F=write_imagef"; + "-DFLT=float -DFLT4=float4 -DFLT16=float16 " + "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4"; } build_options_str = std::accumulate(