change opencl code path

5 years ago · 605c2b0b16
parent 98565d8b54
commit 605c2b0b16
40 changed files with 216 additions and 511 deletions
--- a/build.sh
+++ b/build.sh
@ -460,24 +460,20 @@ build_gtest() {

 gene_clhpp() {  # turn each OpenCL source in CL_SRC_DIR into a <file>.inc C string header
    CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl"
-    for sub_dir in "${CL_SRC_DIR}"/*
+    if [ ! -d "${CL_SRC_DIR}" ]; then  # nothing to generate when the kernel directory is absent
+      return
+    fi
+    cd "${CL_SRC_DIR}/" || return  # never run the rm below from an unexpected directory
+    rm -rf *.inc  # drop previously generated headers before regenerating
+    echo "$(cd "$(dirname "$0")"; pwd)"
+    for file_path in "${CL_SRC_DIR}"/*
    do
-        data_type="$(basename ${sub_dir})"
-        if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then
-          continue
-        fi
-        cd ${CL_SRC_DIR}/${data_type}
-        rm -rf *.inc
-        echo "$(cd "$(dirname $0)"; pwd)"
-        for file_path in "${CL_SRC_DIR}/${data_type}"/*
-        do
-            file="$(basename ${file_path})"
-            inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'`
-            sed 's/^/\"/;s/$/    \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file}
-            kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
-	    sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file}
-            sed -i '$a\;' ${inc_file}
-        done
+        file="$(basename "${file_path}")"
+        inc_file="${CL_SRC_DIR}/${file}.inc"  # generated header sits next to its source
+        sed 's/^/\"/;s/$/    \\n\" \\/' "${CL_SRC_DIR}/${file}" > "${inc_file}"  # wrap every source line as a continued string literal
+        kernel_name=`echo "${file}" | sed s'/.\{3\}$//'`  # drop the last three characters of the file name (the ".cl" suffix)
+        sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" "${inc_file}"  # prepend the variable declaration
+        sed -i '$a\;' "${inc_file}"  # terminate the declaration
    done
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
@ -2,10 +2,7 @@

 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
 #define MIN(X, Y) (X < Y ? X : Y)
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
    tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
    tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
    tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
    tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
    tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
    tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
    tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
    tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
    tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }

@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    tmp.x = 1 / (1 + exp(-in_c4.x));
    tmp.y = 1 / (1 + exp(-in_c4.y));
    tmp.z = 1 / (1 + exp(-in_c4.z));
    tmp.w = 1 / (1 + exp(-in_c4.w));
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
@ -0,0 +1,101 @@
+#define divide_no_check(a, b) ((a) / (b))  // plain division, no zero check; arguments parenthesized so compound expressions expand safely
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;  // sampler shared by the *_IMG kernels in this file
+
+__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {  // output(X, Y) = input_a(X, Y) + input_b(X, Y)
+  int X = get_global_id(0);  // x coordinate handled by this work-item
+  int Y = get_global_id(1);  // y coordinate handled by this work-item
+  if (X >= output_shape.x || Y >= output_shape.y) {  // work-items outside output_shape write nothing
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));  // FLT4/READ_IMAGE/WRITE_IMAGE are not defined in this file; presumably supplied when the program is built
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a + b);  // component-wise sum stored at the same coordinate
+}
+
+__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {  // output(X, Y) = input_a(X, Y) - input_b(X, Y)
+  int X = get_global_id(0);  // x coordinate handled by this work-item
+  int Y = get_global_id(1);  // y coordinate handled by this work-item
+  if (X >= output_shape.x || Y >= output_shape.y) {  // work-items outside output_shape write nothing
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));  // FLT4/READ_IMAGE/WRITE_IMAGE are not defined in this file; presumably supplied when the program is built
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - b);  // component-wise difference stored at the same coordinate
+}
+
+__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {  // output(X, Y) = input_a(X, Y) * input_b(X, Y)
+  int X = get_global_id(0);  // x coordinate handled by this work-item
+  int Y = get_global_id(1);  // y coordinate handled by this work-item
+  if (X >= output_shape.x || Y >= output_shape.y) {  // work-items outside output_shape write nothing
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));  // FLT4/READ_IMAGE/WRITE_IMAGE are not defined in this file; presumably supplied when the program is built
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a * b);  // component-wise product stored at the same coordinate
+}
+
+__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {  // output(X, Y) = input_a(X, Y) / input_b(X, Y)
+  int X = get_global_id(0);  // x coordinate handled by this work-item
+  int Y = get_global_id(1);  // y coordinate handled by this work-item
+  if (X >= output_shape.x || Y >= output_shape.y) {  // work-items outside output_shape write nothing
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));  // FLT4/READ_IMAGE/WRITE_IMAGE are not defined in this file; presumably supplied when the program is built
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));  // divide_no_check expands to a plain division; zero divisors are not filtered
+}
+
+__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+                                 const int2 output_shape) {  // output(X, Y) = weight * input_a(X, Y) + bias, scalars applied to every component
+  int X = get_global_id(0);  // x coordinate handled by this work-item
+  int Y = get_global_id(1);  // y coordinate handled by this work-item
+  if (X >= output_shape.x || Y >= output_shape.y) {  // work-items outside output_shape write nothing
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));  // NOTE(review): "Boardcast" looks like a typo for "Broadcast"; kept because callers presumably look the kernel up by this name
+  WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
+}
+
+__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {  // output[idx] = input_a[idx] + input_b[idx] for idx < n
+  int idx = get_global_id(0);  // element index handled by this work-item
+  if (idx >= n) return;  // work-items at or beyond n write nothing
+  output[idx] = input_a[idx] + input_b[idx];
+}
+
+__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {  // output[idx] = input_a[idx] - input_b[idx] for idx < n
+  int idx = get_global_id(0);  // element index handled by this work-item
+  if (idx >= n) return;  // work-items at or beyond n write nothing
+  output[idx] = input_a[idx] - input_b[idx];
+}
+
+__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {  // output[idx] = input_a[idx] * input_b[idx] for idx < n
+  int idx = get_global_id(0);  // element index handled by this work-item
+  if (idx >= n) return;  // work-items at or beyond n write nothing
+  output[idx] = input_a[idx] * input_b[idx];
+}
+
+__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {  // output[idx] = input_a[idx] / input_b[idx] for idx < n
+  int idx = get_global_id(0);  // element index handled by this work-item
+  if (idx >= n) return;  // work-items at or beyond n write nothing
+  output[idx] = input_a[idx] / input_b[idx];  // quotient, not product; zero divisors are not filtered
+}
+
+__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
+                                 const unsigned int n) {  // output[idx] = weight * input_a[idx] + bias for idx < n
+  int idx = get_global_id(0);  // element index handled by this work-item
+  if (idx >= n) return;  // work-items at or beyond n write nothing
+  output[idx] = weight * input_a[idx] + bias;  // NOTE(review): "Boardcast" looks like a typo for "Broadcast"; kept because callers presumably look the kernel up by this name
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
@ -1,4 +1,3 @@
-#define FLT4 float4
 #define INT4 int4
 #define INT2 int2
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
@ -2,9 +2,6 @@

 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
    int index = num * 4;
    tmp.x = in_c4.x * alpha[index];
    tmp.y = in_c4.y * alpha[index + 1];
    tmp.z = in_c4.z * alpha[index + 2];
    tmp.w = in_c4.w * alpha[index + 3];
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
@ -1,5 +1,4 @@
 // #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define FLT4 float4
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
@ -1,8 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
@ -1,12 +1,3 @@
-#ifdef ENABLE_FP16
-#define FLT half
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#else
-#define FLT float
-#define FLT4 float4
-#define TO_FLT4 convert_float4
-#endif
 __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
                                         float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
@ -1,61 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
-                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
-  int h = get_global_id(0);
-  int kh = h % 2;
-  int src_h = h / 2;
-  src_h = src_h * 2;
-  int w = get_global_id(1);
-  int kw = w % 2;
-  int src_w = w / 2;
-  src_w = src_w * 2;
-  int co = get_global_id(2);
-  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
-  FLT4 r0 = (FLT4)(0.f);
-  FLT4 r1 = (FLT4)(0.f);
-  FLT4 r2 = (FLT4)(0.f);
-  FLT4 r3 = (FLT4)(0.f);
-  int base_w = (co * 4 + kh + kw * 2) * src_size.z;
-  for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
-    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
-    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
-    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
-    FLT16 weight_cache = weight[base_w++];
-    r0 += x0.x * weight_cache.s0123;
-    r0 += x0.y * weight_cache.s4567;
-    r0 += x0.z * weight_cache.s89ab;
-    r0 += x0.w * weight_cache.scdef;
-
-    r1 += x1.x * weight_cache.s0123;
-    r1 += x1.y * weight_cache.s4567;
-    r1 += x1.z * weight_cache.s89ab;
-    r1 += x1.w * weight_cache.scdef;
-
-    r2 += x2.x * weight_cache.s0123;
-    r2 += x2.y * weight_cache.s4567;
-    r2 += x2.z * weight_cache.s89ab;
-    r2 += x2.w * weight_cache.scdef;
-
-    r3 += x3.x * weight_cache.s0123;
-    r3 += x3.y * weight_cache.s4567;
-    r3 += x3.z * weight_cache.s89ab;
-    r3 += x3.w * weight_cache.scdef;
-  }
-  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
-  r0 += bias_val;
-  r1 += bias_val;
-  r2 += bias_val;
-  r3 += bias_val;
-
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
@ -1,76 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define ACCUM_FLT4 half4
-#define FLT half
-#define FLT2 half2
-#define FLT3 half3
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#define TO_ACCUM_TYPE convert_half4
-#define TO_ACCUM_FLT convert_half
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                     float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                     int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
-}
-
-__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                    float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                    int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
@ -1,45 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= HW.y || Y >= C.y) {
-    return;
-  }
-  FLT4 result[4];
-  result[0] = (FLT4)(0.0f);
-  result[1] = (FLT4)(0.0f);
-  result[2] = (FLT4)(0.0f);
-  result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
-  result[0].x = x0.x;
-  result[0].y = x1.x;
-  result[0].z = x2.x;
-  result[0].w = x3.x;
-
-  result[1].x = x0.y;
-  result[1].y = x1.y;
-  result[1].z = x2.y;
-  result[1].w = x3.y;
-
-  result[2].x = x0.z;
-  result[2].y = x1.z;
-  result[2].z = x2.z;
-  result[2].w = x3.z;
-
-  result[3].x = x0.w;
-  result[3].y = x1.w;
-  result[3].z = x2.w;
-  result[3].w = x3.w;
-
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@ -1,34 +0,0 @@
-__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b[idx];
-}
-
-__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b[idx];
-}
-
-__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b[idx];
-}
-
-__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] / input_b[idx];
-}
-
-__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
-                             const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = weight * input_a[idx] + bias;
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
@ -1,66 +0,0 @@
-#define divide_no_check(a, b) (a/b)
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
-__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a + b);
-}
-
-__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a - b);
-}
-
-__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a * b);
-}
-
-__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
-}
-
-__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
-                             const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), weight * a + bias);
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
@ -1,32 +0,0 @@
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
-  int2 gid = (int2)(get_global_id(0), get_global_id(1));
-  int2 lid = (int2)(get_local_id(0), get_local_id(1));
-  FLT4 result = (FLT4)(0.0f);
-  bool inside = gid.x < offset_co.y;
-  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    FLT16 w = weight[gid.x + i * offset_co.y];
-    result.x += dot(v, w.s0123);
-    result.y += dot(v, w.s4567);
-    result.z += dot(v, w.s89ab);
-    result.w += dot(v, w.scdef);
-  }
-  __local FLT4 temp[64][4];
-  temp[lid.x][lid.y] = result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (lid.y == 0 && inside) {
-    result += temp[lid.x][1];
-    result += temp[lid.x][2];
-    result += temp[lid.x][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
-    }
-    WRITE_IMAGE(output, (int2)(gid.x, 0), result);
-  }
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
@ -1,14 +0,0 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
@ -1,7 +1,3 @@
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
@ -1,7 +1,3 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
@ -1,5 +1,5 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 // what is mask and args.slices_x32
 __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                             const int slices, const int slices_x32) {
@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
  int tid = get_local_id(0);
  float sum = 0.0f;
  for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 src = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
    sum += dot((float4)(1.0f), exp(src));
  }
  if ((slices - 1) % 32 == tid) {
-    float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
    sum += dot(mask, exp(src));
  }

@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
  barrier(CLK_LOCAL_MEM_FENCE);
  sum = tmpx1[0];
  for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 result = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
    result = exp(result) * sum;
    output[i] = result;
  }
  if ((slices - 1) % 32 == tid) {
-    float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
    result = exp(result) * sum;
    __global float4 *remain_ptr4 = output;
    remain_ptr4 += slices - 1;
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
@ -1,22 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-// enum Format {
-//  Format_NCHW = 0,
-//  Format_NHWC = 1,
-//  Format_NHWC4 = 2,
-//  Format_HWKC = 3,
-//  Format_HWCK = 4,
-//  Format_KCHW = 5,
-//  Format_CKHW = 6,
-//  Format_KHWC = 7,
-//  Format_CHWK = 8,
-//  Format_NC4HW4 = 100,
-//  Format_NUM_OF_FORMAT = 101,
-//  Format_MIN = Format_NCHW,
-//  Format_MAX = Format_NUM_OF_FORMAT
-//};
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                          int4 shape) {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
@ -1,7 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
  int X = get_global_id(0);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -24,7 +24,7 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"

-#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
+#include "src/runtime/kernel/opencl/cl/activation.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
  }
  std::string program_name = "";
  std::string kernel_name = "";
-  std::string source = activation_source_fp32;
+  std::string source = activation_source;
  if (type_ == ActivationType_RELU) {
    program_name = "RELU";
    kernel_name = "Relu";
--- a/Show More
+++ b/Show More