From 605c2b0b16f0ef973e904471ef4644d9793adc6f Mon Sep 17 00:00:00 2001
From: chenzupeng <chenzupeng@huawei.com>
Date: Thu, 20 Aug 2020 14:30:20 +0800
Subject: [PATCH] change opencl code path

---
 build.sh                                      |  30 +++---
 .../kernel/opencl/cl/{fp32 => }/activation.cl |  19 ++--
 .../runtime/kernel/opencl/cl/arithmetic.cl    | 101 ++++++++++++++++++
 .../kernel/opencl/cl/{fp32 => }/avg_pool2d.cl |   0
 .../kernel/opencl/cl/{fp32 => }/batchnorm.cl  |   1 -
 .../opencl/cl/{fp32 => }/caffe_prelu.cl       |   7 +-
 .../kernel/opencl/cl/{fp32 => }/concat.cl     |   1 -
 .../cl/{fp32 => }/conv2d_transpose2x2.cl      |   5 -
 .../opencl/cl/{fp32 => }/convolution.cl       |   0
 .../opencl/cl/{fp32 => }/depthwise_conv2d.cl  |   9 --
 .../opencl/cl/fp16/conv2d_transpose2x2.cl     |  61 -----------
 .../kernel/opencl/cl/fp16/depthwise_conv2d.cl |  76 -------------
 .../kernel/opencl/cl/fp16/transpose.cl        |  45 --------
 .../opencl/cl/fp32/arithmetic_buffer.cl       |  34 ------
 .../opencl/cl/fp32/arithmetic_image2d.cl      |  66 ------------
 .../runtime/kernel/opencl/cl/fp32/matmul.cl   |  32 ------
 .../runtime/kernel/opencl/cl/fp32/reshape.cl  |  14 ---
 .../kernel/opencl/cl/{fp16 => }/matmul.cl     |   4 -
 .../kernel/opencl/cl/{fp32 => }/max_pool2d.cl |   0
 .../kernel/opencl/cl/{fp16 => }/reshape.cl    |   4 -
 .../kernel/opencl/cl/{fp32 => }/softmax.cl    |   0
 .../kernel/opencl/cl/{fp32 => }/softmax1x1.cl |  10 +-
 .../kernel/opencl/cl/{fp32 => }/to_format.cl  |  19 ----
 .../kernel/opencl/cl/{fp32 => }/transpose.cl  |   4 -
 .../kernel/opencl/kernel/activation.cc        |   4 +-
 .../kernel/opencl/kernel/arithmetic.cc        |  10 +-
 .../runtime/kernel/opencl/kernel/batchnorm.cc |   4 +-
 .../kernel/opencl/kernel/caffe_prelu.cc       |   4 +-
 .../runtime/kernel/opencl/kernel/concat.cc    |   6 +-
 .../kernel/opencl/kernel/conv2d_transpose.cc  |   9 +-
 .../kernel/opencl/kernel/depthwise_conv2d.cc  |   9 +-
 .../runtime/kernel/opencl/kernel/matmul.cc    |   9 +-
 .../runtime/kernel/opencl/kernel/pooling2d.cc |   8 +-
 .../src/runtime/kernel/opencl/kernel/prelu.cc |   4 +-
 .../runtime/kernel/opencl/kernel/reshape.cc   |   9 +-
 .../runtime/kernel/opencl/kernel/softmax.cc   |   8 +-
 .../runtime/kernel/opencl/kernel/to_format.cc |  82 +++++++-------
 .../runtime/kernel/opencl/kernel/to_format.h  |   2 +
 .../runtime/kernel/opencl/kernel/transpose.cc |   9 +-
 .../lite/src/runtime/opencl/opencl_runtime.cc |   8 +-
 40 files changed, 216 insertions(+), 511 deletions(-)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/activation.cl (73%)
 create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/avg_pool2d.cl (100%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/batchnorm.cl (98%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/caffe_prelu.cl (73%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/concat.cl (98%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/conv2d_transpose2x2.cl (95%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/convolution.cl (100%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/depthwise_conv2d.cl (98%)
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
 delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp16 => }/matmul.cl (91%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/max_pool2d.cl (100%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp16 => }/reshape.cl (82%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/softmax.cl (100%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/softmax1x1.cl (89%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/to_format.cl (95%)
 rename mindspore/lite/src/runtime/kernel/opencl/cl/{fp32 => }/transpose.cl (95%)

diff --git a/build.sh b/build.sh
index 094f0d1de1..911135906a 100755
--- a/build.sh
+++ b/build.sh
@@ -460,24 +460,20 @@ build_gtest() {
 
 gene_clhpp() {
     CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl"
-    for sub_dir in "${CL_SRC_DIR}"/*
+    if [ ! -d ${CL_SRC_DIR} ]; then
+      return
+    fi
+    cd ${CL_SRC_DIR}/
+    rm -rf *.inc
+    echo "$(cd "$(dirname $0)"; pwd)"
+    for file_path in "${CL_SRC_DIR}"/*
     do
-        data_type="$(basename ${sub_dir})"
-        if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then
-          continue
-        fi
-        cd ${CL_SRC_DIR}/${data_type}
-        rm -rf *.inc
-        echo "$(cd "$(dirname $0)"; pwd)"
-        for file_path in "${CL_SRC_DIR}/${data_type}"/*
-        do
-            file="$(basename ${file_path})"
-            inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'`
-            sed 's/^/\"/;s/$/    \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file}
-            kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
-	    sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file}
-            sed -i '$a\;' ${inc_file}
-        done
+        file="$(basename ${file_path})"
+        inc_file=`echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/'`
+        sed 's/^/\"/;s/$/    \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
+        kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
+  sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
+        sed -i '$a\;' ${inc_file}
     done
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
similarity index 73%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
index e9c1f2519f..20287b25f7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
@@ -2,10 +2,7 @@
 
 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
 #define MIN(X, Y) (X < Y ? X : Y)
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
 __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
   int Y = get_global_id(0);  // height id
   int X = get_global_id(1);  // weight id
   for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
     FLT4 tmp;
     tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
     tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
     tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
     tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
   }
 }
 
@@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
   int Y = get_global_id(0);  // height id
   int X = get_global_id(1);  // weight id
   for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
     FLT4 tmp;
     tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
     tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
     tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
     tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
   }
 }
 
@@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
   int Y = get_global_id(0);  // height id
   int X = get_global_id(1);  // weight id
   for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
     FLT4 tmp;
     tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
     tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
     tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
     tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
   }
 }
 
@@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
   int Y = get_global_id(0);  // height id
   int X = get_global_id(1);  // weight id
   for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
     FLT4 tmp;
     tmp.x = 1 / (1 + exp(-in_c4.x));
     tmp.y = 1 / (1 + exp(-in_c4.y));
     tmp.z = 1 / (1 + exp(-in_c4.z));
     tmp.w = 1 / (1 + exp(-in_c4.w));
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
new file mode 100644
index 0000000000..497d789a64
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
@@ -0,0 +1,101 @@
+#define divide_no_check(a, b) (a / b)
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a + b);
+}
+
+__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - b);
+}
+
+__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a * b);
+}
+
+__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
+}
+
+__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+                                 const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
+}
+
+__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] + input_b[idx];
+}
+
+__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] - input_b[idx];
+}
+
+__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] * input_b[idx];
+}
+
+__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] * input_b[idx];
+}
+
+__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
+                                 const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = weight * input_a[idx] + bias;
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
index a68141329b..554674da6a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/batchnorm.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
@@ -1,4 +1,3 @@
-#define FLT4 float4
 #define INT4 int4
 #define INT2 int2
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
similarity index 73%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
index d162be0839..bfaec1dfe1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
@@ -2,9 +2,6 @@
 
 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define FLT4 float4
-#define READ_FLT4 read_imagef
-#define WRITE_FLT4 write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
 __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
@@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
   int Y = get_global_id(0);  // height id
   int X = get_global_id(1);  // weight id
   for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
-    FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
+    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
     FLT4 tmp;
     int index = num * 4;
     tmp.x = in_c4.x * alpha[index];
     tmp.y = in_c4.y * alpha[index + 1];
     tmp.z = in_c4.z * alpha[index + 2];
     tmp.w = in_c4.w * alpha[index + 3];
-    WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
+    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
index 6758707012..c16daf9c81 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
@@ -1,5 +1,4 @@
 // #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define FLT4 float4
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 
 __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
similarity index 95%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
index 7a22a10efe..2014bb589d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
@@ -1,8 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
                                   __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
index f7944e9a2b..a6e1c322e1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@@ -1,12 +1,3 @@
-#ifdef ENABLE_FP16
-#define FLT half
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#else
-#define FLT float
-#define FLT4 float4
-#define TO_FLT4 convert_float4
-#endif
 __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
                                          float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
deleted file mode 100644
index e166e699c5..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
+++ /dev/null
@@ -1,61 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
-                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
-  int h = get_global_id(0);
-  int kh = h % 2;
-  int src_h = h / 2;
-  src_h = src_h * 2;
-  int w = get_global_id(1);
-  int kw = w % 2;
-  int src_w = w / 2;
-  src_w = src_w * 2;
-  int co = get_global_id(2);
-  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
-  FLT4 r0 = (FLT4)(0.f);
-  FLT4 r1 = (FLT4)(0.f);
-  FLT4 r2 = (FLT4)(0.f);
-  FLT4 r3 = (FLT4)(0.f);
-  int base_w = (co * 4 + kh + kw * 2) * src_size.z;
-  for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
-    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
-    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
-    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
-    FLT16 weight_cache = weight[base_w++];
-    r0 += x0.x * weight_cache.s0123;
-    r0 += x0.y * weight_cache.s4567;
-    r0 += x0.z * weight_cache.s89ab;
-    r0 += x0.w * weight_cache.scdef;
-
-    r1 += x1.x * weight_cache.s0123;
-    r1 += x1.y * weight_cache.s4567;
-    r1 += x1.z * weight_cache.s89ab;
-    r1 += x1.w * weight_cache.scdef;
-
-    r2 += x2.x * weight_cache.s0123;
-    r2 += x2.y * weight_cache.s4567;
-    r2 += x2.z * weight_cache.s89ab;
-    r2 += x2.w * weight_cache.scdef;
-
-    r3 += x3.x * weight_cache.s0123;
-    r3 += x3.y * weight_cache.s4567;
-    r3 += x3.z * weight_cache.s89ab;
-    r3 += x3.w * weight_cache.scdef;
-  }
-  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
-  r0 += bias_val;
-  r1 += bias_val;
-  r2 += bias_val;
-  r3 += bias_val;
-
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
deleted file mode 100644
index 2725ca9261..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
+++ /dev/null
@@ -1,76 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define ACCUM_FLT4 half4
-#define FLT half
-#define FLT2 half2
-#define FLT3 half3
-#define FLT4 half4
-#define TO_FLT4 convert_half4
-#define TO_ACCUM_TYPE convert_half4
-#define TO_ACCUM_FLT convert_half
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                     float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                     int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
-}
-
-__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
-                                    float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
-                                    int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
-  int fx_c = Z * kernel_size.x * kernel_size.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 f = filters[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_ACCUM_TYPE(src_final * f);
-      }
-      fx_c++;
-    }
-  }
-  FLT4 bias_val = biases[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
deleted file mode 100644
index 73c4c076e7..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= HW.y || Y >= C.y) {
-    return;
-  }
-  FLT4 result[4];
-  result[0] = (FLT4)(0.0f);
-  result[1] = (FLT4)(0.0f);
-  result[2] = (FLT4)(0.0f);
-  result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
-  result[0].x = x0.x;
-  result[0].y = x1.x;
-  result[0].z = x2.x;
-  result[0].w = x3.x;
-
-  result[1].x = x0.y;
-  result[1].y = x1.y;
-  result[1].z = x2.y;
-  result[1].w = x3.y;
-
-  result[2].x = x0.z;
-  result[2].y = x1.z;
-  result[2].z = x2.z;
-  result[2].w = x3.z;
-
-  result[3].x = x0.w;
-  result[3].y = x1.w;
-  result[3].z = x2.w;
-  result[3].w = x3.w;
-
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
-  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
deleted file mode 100644
index 8b3f2c7ffd..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b[idx];
-}
-
-__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b[idx];
-}
-
-__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b[idx];
-}
-
-__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
-                         const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] / input_b[idx];
-}
-
-__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
-                             const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = weight * input_a[idx] + bias;
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
deleted file mode 100644
index 45b8f90b53..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+++ /dev/null
@@ -1,66 +0,0 @@
-#define divide_no_check(a, b) (a/b)
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
-__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a + b);
-}
-
-__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a - b);
-}
-
-__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), a * b);
-}
-
-__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
-                         const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
-}
-
-__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
-                             const int2 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x || Y >= output_shape.y) {
-    return;
-  }
-
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
-  write_imagef(output, (int2)(X, Y), weight * a + bias);
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
deleted file mode 100644
index 1dcc884e0e..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
+++ /dev/null
@@ -1,32 +0,0 @@
-#define FLT4 float4
-#define FLT16 float16
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                     __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
-  int2 gid = (int2)(get_global_id(0), get_global_id(1));
-  int2 lid = (int2)(get_local_id(0), get_local_id(1));
-  FLT4 result = (FLT4)(0.0f);
-  bool inside = gid.x < offset_co.y;
-  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    FLT16 w = weight[gid.x + i * offset_co.y];
-    result.x += dot(v, w.s0123);
-    result.y += dot(v, w.s4567);
-    result.z += dot(v, w.s89ab);
-    result.w += dot(v, w.scdef);
-  }
-  __local FLT4 temp[64][4];
-  temp[lid.x][lid.y] = result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (lid.y == 0 && inside) {
-    result += temp[lid.x][1];
-    result += temp[lid.x][2];
-    result += temp[lid.x][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
-    }
-    WRITE_IMAGE(output, (int2)(gid.x, 0), result);
-  }
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
deleted file mode 100644
index e752c4b3f5..0000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
similarity index 91%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
index c121f824bd..480be10107 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@@ -1,7 +1,3 @@
-#define FLT4 half4
-#define FLT16 half16
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
                      __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
similarity index 82%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
index 867d1e3d18..bb9892b575 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@@ -1,7 +1,3 @@
-#define FLT half
-#define FLT4 half4
-#define READ_IMAGE read_imageh
-#define WRITE_IMAGE write_imageh
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
   int X = get_global_id(0);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
similarity index 89%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
index 672d56c08c..68cdad1f70 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
@@ -1,5 +1,5 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 // what is mask and args.slices_x32
 __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                              const int slices, const int slices_x32) {
@@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
   int tid = get_local_id(0);
   float sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 src = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
     sum += dot((float4)(1.0f), exp(src));
   }
   if ((slices - 1) % 32 == tid) {
-    float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
     sum += dot(mask, exp(src));
   }
 
@@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
   for (size_t i = tid; i < slices - 1; i += 32) {
-    float4 result = read_imagef(input, smp_none, (int2)(i, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
     result = exp(result) * sum;
     output[i] = result;
   }
   if ((slices - 1) % 32 == tid) {
-    float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
+    float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
     result = exp(result) * sum;
     __global float4 *remain_ptr4 = output;
     remain_ptr4 += slices - 1;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
similarity index 95%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index 0811d7ce52..31b6e02b55 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -1,22 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
-// enum Format {
-//  Format_NCHW = 0,
-//  Format_NHWC = 1,
-//  Format_NHWC4 = 2,
-//  Format_HWKC = 3,
-//  Format_HWCK = 4,
-//  Format_KCHW = 5,
-//  Format_CKHW = 6,
-//  Format_KHWC = 7,
-//  Format_CHWK = 8,
-//  Format_NC4HW4 = 100,
-//  Format_NUM_OF_FORMAT = 101,
-//  Format_MIN = Format_NCHW,
-//  Format_MAX = Format_NUM_OF_FORMAT
-//};
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
similarity index 95%
rename from mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
rename to mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
index 9a71aa4c0d..05f903602e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@@ -1,7 +1,3 @@
-#define FLT float
-#define FLT4 float4
-#define READ_IMAGE read_imagef
-#define WRITE_IMAGE write_imagef
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
   int X = get_global_id(0);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index 94c86eb4c2..6124803173 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -24,7 +24,7 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 
-#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
+#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
   }
   std::string program_name = "";
   std::string kernel_name = "";
-  std::string source = activation_source_fp32;
+  std::string source = activation_source;
   if (type_ == ActivationType_RELU) {
     program_name = "RELU";
     kernel_name = "Relu";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index 84aad52780..d52b68ffc4 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -22,8 +22,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/arithmetic.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -109,9 +108,14 @@ int ArithmeticOpenCLKernel::Init() {
     error_code = RET_ERROR;
   }
 #else
+  if (out_mem_type_ == OpenCLMemType::IMG) {
+    kernel_name += "_IMG";
+  } else {
+    kernel_name += "_BUF";
+  }
   std::string program_name = "Arithmetic";
   std::set<std::string> build_options;
-  std::string source = arithmetic_image2d_source_fp32;
+  std::string source = arithmetic_source;
   runtime_->LoadSource(program_name, source);
   error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
index 88c752ebba..b1b5cbb367 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@@ -20,7 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/batchnorm.h"
-#include "src/runtime/kernel/opencl/cl/fp32/batchnorm.cl.inc"
+#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -50,7 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
 }
 int BatchNormOpenCLKernel::Init() {
   std::set<std::string> build_options;
-  std::string source = batchnorm_source_fp32;
+  std::string source = batchnorm_source;
   std::string program_name = "batch_normalization";
   std::string kernel_name = "batch_normalization";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
index 1ffa18e5e7..9db594a30d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
@@ -24,7 +24,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/kernel/caffe_prelu.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl.inc"
+#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc"
 #include "src/runtime/kernel/arm/nnacl/caffeprelu.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -57,7 +57,7 @@ int CaffePReluOpenCLKernel::Init() {
   }
   CaffeWeight();
   std::set<std::string> build_options;
-  std::string source = caffe_prelu_source_fp32;
+  std::string source = caffe_prelu_source;
   std::string program_name = "CaffePRelu";
   std::string kernel_name = "CaffePRelu";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index af5e54c19b..d2b4fadce7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -20,7 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/concat.h"
-#include "src/runtime/kernel/opencl/cl/fp32/concat.cl.inc"
+#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
   }
   if (in_tensors_.size() == 2) {
     std::set<std::string> build_options;
-    std::string source = concat_source_fp32;
+    std::string source = concat_source;
     std::string program_name = "Concat";
     std::string kernel_name = "Concat";
     auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
@@ -76,7 +76,7 @@ int ConcatOpenCLKernel::Init() {
 
   if (in_tensors_.size() == 3) {
     std::set<std::string> build_options;
-    std::string source = concat_source_fp32;
+    std::string source = concat_source;
     std::string program_name = "Concat3input";
     std::string kernel_name = "Concat3input";
     auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index bb9276e671..027bcd664e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -20,8 +20,7 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl.inc"
+#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -45,11 +44,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
   ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
-#ifdef ENABLE_FP16
-  std::string source = conv2d_transpose2x2_source_fp16;
-#else
-  std::string source = conv2d_transpose2x2_source_fp32;
-#endif
+  std::string source = conv2d_transpose2x2_source;
   std::set<std::string> build_options;
   std::string program_name = "conv2d_transpose2x2";
   ocl_runtime->LoadSource(program_name, source);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 07ea195514..421a1dce00 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -26,8 +26,7 @@
 
 #ifndef PROGRAM_WITH_IL
 
-#include "src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/depthwise_conv2d.cl.inc"
 
 #endif
 
@@ -68,11 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() {
 #else
   std::string program_name = "DepthwiseConv2d";
   std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = depthwise_conv2d_source_fp16;
-#else
-  std::string source = depthwise_conv2d_source_fp32;
-#endif
+  std::string source = depthwise_conv2d_source;
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 8b8dac11b7..b724e5f7e1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -21,8 +21,7 @@
 #include "src/runtime/kernel/arm/nnacl/fp32/matmul.h"
 #include "src/runtime/kernel/opencl/kernel/matmul.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/matmul.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/matmul.cl.inc"
+#include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -40,11 +39,7 @@ int MatMulOpenCLKernel::Init() {
   ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = matmul_source_fp16;
-#else
-  std::string source = matmul_source_fp32;
-#endif
+  std::string source = matmul_source;
   std::string program_name = "MatMul";
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index 7c1daf618a..276cd699bd 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -24,8 +24,8 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/image_format.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
+#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -46,13 +46,13 @@ int PoolingOpenCLKernel::Init() {
   if (parameter_->max_pooling_) {
     kernel_name = "MaxPooling2d";
 #ifndef PROGRAM_WITH_IL
-    source = max_pool2d_source_fp32;
+    source = max_pool2d_source;
     program_name = "MaxPooling2d";
 #endif
   } else if (parameter_->avg_pooling_) {
     kernel_name = "AvgPooling2d";
 #ifndef PROGRAM_WITH_IL
-    source = avg_pool2d_source_fp32;
+    source = avg_pool2d_source;
     program_name = "AvgPooling2d";
 #endif
   } else {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
index ed5c0803bd..7ff7993035 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@@ -23,7 +23,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/kernel/prelu.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
+#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
 #include "src/runtime/kernel/arm/nnacl/prelu_parameter.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -40,7 +40,7 @@ int PReluOpenCLKernel::Init() {
     return RET_ERROR;
   }
   std::set<std::string> build_options;
-  std::string source = activation_source_fp32;
+  std::string source = activation_source;
   std::string program_name = "PRelu";
   std::string kernel_name = "ReluScalar";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index 064caffaf6..e58013e493 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -20,8 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/reshape.h"
-#include "src/runtime/kernel/opencl/cl/fp16/reshape.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/reshape.cl.inc"
+#include "src/runtime/kernel/opencl/cl/reshape.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -39,11 +38,7 @@ int ReshapeOpenCLKernel::Init() {
   ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = reshape_source_fp16;
-#else
-  std::string source = reshape_source_fp32;
-#endif
+  std::string source = reshape_source;
   std::string program_name = "reshape";
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
index 1fc02fc03f..9d55ba5d53 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -22,8 +22,8 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp32/softmax.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl.inc"
+#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
+#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -88,7 +88,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 int SoftmaxOpenCLKernel::Init() {
   std::string kernel_name = "SoftMax";
   std::string program_name = "SoftMax";
-  std::string source = softmax_source_fp32;
+  std::string source = softmax_source;
   runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
   // framework not set this param yet! just use default.
   if (parameter_->axis_ == -1) {
@@ -101,7 +101,7 @@ int SoftmaxOpenCLKernel::Init() {
     // support 2d tensor
     kernel_name += "1x1";
     program_name += "1x1";
-    source = softmax1x1_source_fp32;
+    source = softmax1x1_source;
     onexone_flag_ = true;
   } else {
     MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 323bb5b296..92c2c03479 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -22,7 +22,7 @@
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc"
+#include "src/runtime/kernel/opencl/cl/to_format.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -37,10 +37,9 @@ int ToFormatOpenCLKernel::Init() {
   auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
   out_mem_type_ = parameter->out_mem_type;
   std::string program_name = "to_format";
-  std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},
-                                                   {schema::Format_NHWC, "NHWC"},
-                                                   {schema::Format_NC4HW4, "NC4HW4"},
-                                                   {schema::Format_NHWC4, "NHWC4"}};
+  std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},     {schema::Format_NHWC, "NHWC"},
+                                                   {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NC4, "NHWC4"},
+                                                   {schema::Format_NC, "NHWC"},       {schema::Format_NHWC4, "NHWC4"}};
   std::string kernel_name =
     "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
   if (out_mem_type_ == OpenCLMemType::IMG) {
@@ -49,49 +48,54 @@ int ToFormatOpenCLKernel::Init() {
     kernel_name += "_BUF";
   }
 
+  this->set_name(kernel_name);
 #ifdef PROGRAM_WITH_IL
   ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = to_format_source_fp16;
-#else
-  std::string source = to_format_source_fp32;
-#endif
+  std::string source = to_format_source;
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+  InitNHWCShape();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
-
-int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
+int ToFormatOpenCLKernel::InitNHWCShape() {
   std::vector<int> shapex = out_tensors_[0]->shape();
+  size_t n, h, w, c;
   if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
-    int h = shapex[1];
-    int w = shapex[2];
-    int c = shapex[3];
-    int c4 = UP_DIV(c, C4NUM);
-    std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4};
-    *global_size = std::move(vec);
+    n = shapex[0];
+    h = shapex[1];
+    w = shapex[2];
+    c = shapex[3];
   } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
              out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
-    int h = shapex[2];
-    int w = shapex[3];
-    int c = shapex[1];
-    int c4 = UP_DIV(c, C4NUM);
-    std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w};
-    *global_size = std::move(vec);
-  } else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
-    int h = shapex[2];
-    int w = shapex[3];
-    int c = shapex[1];
-    int w4 = UP_DIV(w, C4NUM);
-    std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c};
-    *global_size = std::move(vec);
+    n = shapex[0];
+    h = shapex[2];
+    w = shapex[3];
+    c = shapex[1];
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4 || out_tensors_[0]->GetFormat() == schema::Format_NC) {
+    n = shapex[0];
+    h = 1;
+    w = 1;
+    c = shapex[1];
+  } else {
+    n = shapex[0];
+    h = shapex[1];
+    w = shapex[2];
+    c = shapex[3];
   }
+  nhwc_shape_ = {n, h, w, c};
+  return RET_OK;
+}
+
+int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
+
+int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
+  std::vector<size_t> vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
+  *global_size = std::move(vec);
   return RET_OK;
 }
 int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
@@ -114,6 +118,12 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
     int c = shapex[3];
     im_dst_x = w * UP_DIV(c, C4NUM);
     im_dst_y = h;
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
+    int h = 1;
+    int w = 1;
+    int c = shapex[1];
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = h;
   } else {
     MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
   }
@@ -128,15 +138,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
   return RET_OK;
 }
 int ToFormatOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << "ToFormat"
-                << " Running!";
+  MS_LOG(DEBUG) << this->name() << " Running!";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   std::vector<size_t> local = {};
   std::vector<size_t> global;
   GetGlobalSize(0, &global);
-  auto shapex = in_tensors_[0]->shape();
-  cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1,
-                shapex.size() > 3 ? shapex[3] : 1};
+
+  cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
   cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
   ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
   ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
index 09d31b7454..febf17cd4d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
@@ -37,9 +37,11 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
   int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
   int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
+  int InitNHWCShape();
 
  private:
   cl::Kernel kernel_;
+  std::vector<size_t> nhwc_shape_;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
index a42f39195a..c57dcf72ac 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@@ -21,8 +21,7 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/transpose.h"
 #ifndef PROGRAM_WITH_IL
-#include "src/runtime/kernel/opencl/cl/fp16/transpose.cl.inc"
-#include "src/runtime/kernel/opencl/cl/fp32/transpose.cl.inc"
+#include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
 #endif
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -45,11 +44,7 @@ int TransposeOpenCLKernel::Init() {
   ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::set<std::string> build_options;
-#ifdef ENABLE_FP16
-  std::string source = transpose_source_fp16;
-#else
-  std::string source = transpose_source_fp32;
-#endif
+  std::string source = transpose_source;
   std::string program_name = "transpose";
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
index dccbbdae47..96229e8ada 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@@ -288,13 +288,13 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
   if (fp16_enable_) {
     // fp16 enable, kernel will use half and read_imageh and write_imageh.
     build_options_str =
-      "-DFLOAT=half -DFLOAT4=half4 -DRI_F=read_imageh "
-      "-DWI_F=write_imageh";
+      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
+      "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4";
   } else {
     // fp16 not enable, kernel will use float and read_imagef and write_imagef.
     build_options_str =
-      "-DFLOAT=float -DFLOAT4=float4 -DRI_F=read_imagef "
-      "-DWI_F=write_imagef";
+      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
+      "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4";
   }
 
   build_options_str = std::accumulate(