From 0c9e9e5d82dcfb3d4346bedc3f93404cc0680f2f Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Tue, 13 Oct 2020 09:31:02 +0800 Subject: [PATCH] arithmetic support 2tensor --- .../runtime/kernel/opencl/cl/arithmetic.cl | 351 ++++++++++++------ .../kernel/opencl/kernel/arithmetic.cc | 228 ++++++------ .../runtime/kernel/opencl/kernel/arithmetic.h | 5 +- .../src/runtime/kernel/opencl/kernel/scale.cc | 58 ++- .../src/runtime/kernel/opencl/kernel/scale.h | 1 + .../runtime/kernel/opencl/kernel/transpose.cc | 6 + .../lite/src/runtime/kernel/opencl/utils.cc | 37 ++ .../lite/src/runtime/kernel/opencl/utils.h | 4 + mindspore/lite/test/run_test.sh | 5 +- 9 files changed, 421 insertions(+), 274 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index 13247d95c2..8f4cbccd41 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -3,7 +3,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -12,24 +12,13 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a + b); -} - -__kernel void ElementAddReLU_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - 
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a + b, (FLT4)(0.f))); + FLT4 result = a + b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -38,11 +27,13 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a - b); + FLT4 result = a - b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -51,11 +42,13 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a * b); + FLT4 result = a * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = 
get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -64,11 +57,13 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b)); + FLT4 result = divide_no_check(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -77,11 +72,13 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & AS_UINT4(b))); + FLT4 result = AS_FLT4(AS_UINT4(a) & AS_UINT4(b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -90,11 +87,13 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | AS_UINT4(b))); + FLT4 result = AS_FLT4(AS_UINT4(a) | AS_UINT4(b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + 
WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -103,11 +102,13 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a, b)); + FLT4 result = max(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -116,11 +117,14 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), min(a, b)); + FLT4 result = min(a, b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -129,11 +133,14 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, 
smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b))); + FLT4 result = floor(divide_no_check(a, b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -142,11 +149,14 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b); + FLT4 result = floor(divide_no_check(a, b)) * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -155,11 +165,13 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2)); + FLT4 result = pown((a - b), (int4)2); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 
output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -168,11 +180,15 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a == b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a == b ? (FLT4)1.f : (FLT4).0f; + // NOTE(review): clamping a boolean 0/1 comparison result with act_min/act_max looks suspicious -- confirm an activation should apply to Equal's output + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -181,11 +197,13 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a != b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a != b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -194,11 +212,14 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a < b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a < b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -207,11 +228,13 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a <= b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a <= b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -220,11 +243,14 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a > b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a > b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape) { + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -233,58 +259,117 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a >= b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a >= b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); +} + +__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a + b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? 
H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a - b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? 
N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a * b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); +} + +__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, float act_min, float act_max) { + int X = get_global_id(0); // C4 + int Y = get_global_id(1); // W + int Z = get_global_id(2); // N * H + int N = Z / output_shape.y; + int H = Z % output_shape.y; + if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { + return; + } + int a_c = X < a_shape.w ? X : a_shape.w - 1; + int a_w = Y < a_shape.z ? Y : a_shape.z - 1; + int a_h = H < a_shape.y ? H : a_shape.y - 1; + int a_n = N < a_shape.x ? N : a_shape.x - 1; + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); + int b_c = X < b_shape.w ? X : b_shape.w - 1; + int b_w = Y < b_shape.z ? Y : b_shape.z - 1; + int b_h = H < b_shape.y ? H : b_shape.y - 1; + int b_n = N < b_shape.x ? 
N : b_shape.x - 1; + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); + FLT4 result = a / b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); } -__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b); -} - -__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b); -} - -__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b); -} - -__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= output_shape.x || Y >= output_shape.y) { - return; - } - - FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b)); -} __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = 
get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -292,11 +377,13 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b))); + FLT4 result = AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -304,11 +391,13 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b))); + FLT4 result = AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -316,11 +405,13 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b)); + FLT4 result = max(a, (FLT4)b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = 
get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -328,11 +419,13 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b)); + FLT4 result = min(a, (FLT4)b); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -340,11 +433,13 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b))); + FLT4 result = floor(divide_no_check(a, (FLT4)b)); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -352,11 +447,13 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b); + FLT4 result = floor(divide_no_check(a, (FLT4)b)) * (FLT)b; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 
output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -364,11 +461,13 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2)); + FLT4 result = pown((a - (FLT4)b), (int4)2); + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -376,11 +475,13 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a == (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a == (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -388,11 +489,13 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a != (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a != (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -400,11 +503,13 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a < (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a < (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -412,11 +517,13 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a <= (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -424,11 +531,13 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a > (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a > (FLT4)b ? (FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape) { + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -436,7 +545,9 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f); + FLT4 result = a >= (FLT4)b ? 
(FLT4)1.f : (FLT4).0f; + result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + WRITE_IMAGE(output, (int2)(X, Y), result); } __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 40e9bc80a3..bc8e30bf49 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -32,13 +32,7 @@ using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { -ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() { - if (weight_ptr_ != nullptr) { - auto allocator = ocl_runtime_->GetAllocator(); - allocator->Free(weight_ptr_); - weight_ptr_ = nullptr; - } -} +ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {} std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { const size_t global_x = out_tensors_[0]->Width(); @@ -114,90 +108,83 @@ int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector *img_si } int ArithmeticOpenCLKernel::InitBuffer() { - const ArithmeticParameter *arithmetic_parameter = reinterpret_cast(op_parameter_); - if (!arithmetic_parameter->broadcasting_) { - if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->data_c() != nullptr) { + auto fp16_enable = ocl_runtime_->GetFp16Enable(); + auto data_size = fp16_enable ? 
sizeof(float16_t) : sizeof(float); + for (auto in_tensor_ : in_tensors_) { + auto nhwc_shape = GetNHWCShape(in_tensor_->shape()); + inputs_nhwc_shapes_.push_back(nhwc_shape); + if (in_tensor_->category() != lite::Tensor::Category::CONST || in_tensor_->data_c() == nullptr) { + inputs_weight_ptrs_.push_back(nullptr); + } else { auto allocator = ocl_runtime_->GetAllocator(); - std::vector img_size; - GetImageSize(0, &img_size); - int pack_weight_size = in_tensors_[1]->ElementsC4Num(); - int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width(); - int channel = in_tensors_[1]->Channel(); - int batch = in_tensors_[1]->Batch(); - - if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) { - if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) { - weight_ptr_ = - allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size); - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); + std::vector img_size = GetImage2dShapeFromNHWC(nhwc_shape, op_format_); + int pack_weight_size = img_size[0] * img_size[1] * C4NUM; + int plane = nhwc_shape[1] * nhwc_shape[2]; + int channel = nhwc_shape[3]; + int batch = nhwc_shape[0]; + img_size.push_back(fp16_enable ? 
CL_HALF_FLOAT : CL_FLOAT); + if (!fp16_enable) { + float *weight = new (std::nothrow) float[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { - if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { - float *weight = new (std::nothrow) float[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } + memset(weight, 0x00, pack_weight_size * data_size); + if (op_format_ == schema::Format_NHWC4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { std::function to_dtype = [](float x) -> float { return x; }; - PackNHWCToNC4HW4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } - std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; - PackNHWCToNC4HW4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); - return RET_ERROR; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + } else if (op_format_ == schema::Format_NC4HW4) 
{ + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float { return x; }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); } - } else { - MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " - << in_tensors_[0]->GetFormat(); + } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size); + inputs_weight_ptrs_.push_back(weight_ptr_); + delete[] weight; + } else { + float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { - if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { - float *weight = new (std::nothrow) float[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } - std::function to_dtype = [](float x) -> float { return x; }; - PackNHWCToNHWC4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; - if (weight == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed!"; - return RET_ERROR; - } + memset(weight, 0x00, pack_weight_size * data_size); + if (op_format_ == schema::Format_NHWC4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { 
std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; - PackNHWCToNHWC4(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype); - weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); - delete[] weight; - } else { - MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " - << in_tensors_[0]->data_type(); - return RET_ERROR; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float16_t { return x; }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + } else if (op_format_ == schema::Format_NC4HW4) { + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float16_t { return x; }; + PackNHWCToNC4HW4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); } - } else { - MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " - << in_tensors_[0]->GetFormat(); - return RET_ERROR; } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size); + inputs_weight_ptrs_.push_back(weight_ptr_); + delete[] weight; } } } @@ -211,7 +198,13 @@ int ArithmeticOpenCLKernel::Init() { if (arithmetic_parameter->broadcasting_) { element_flag_ = false; - kernel_name = "Broadcast"; + if (op_format_ == schema::Format_NHWC4) { + kernel_name = "BroadcastNHWC4"; + } else { + kernel_name = "BroadcastNC4HW4"; + MS_LOG(ERROR) << "Don't support BroadcastNC4HW4 yet"; + return 
RET_ERROR; + } } else { kernel_name = "Element"; } @@ -277,15 +270,14 @@ int ArithmeticOpenCLKernel::Init() { case schema::ActivationType_NO_ACTIVATION: break; case schema::ActivationType_RELU: - if (op_parameter_->type_ == PrimitiveType_Add && element_flag_) { - kernel_name += "ReLU"; - } else { - MS_LOG(ERROR) << "Only support ElementAdd + ReLU"; - return RET_ERROR; - } + activation_min_ = 0.f; + break; + case schema::ActivationType_RELU6: + activation_min_ = 0.f; + activation_max_ = 6.f; break; default: - MS_LOG(ERROR) << "Error activation type " << arithmetic_parameter->activation_type_; + MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_; return RET_ERROR; } @@ -328,47 +320,41 @@ int ArithmeticOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; int arg_idx = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - if (element_flag_) { - void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->data_c() : weight_ptr_; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight); + auto input_0_ptr = inputs_weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : inputs_weight_ptrs_[0]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr); + auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? 
in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + if (!element_flag_) { + cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], + UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); + cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], + UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); + auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); + cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); + ocl_runtime_->RunKernel(kernel_, + {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), + static_cast(out_shape[1] * out_shape[0])}, + {}, nullptr); } else { - float weight = 0.f; - if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { - weight = static_cast(in_tensors_[1]->data_c())[0]; - } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { - weight = static_cast(static_cast(in_tensors_[1]->data_c())[0]); - } else { - MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type(); - return RET_ERROR; - } - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight); + cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); + ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); } - 
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - - cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return RET_OK; } -kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const lite::PrimitiveC *primitive); - kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc, const mindspore::lite::PrimitiveC *primitive) { - const ArithmeticParameter *arithmetic_parameter = reinterpret_cast(opParameter); - if (arithmetic_parameter->broadcasting_) { - for (size_t i = 0; i < arithmetic_parameter->ndim_; i++) { - if (arithmetic_parameter->in_shape1_[i] != 0 && arithmetic_parameter->in_shape1_[i] != 1) { - return OpenCLBiasAddKernelCreator(inputs, outputs, opParameter, ctx, desc, primitive); - } - } - } auto *kernel = new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs, ctx); if (kernel == nullptr) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 650c67d88f..2bd469d914 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -42,7 +42,10 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; bool element_flag_{true}; - void *weight_ptr_{nullptr}; + float activation_min_{-FLT_MAX}; + float activation_max_{FLT_MAX}; + std::vector> inputs_nhwc_shapes_; + std::vector inputs_weight_ptrs_; std::vector local_size_; std::vector global_size_; diff --git 
a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index efe3e4fcf8..bfc9ad3d40 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -54,6 +54,12 @@ std::vector ScaleOpenCLKernel::InitGlobalSize() const { void ScaleOpenCLKernel::Image2dGetWorkGroupSize() { local_size_ = {16, 16}; + if (out_tensors_[0]->shape().size() == 2) { + size_t H = out_tensors_[0]->shape()[0]; + size_t W = UP_DIV(out_tensors_[0]->shape()[1], C4NUM); + global_size_ = {W, H}; + return; + } if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t W = out_tensors_[0]->Width(); @@ -78,18 +84,23 @@ void ScaleOpenCLKernel::BufferGetWorkGroupSize() { int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - im_dst_x = out_tensors_[0]->Width(); - im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { - im_dst_y = out_tensors_[0]->Batch(); - im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + if (out_tensors_[0]->shape().size() == 2) { + im_dst_x = UP_DIV(out_tensors_[0]->shape()[1], C4NUM); + im_dst_y = out_tensors_[0]->shape()[0]; } else { - MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); - return RET_ERROR; + if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + im_dst_x = out_tensors_[0]->Width(); + im_dst_y = out_tensors_[0]->Batch() * 
out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + im_dst_y = out_tensors_[0]->Batch(); + im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else { + MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); + return RET_ERROR; + } } size_t img_dtype = CL_FLOAT; @@ -114,7 +125,7 @@ int ScaleOpenCLKernel::InitBuffer() { auto allocator = ocl_runtime_->GetAllocator(); std::vector img_size; GetImageSize(0, &img_size); - if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) { + if (scale_C_flag_) { img_size[1] = 1; img_size[0] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM); scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size); @@ -256,8 +267,10 @@ int ScaleOpenCLKernel::Init() { if (scale_tensor->ElementsNum() == 1) { element_flag_ = false; kernel_name = "BoardcastScale"; - } else if (axis_ == 3 && scale_shape.size() == 1) { + } else if (((in_shape.size() == 4 && axis_ == 3) || (in_shape.size() == 2 && axis_ == 1)) && + scale_shape.size() == 1) { element_flag_ = true; + scale_C_flag_ = true; kernel_name = "Scale_C"; } } else { @@ -327,24 +340,9 @@ int ScaleOpenCLKernel::Run() { } } ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - int H = 0; - int W = 0; - if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - W = out_tensors_[0]->Width(); - } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - H = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); - W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else if 
(out_tensors_[0]->GetFormat() == schema::Format_NC4) { - H = out_tensors_[0]->Batch(); - W = UP_DIV(out_tensors_[0]->Channel(), C4NUM); - } else { - MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat(); - return RET_ERROR; - } - cl_int2 output_shape{W, H}; + cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); - if (element_flag_ && axis_ == 3) { + if (element_flag_ && scale_C_flag_) { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); } ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h index 8359396dd6..8f8c92adbc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -42,6 +42,7 @@ class ScaleOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; bool element_flag_{true}; + bool scale_C_flag_{false}; void *scale_ptr_{nullptr}; void *offset_ptr_{nullptr}; int axis_{0}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 9904a60d16..f79f10c513 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -27,6 +27,8 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Nchw2Nhwc; +using mindspore::schema::PrimitiveType_Nhwc2Nchw; using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { @@ -141,4 +143,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vectorUnmapBuffer(origin_data); } + +std::vector GetNHWCShape(const std::vector &tensor_shape) { + int n, 
h, w, c; + n = h = w = c = 1; + if (tensor_shape.size() == 1) { + c = tensor_shape[0]; + } else if (tensor_shape.size() == 2) { + n = tensor_shape[0]; + c = tensor_shape[1]; + } else if (tensor_shape.size() == 3) { + n = tensor_shape[0]; + h = tensor_shape[1]; + c = tensor_shape[2]; + } else if (tensor_shape.size() == 4) { + n = tensor_shape[0]; + h = tensor_shape[1]; + w = tensor_shape[2]; + c = tensor_shape[3]; + } + return {n, h, w, c}; +} + +std::vector GetImage2dShapeFromNHWC(const std::vector &tensor_shape, schema::Format format) { + if (tensor_shape.size() != 4) { + return {1, 1}; + } + size_t image_x, image_y; + image_x = image_y = 1; + if (format == schema::Format_NHWC4) { + image_x = tensor_shape[2] * UP_DIV(tensor_shape[3], C4NUM); + image_y = tensor_shape[0] * tensor_shape[1]; + } else if (format == schema::Format_NC4HW4) { + image_x = tensor_shape[2]; + image_y = tensor_shape[0] * tensor_shape[1] * UP_DIV(tensor_shape[3], C4NUM); + } + return {image_x, image_y}; +} } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 92cddb32ed..03931ffb56 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -48,6 +48,10 @@ void Write2File(void *mem, const std::string &file_name, int size); void PrintTensor(lite::Tensor *tensor, int num = 10, const std::string &out_file = ""); +std::vector GetNHWCShape(const std::vector &tensor_shape); + +std::vector GetImage2dShapeFromNHWC(const std::vector &tensor_shape, schema::Format format); + template void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function &to_dtype) { int c4 = UP_DIV(channel, C4NUM); diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh index 44ade02040..fdf6c642a1 100755 --- a/mindspore/lite/test/run_test.sh +++ b/mindspore/lite/test/run_test.sh @@ -37,8 +37,7 @@ cp -fr 
$TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*" ./lite-test --gtest_filter="TestAvgPoolingOpenCL*" ./lite-test --gtest_filter="TestConv2dTransposeOpenCL*" -./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*" -./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*" +./lite-test --gtest_filter="TestMatMulOpenCL*" ./lite-test --gtest_filter="TestMaxPoolingOpenCL*" ./lite-test --gtest_filter="TestReduceOpenCL*" ./lite-test --gtest_filter="TestReshapeOpenCL*" @@ -46,3 +45,5 @@ cp -fr $TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestTransposeOpenCL*" ./lite-test --gtest_filter="TestArithmeticOpenCL*" ./lite-test --gtest_filter="TestScaleOpenCL*" +./lite-test --gtest_filter="TestFullConnectionOpenCL*" +./lite-test --gtest_filter="TestResizeOpenCL*"