diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
index f02024a1c6..7f79bcc243 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
@@ -54,7 +54,307 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_
   WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
 }
 
-__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & as_int4(b)));
+}
+
+__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                            const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | as_int4(b)));
+}
+
+__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), max(a, b));
+}
+
+__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), min(a, b));
+}
+
+__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(a / b));
+}
+
+__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b);
+}
+
+__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                           __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2));
+}
+
+__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                               __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b));
+}
+
+__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b));
+}
+
+__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                              __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b));
+}
+
+__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                   __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b));
+}
+
+__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                 __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b));
+}
+
+__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                      __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b));
+}
+
+__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b);
+}
+
+__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b);
+}
+
+__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b);
+}
+
+__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b));
+}
+__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & (int4)(b)));
+}
+
+__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                              const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | (int4)b));
+}
+
+__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b));
+}
+
+__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b));
+}
+
+__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b));
+}
+
+__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b);
+}
+
+__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                             const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2));
+}
+
+__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                  const int2 output_shape) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -63,7 +363,67 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
   }
 
   FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
-  WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias);
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b));
+}
+
+__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b));
+}
+
+__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b));
+}
+
+__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                     const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b));
+}
+
+__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                   const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b));
+}
+
+__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                        const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b));
 }
 
 __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
@@ -94,9 +454,26 @@ __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, _
   output[idx] = input_a[idx] * input_b[idx];
 }
 
-__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
-                                 const unsigned int n) {
+__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] + (FLT)b;
+}
+
+__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] - (FLT)b;
+}
+
+__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = input_a[idx] * (FLT)b;
+}
+
+__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
   int idx = get_global_id(0);
   if (idx >= n) return;
-  output[idx] = weight * input_a[idx] + bias;
+  output[idx] = divide_no_check(input_a[idx], (FLT)b);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index 9700ddc3d8..826d9ddc2c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -28,6 +28,7 @@
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_Eltwise;
 
 namespace mindspore::kernel {
 
@@ -130,18 +131,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
               MS_LOG(ERROR) << "Malloc buffer failed!";
               return RET_ERROR;
             }
-            std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+            std::function<float(float)> to_dtype = [](float x) -> float { return x; };
             PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
             weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
             delete[] weight;
           } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-            int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+            float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
             if (weight == nullptr) {
               MS_LOG(ERROR) << "Malloc buffer failed!";
               return RET_ERROR;
             }
-            std::function<int16_t(float)> to_dtype = Float32ToShort;
-            PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+            PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
             weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
             delete[] weight;
           } else {
@@ -162,18 +163,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
               MS_LOG(ERROR) << "Malloc buffer failed!";
               return RET_ERROR;
             }
-            std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+            std::function<float(float)> to_dtype = [](float x) -> float { return x; };
             PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
             weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
             delete[] weight;
           } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-            int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+            float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
             if (weight == nullptr) {
               MS_LOG(ERROR) << "Malloc buffer failed!";
               return RET_ERROR;
             }
-            std::function<int16_t(float)> to_dtype = Float32ToShort;
-            PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+            PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
             weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
             delete[] weight;
           } else {
@@ -197,28 +198,69 @@ int ArithmeticOpenCLKernel::Init() {
   std::string kernel_name;
 
   const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
+
   if (arithmetic_parameter->broadcasting_) {
     element_flag_ = false;
-    kernel_name = "BoardcastArith";
+    kernel_name = "Broadcast";
   } else {
-    element_flag_ = true;
-    switch (op_parameter_->type_) {
-      case PrimitiveType_Mul:
-        kernel_name = "ElementMul";
-        break;
-      case PrimitiveType_Add:
-        kernel_name = "ElementAdd";
-        break;
-      case PrimitiveType_Sub:
-        kernel_name = "ElementSub";
-        break;
-      case PrimitiveType_Div:
-        kernel_name = "ElementDiv";
-        break;
-      default:
-        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-        break;
-    }
+    kernel_name = "Element";
+  }
+
+  switch (op_parameter_->type_) {
+    case PrimitiveType_Mul:
+      kernel_name += "Mul";
+      break;
+    case PrimitiveType_Add:
+      kernel_name += "Add";
+      break;
+    case PrimitiveType_Sub:
+      kernel_name += "Sub";
+      break;
+    case PrimitiveType_Div:
+      kernel_name += "Div";
+      break;
+    case PrimitiveType_LogicalAnd:
+      kernel_name += "And";
+      break;
+    case PrimitiveType_LogicalOr:
+      kernel_name += "Or";
+      break;
+    case PrimitiveType_Maximum:
+      kernel_name += "Max";
+      break;
+    case PrimitiveType_Minimum:
+      kernel_name += "Min";
+      break;
+    case PrimitiveType_FloorDiv:
+      kernel_name += "FloorDiv";
+      break;
+    case PrimitiveType_FloorMod:
+      kernel_name += "FloorMod";
+      break;
+    case PrimitiveType_SquaredDifference:
+      kernel_name += "SquaredDifference";
+      break;
+    case PrimitiveType_Equal:
+      kernel_name += "Equal";
+      break;
+    case PrimitiveType_NotEqual:
+      kernel_name += "NotEqual";
+      break;
+    case PrimitiveType_Less:
+      kernel_name += "Less";
+      break;
+    case PrimitiveType_LessEqual:
+      kernel_name += "LessEqual";
+      break;
+    case PrimitiveType_Greater:
+      kernel_name += "Greater";
+      break;
+    case PrimitiveType_GreaterEqual:
+      kernel_name += "GreaterEqual";
+      break;
+    default:
+      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
+      return RET_ERROR;
   }
 
   lite::STATUS error_code = RET_OK;
@@ -265,26 +307,8 @@ int ArithmeticOpenCLKernel::Run() {
     void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->MutableData() : weight_ptr_;
     runtime_->SetKernelArg(kernel_, arg_idx++, weight);
   } else {
-    float value = static_cast<float *>(in_tensors_[1]->MutableData())[0];
-    switch (op_parameter_->type_) {
-      case PrimitiveType_Mul:
-        weight_ = value;
-        break;
-      case PrimitiveType_Add:
-        bias_ = value;
-        break;
-      case PrimitiveType_Sub:
-        bias_ = -1 * value;
-        break;
-      case PrimitiveType_Div:
-        weight_ = 1 / value;
-        break;
-      default:
-        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-        break;
-    }
-    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
-    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    float weight = static_cast<float *>(in_tensors_[1]->MutableData())[0];
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
   }
   runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
 
@@ -345,4 +369,36 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCr
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
index 0555212e32..1d8fd987d7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -44,8 +44,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_;
   lite::opencl::OpenCLRuntime *runtime_;
   bool element_flag_{true};
-  float weight_{1.f};
-  float bias_{.0f};
   void *weight_ptr_{nullptr};
 
   std::vector<size_t> local_size_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index 0f4945d352..bd8724871c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -152,7 +152,7 @@ int ScaleOpenCLKernel::InitBuffer() {
             delete[] scale;
             return RET_ERROR;
           }
-          std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+          std::function<float(float)> to_dtype = [](float x) -> float { return x; };
           PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
           PackNHWCToNC4HW4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
           scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -160,20 +160,20 @@ int ScaleOpenCLKernel::InitBuffer() {
           delete[] scale;
           delete[] offset;
         } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-          int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
+          float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
           if (scale == nullptr) {
             MS_LOG(ERROR) << "Malloc buffer failed!";
             return RET_ERROR;
           }
-          int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
+          float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
           if (offset == nullptr) {
             MS_LOG(ERROR) << "Malloc buffer failed!";
             delete[] scale;
             return RET_ERROR;
           }
-          std::function<int16_t(float)> to_dtype = Float32ToShort;
-          PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
-          PackNHWCToNC4HW4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
+          std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+          PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
+          PackNHWCToNC4HW4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
           scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
           offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
           delete[] scale;
@@ -202,7 +202,7 @@ int ScaleOpenCLKernel::InitBuffer() {
             delete[] scale;
             return RET_ERROR;
           }
-          std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+          std::function<float(float)> to_dtype = [](float x) -> float { return x; };
           PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
           PackNHWCToNHWC4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
           scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -210,20 +210,20 @@ int ScaleOpenCLKernel::InitBuffer() {
           delete[] scale;
           delete[] offset;
         } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-          int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
+          float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
           if (scale == nullptr) {
             MS_LOG(ERROR) << "Malloc buffer failed!";
             return RET_ERROR;
           }
-          int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
+          float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
           if (offset == nullptr) {
             MS_LOG(ERROR) << "Malloc buffer failed!";
             delete[] scale;
             return RET_ERROR;
           }
-          std::function<int16_t(float)> to_dtype = Float32ToShort;
-          PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
-          PackNHWCToNHWC4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
+          std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+          PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
+          PackNHWCToNHWC4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
           scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
           offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
           delete[] scale;
@@ -328,8 +328,8 @@ int ScaleOpenCLKernel::Run() {
         ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
         ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
       } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
-        int16_t scale = static_cast<int16_t *>(in_tensors_[1]->MutableData())[0];
-        int16_t offset = static_cast<int16_t *>(in_tensors_[2]->MutableData())[0];
+        float16_t scale = static_cast<float16_t *>(in_tensors_[1]->MutableData())[0];
+        float16_t offset = static_cast<float16_t *>(in_tensors_[2]->MutableData())[0];
         ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
         ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
       } else {
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
index 3ebce581fa..2f13dad701 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@@ -300,12 +300,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
   if (fp16_enable_) {
     // fp16 enable, kernel will use half and read_imageh and write_imageh.
     build_options_str =
-      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
+      "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 "
       "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half  -DTO_FLT4=convert_half4 ";
   } else {
     // fp16 not enable, kernel will use float and read_imagef and write_imagef.
     build_options_str =
-      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
+      "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 "
       "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float  -DTO_FLT4=convert_float4 ";
   }