diff --git a/mindspore/lite/nnacl/adder.h b/mindspore/lite/nnacl/adder.h index 1399510888..00c92796f3 100644 --- a/mindspore/lite/nnacl/adder.h +++ b/mindspore/lite/nnacl/adder.h @@ -16,9 +16,7 @@ #ifndef MINDSPORE_LITE_NNACL_ADDER_H_ #define MINDSPORE_LITE_NNACL_ADDER_H_ -#include #include "nnacl/op_base.h" -#include "nnacl/quantization/fixed_point.h" typedef struct AdderParameter { OpParameter op_parameter_; diff --git a/mindspore/lite/nnacl/flatten.c b/mindspore/lite/nnacl/arithmetic.c similarity index 50% rename from mindspore/lite/nnacl/flatten.c rename to mindspore/lite/nnacl/arithmetic.c index 4f9936741a..c595a37637 100644 --- a/mindspore/lite/nnacl/flatten.c +++ b/mindspore/lite/nnacl/arithmetic.c @@ -13,9 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "nnacl/flatten.h" -#include <string.h> -void Flatten(const void *input, void *output, const FlattenParameter *flatten_param) { - memcpy(output, input, flatten_param->size); +#include "nnacl/arithmetic.h" + +void CalcMultiplesAndStrides(ArithmeticParameter *param) { + for (size_t i = 0; i < param->ndim_; i++) { + NNACL_ASSERT(param->in_shape0_[i] != 0); + NNACL_ASSERT(param->in_shape1_[i] != 0); + param->multiples0_[i] = param->out_shape_[i] / param->in_shape0_[i]; + param->multiples1_[i] = param->out_shape_[i] / param->in_shape1_[i]; + } + // calculate strides + ComputeStrides(param->in_shape0_, param->in_strides0_, param->ndim_); + ComputeStrides(param->in_shape1_, param->in_strides1_, param->ndim_); + ComputeStrides(param->out_shape_, param->out_strides_, param->ndim_); } diff --git a/mindspore/lite/nnacl/flatten.h b/mindspore/lite/nnacl/arithmetic.h similarity index 53% rename from mindspore/lite/nnacl/flatten.h rename to mindspore/lite/nnacl/arithmetic.h index 8409b8dd13..5b6babee17 100644 --- a/mindspore/lite/nnacl/flatten.h +++ b/mindspore/lite/nnacl/arithmetic.h @@ -13,23 +13,41 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -#ifndef MINDSPORE_LITE_NNACL_FLATTEN_H_ -#define MINDSPORE_LITE_NNACL_FLATTEN_H_ + +#ifndef MINDSPORE_LITE_NNACL_ARITHMETIC_H_ +#define MINDSPORE_LITE_NNACL_ARITHMETIC_H_ + #include "nnacl/op_base.h" +#include "nnacl/common_func.h" +#include "nnacl/nnacl_utils.h" -typedef struct FlattenParameter { - // Primitive parameter +typedef struct ArithmeticParameter { OpParameter op_parameter_; - // other parameter - int size; -} FlattenParameter; + bool broadcasting_; + size_t ndim_; + int activation_type_; + int in_shape0_[10]; + int in_elements_num0_; + int in_shape1_[10]; + int in_elements_num1_; + + int out_shape_[10]; + int out_elements_num_; + + int in_strides0_[10]; + int in_strides1_[10]; + int out_strides_[10]; + + int multiples0_[10]; + int multiples1_[10]; +} ArithmeticParameter; #ifdef __cplusplus extern "C" { #endif -void Flatten(const void *input, void *output, const FlattenParameter *flatten_param); +void CalcMultiplesAndStrides(ArithmeticParameter *param); #ifdef __cplusplus } #endif -#endif // MINDSPORE_LITE_NNACL_FLATTEN_H_ +#endif // MINDSPORE_LITE_NNACL_ARITHMETIC_H_ diff --git a/mindspore/lite/nnacl/arithmetic_common.c b/mindspore/lite/nnacl/arithmetic_common.c deleted file mode 100644 index 47ff029e8e..0000000000 --- a/mindspore/lite/nnacl/arithmetic_common.c +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "nnacl/arithmetic_common.h" -#include "nnacl/nnacl_utils.h" - -void TileOneDimension(const float *inData, float *outData, int dim, size_t ndim, const int *inShape, - const int *inStrides, const int *outStrides, const int *multiple) { - int srcDimSize = inShape[dim]; - if (dim == ndim - 1) { - for (int i = 0; i < multiple[dim]; i++) { - memcpy(outData, inData, srcDimSize * sizeof(float)); - outData += srcDimSize; - } - return; - } - for (size_t i = 0; i < srcDimSize; i++) { - for (size_t j = 0; j < multiple[dim]; j++) { - TileOneDimension(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim, - inShape, inStrides, outStrides, multiple); - } - } -} - -void TileOneDimensionUint8(const uint8_t *inData, uint8_t *outData, int dim, size_t ndim, const int *inShape, - const int *inStrides, const int *outStrides, const int *multiple) { - int srcDimSize = inShape[dim]; - if (dim == ndim - 1) { - for (int i = 0; i < multiple[dim]; i++) { - memcpy(outData, inData, srcDimSize * sizeof(uint8_t)); - outData += srcDimSize; - } - return; - } - for (size_t i = 0; i < srcDimSize; i++) { - for (size_t j = 0; j < multiple[dim]; j++) { - TileOneDimensionUint8(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, - ndim, inShape, inStrides, outStrides, multiple); - } - } -} - -void ComputeStrides(const int *shape, int *strides, const int ndim) { - int stride = 1; - for (int i = ndim - 1; i >= 0; i--) { - strides[i] = stride; - stride *= shape[i]; - } -} - -void CalcMultiplesAndStrides(ArithmeticParameter *param) { - NNACL_ASSERT(param->in_shape0_[i] != 0); - NNACL_ASSERT(param->in_shape1_[i] != 0); - for (size_t i = 0; i < param->ndim_; i++) { - param->multiples0_[i] = param->out_shape_[i] / param->in_shape0_[i]; - param->multiples1_[i] = param->out_shape_[i] / param->in_shape1_[i]; - } - // cal strides - ComputeStrides(param->in_shape0_, param->in_strides0_, param->ndim_); - ComputeStrides(param->in_shape1_, param->in_strides1_, param->ndim_); - ComputeStrides(param->out_shape_, param->out_strides_, param->ndim_); -} - -void TileDimensions(const float *data0, const float *data1, float *tile_data0, float *tile_data1, - ArithmeticParameter *param) { - CalcMultiplesAndStrides(param); - TileOneDimension(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_, param->out_strides_, - param->multiples0_); - TileOneDimension(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_, param->out_strides_, - param->multiples1_); -} - -void TileDimensionsUint8(const uint8_t *data0, const uint8_t *data1, uint8_t *tile_data0, uint8_t *tile_data1, - ArithmeticParameter *param) { - CalcMultiplesAndStrides(param); - TileOneDimensionUint8(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_, param->out_strides_, - param->multiples0_); - TileOneDimensionUint8(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_, param->out_strides_, - param->multiples1_); -} - -void TileDimensionsInt8(const int8_t *data0, const int8_t *data1, int8_t *tile_data0, int8_t *tile_data1, - ArithmeticParameter *param) { - CalcMultiplesAndStrides(param); - TileOneDimensionUint8((uint8_t *)(data0), (uint8_t *)(tile_data0), 0, param->ndim_, param->in_shape0_, - param->in_strides0_, param->out_strides_, param->multiples0_); - TileOneDimensionUint8((uint8_t *)(data1), (uint8_t *)(tile_data1), 0, param->ndim_, param->in_shape1_, - param->in_strides1_, param->out_strides_, param->multiples1_); 
-} diff --git a/mindspore/lite/nnacl/arithmetic_common.h b/mindspore/lite/nnacl/arithmetic_common.h deleted file mode 100644 index d9e08ad46e..0000000000 --- a/mindspore/lite/nnacl/arithmetic_common.h +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MINDSPORE_LITE_NNACL_ARITHMETIC_COMMON_H_ -#define MINDSPORE_LITE_NNACL_ARITHMETIC_COMMON_H_ - -#ifdef ENABLE_NEON -#include <arm_neon.h> -#endif -#include <string.h> -#include "nnacl/op_base.h" -#include "nnacl/arithmetic_common.h" - -typedef struct ArithmeticParameter { - OpParameter op_parameter_; - bool broadcasting_; - size_t ndim_; - int activation_type_; - int in_shape0_[10]; - int in_elements_num0_; - int in_shape1_[10]; - int in_elements_num1_; - - int out_shape_[10]; - int out_elements_num_; - - int in_strides0_[10]; - int in_strides1_[10]; - int out_strides_[10]; - - int multiples0_[10]; - int multiples1_[10]; -} ArithmeticParameter; - -#ifdef __cplusplus -extern "C" { -#endif -void TileOneDimension(const float *inData, float *outData, int dim, size_t ndim, const int *inShape, - const int *inStrides, const int *outStrides, const int *multiple); -void ComputeStrides(const int *shape, int *strides, const int ndim); - -void CalcMultiplesAndStrides(ArithmeticParameter *param); - -void TileOneDimensionUint8(const uint8_t *inData, uint8_t *outData, int dim, size_t ndim, const int *inShape, - const int *inStrides, const int *outStrides, const int *multiple); -void TileDimensions(const float *data0, const float *data1, float *tile_data0, float *tile_data1, - ArithmeticParameter *param); -void TileDimensionsUint8(const uint8_t *data0, const uint8_t *data1, uint8_t *tile_data0, uint8_t *tile_data1, - ArithmeticParameter *param); -void TileDimensionsInt8(const int8_t *data0, const int8_t *data1, int8_t *tile_data0, int8_t *tile_data1, - ArithmeticParameter *param); -#ifdef __cplusplus -} -#endif - -#endif // MINDSPORE_LITE_NNACL_ARITHMETIC_COMMON_H_ diff --git a/mindspore/lite/nnacl/arithmetic_parameter.h b/mindspore/lite/nnacl/arithmetic_parameter.h deleted file mode 100644 index d7d2b73a20..0000000000 --- a/mindspore/lite/nnacl/arithmetic_parameter.h +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#ifndef MINDSPORE_LITE_NNACL_ARTITHMETIC_PARAMETER_H_ -#define MINDSPORE_LITE_NNACL_ARTITHMETIC_PARAMETER_H_ - -#include "nnacl/op_attribute.h" - -#endif // MINDSPORE_LITE_NNACL_ARTITHMETIC_PARAMETER_H_ diff --git a/mindspore/lite/nnacl/batch_to_space.c b/mindspore/lite/nnacl/batch_to_space.c index 94bb4875fb..16ebbaf5ee 100644 --- a/mindspore/lite/nnacl/batch_to_space.c +++ b/mindspore/lite/nnacl/batch_to_space.c @@ -15,7 +15,6 @@ */ #include "nnacl/batch_to_space.h" -#include "nnacl/arithmetic_common.h" void BatchToSpaceNoCropForNHWC(const void *input, void *output, const int *in_shape, int out_n, const int *block, int data_size) { diff --git a/mindspore/lite/nnacl/batch_to_space.h b/mindspore/lite/nnacl/batch_to_space.h index 43b15bacac..2098aad52a 100644 --- a/mindspore/lite/nnacl/batch_to_space.h +++ b/mindspore/lite/nnacl/batch_to_space.h @@ -15,6 +15,8 @@ */ #ifndef MINDSPORE_LITE_NNACL_BATCH_TO_SPACE_H_ #define MINDSPORE_LITE_NNACL_BATCH_TO_SPACE_H_ + +#include <string.h> #include "nnacl/op_base.h" #define BATCH_TO_SPACE_BLOCK_SHAPE_SIZE 2 diff --git a/mindspore/lite/nnacl/common_func.h b/mindspore/lite/nnacl/common_func.h index 2173d11fbd..1e6dc30d27 100644 --- a/mindspore/lite/nnacl/common_func.h +++ b/mindspore/lite/nnacl/common_func.h @@ -63,6 +63,14 @@ static inline int GetStride(int *strides, const int *shape, int length) { return stride; } +inline void ComputeStrides(const int *shape, int *strides, const int ndim) { + int stride = 1; + for (int i = ndim - 1; i >= 0; i--) { + strides[i] = stride; + stride *= shape[i]; + } +} + #ifdef ENABLE_ARM64 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); diff --git a/mindspore/lite/nnacl/fp16/arithmetic_fp16.c b/mindspore/lite/nnacl/fp16/arithmetic_fp16.c index c707474d5c..bb77985601 100644 --- a/mindspore/lite/nnacl/fp16/arithmetic_fp16.c +++ b/mindspore/lite/nnacl/fp16/arithmetic_fp16.c @@ -16,7 +16,7 @@ #include "nnacl/fp16/arithmetic_fp16.h" #include -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" #include "nnacl/nnacl_utils.h" void TileOneDimensionFp16(float16_t *inData, float16_t *outData, int dim, size_t ndim, int *inShape, int *inStrides, diff --git a/mindspore/lite/nnacl/fp16/arithmetic_fp16.h b/mindspore/lite/nnacl/fp16/arithmetic_fp16.h index f27b9d25b5..34a7ce96da 100644 --- a/mindspore/lite/nnacl/fp16/arithmetic_fp16.h +++ b/mindspore/lite/nnacl/fp16/arithmetic_fp16.h @@ -20,7 +20,7 @@ #include <arm_neon.h> #endif #include "nnacl/op_base.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "nnacl/errorcode.h" #ifdef __cplusplus @@ -107,7 +107,7 @@ int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int ElementNotEqualFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); int ElementEqualFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); int ElementLessFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); -int ElementLessEqual(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); +int ElementLessEqualFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); int ElementGreaterFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, uint8_t *output, int element_size); diff --git a/mindspore/lite/nnacl/fp16/stack_fp16.c
b/mindspore/lite/nnacl/fp16/stack_fp16.c index 122657d559..4172053f91 100644 --- a/mindspore/lite/nnacl/fp16/stack_fp16.c +++ b/mindspore/lite/nnacl/fp16/stack_fp16.c @@ -15,7 +15,7 @@ */ #include "nnacl/fp16/stack_fp16.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" size_t Fp16GetStackCopyNum(int axis, int *in_shape, size_t shape_size) { size_t one_input_size = 1; diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c index 1a9869269e..56e4ee5c13 100644 --- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c +++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c @@ -20,1230 +20,995 @@ #define ACCURACY_DATA 0.00000001 -int ElementOptMul(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmulq_f32(vin0_opt, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] * input1[index]; + for (; index < size; index++) { + out[index] = in0[0] * in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vmulq_f32(vin0, vin1_opt); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] * input1[0]; + for (; index < size; index++) { + out[index] = in0[index] * in1[0]; } } return NNACL_OK; } -int ElementOptMulRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[0] * input1[index], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[0] * in1[index], 0); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t 
vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[index] * input1[0], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[index] * in1[0], 0); } } return NNACL_OK; } -int ElementOptMulRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[0] * input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] * input1[0], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); } } return NNACL_OK; } -int ElementOptMulInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(input0[0]); - int32x4_t vin1_opt = vdupq_n_s32(input1[0]); + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vmulq_s32(vin0_opt, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] * input1[index]; + for (; index < size; index++) { + out[index] = in0[0] * in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); int32x4_t vout = vmulq_s32(vin0, vin1_opt); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] * input1[0]; + for (; index < size; index++) 
{ + out[index] = in0[index] * in1[0]; } } return NNACL_OK; } -int ElementOptMulReluInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(input0[0]); - int32x4_t vin1_opt = vdupq_n_s32(input1[0]); + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); int32x4_t zeros = vdupq_n_s32(0); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[0] * input1[index], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[0] * in1[index], 0); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[index] * input1[0], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[index] * in1[0], 0); } } return NNACL_OK; } -int ElementOptMulRelu6Int(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(input0[0]); - int32x4_t vin1_opt = vdupq_n_s32(input1[0]); + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); int32x4_t zeros = vdupq_n_s32(0); int32x4_t bounds = vdupq_n_s32(6); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[0] * input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] * input1[0], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); } } return NNACL_OK; } -int ElementOptSub(const float *input0, const float *input1, float *output, const int element_size, - const 
ArithmeticParameter *param) { +int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vsubq_f32(vin0_opt, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] - input1[index]; + for (; index < size; index++) { + out[index] = in0[0] - in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vsubq_f32(vin0, vin1_opt); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] - input1[0]; + for (; index < size; index++) { + out[index] = in0[index] - in1[0]; } } return NNACL_OK; } -int ElementOptSubInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(input0[0]); - int32x4_t vin1_opt = vdupq_n_s32(input1[0]); + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vsubq_s32(vin0_opt, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] - input1[index]; + for (; index < size; index++) { + out[index] = in0[0] - in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); int32x4_t vout = vsubq_s32(vin0, vin1_opt); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] - input1[0]; + for (; index < size; index++) { + out[index] = in0[index] - in1[0]; } } return NNACL_OK; } -int ElementOptSubRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 
4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[0] - input1[index], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[0] - in1[index], 0); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[index] - input1[0], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[index] - in1[0], 0); } } return NNACL_OK; } -int ElementOptSubRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[0] - input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] - input1[0], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6); } } return NNACL_OK; } -int ElementOptAdd(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); 
float32x4_t vout = vaddq_f32(vin0_opt, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] + input1[index]; + for (; index < size; index++) { + out[index] = in0[0] + in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vaddq_f32(vin0, vin1_opt); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] + input1[0]; + for (; index < size; index++) { + out[index] = in0[index] + in1[0]; } } return NNACL_OK; } -int ElementOptAddInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(input0[0]); - int32x4_t vin1_opt = vdupq_n_s32(input1[0]); + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vaddq_s32(vin0_opt, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[0] + input1[index]; + for (; index < size; index++) { + out[index] = in0[0] + in1[index]; } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); int32x4_t vout = vaddq_s32(vin0, vin1_opt); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] + input1[0]; + for (; index < size; index++) { + out[index] = in0[index] + in1[0]; } } return NNACL_OK; } -int ElementOptAddRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[0] + input1[index], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[0] + in1[index], 0); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index 
<= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMAX(input0[index] + input1[0], 0); + for (; index < size; index++) { + out[index] = MSMAX(in0[index] + in1[0], 0); } } return NNACL_OK; } -int ElementOptAddRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { #ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(input0[0]); - float32x4_t vin1_opt = vdupq_n_f32(input1[0]); + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); #endif int index = 0; if (param->in_elements_num0_ == 1) { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[0] + input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6); } } else { #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] + input1[0], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6); } } return NNACL_OK; } -int ElementOptDiv(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = input0[0] / input1[index]; + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; } } else { - if (input1[0] == 0) { + if (in1[0] == 0) { return NNACL_ERRCODE_DIVISOR_ZERO; } - for (int index = 0; index < element_size; index++) { - output[index] = input0[index] / input1[0]; + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; } } return NNACL_OK; } -int ElementOptDivRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = input0[0] / input1[index]; - output[index] = output[index] > 0 ? output[index] : 0; + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; + out[index] = out[index] > 0 ? 
out[index] : 0; } } else { - for (int index = 0; index < element_size; index++) { - output[index] = input0[index] / input1[0]; - output[index] = output[index] > 0 ? output[index] : 0; + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; + out[index] = out[index] > 0 ? out[index] : 0; } } return NNACL_OK; } -int ElementOptDivRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[0] / input1[index], 0), 6); + for (int index = 0; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6); } } else { - for (int index = 0; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] / input1[0], 0), 6); + for (int index = 0; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6); } } return NNACL_OK; } -int ElementOptDivInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = input0[0] / input1[index]; + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; } } else { - if (input1[0] == 0) { + if (in1[0] == 0) { return NNACL_ERRCODE_DIVISOR_ZERO; } - for (int index = 0; index < element_size; index++) { - output[index] = input0[index] / input1[0]; + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; } } return NNACL_OK; } -int ElementMul(const float *input0, const float *input1, float *output, const int element_size) { +int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementAdd(tile_in0, tile_in1, out, size); +} + +int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementMul(tile_in0, tile_in1, out, size); +} + +int ElementMul(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmulq_f32(vin0, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] * input1[index]; + for (; index < size; index++) { + out[index] = in0[index] * in1[index]; } return NNACL_OK; } -int ElementMulRelu(const float *input0, const float *input1, float *output, const int element_size) { +int ElementMulRelu(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - 
float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmulq_f32(vin0, vin1); vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - float res = input0[index] * input1[index]; - output[index] = res > 0 ? res : 0; + for (; index < size; index++) { + float res = in0[index] * in1[index]; + out[index] = res > 0 ? res : 0; } return NNACL_OK; } -int ElementMulRelu6(const float *input0, const float *input1, float *output, const int element_size) { +int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); } return NNACL_OK; } -int ElementMulInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementMulInt(const int *in0, const int *in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vmulq_s32(vin0, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] * input1[index]; + for (; index < size; index++) { + out[index] = in0[index] * in1[index]; } return NNACL_OK; } -int ElementMulReluInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON int32x4_t zeros = vdupq_n_s32(0); - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vmulq_s32(vin0, vin1); vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - float res = input0[index] * input1[index]; - output[index] = res > 0 ? res : 0; + for (; index < size; index++) { + int res = in0[index] * in1[index]; + out[index] = res > 0 ?
res : 0; } return NNACL_OK; } -int ElementMulRelu6Int(const int *input0, const int *input1, int *output, const int element_size) { +int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON int32x4_t zeros = vdupq_n_s32(0); int32x4_t bounds = vdupq_n_s32(6); - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); } return NNACL_OK; } -int BroadcastMul(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementMul(tile_input0, tile_input1, output, element_size); -} - -int ElementAdd(const float *input0, const float *input1, float *output, const int element_size) { +int ElementAdd(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vaddq_f32(vin0, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] + input1[index]; + for (; index < size; index++) { + out[index] = in0[index] + in1[index]; } return NNACL_OK; } -int ElementAddRelu(const float *input0, const float *input1, float *output, const int element_size) { +int ElementAddRelu(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vaddq_f32(vin0, vin1); vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - float res = input0[index] + input1[index]; - output[index] = res > 0 ? res : 0; + for (; index < size; index++) { + float res = in0[index] + in1[index]; + out[index] = res > 0 ? 
res : 0; } return NNACL_OK; } -int ElementAddRelu6(const float *input0, const float *input1, float *output, const int element_size) { +int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] + input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6); } return NNACL_OK; } -int ElementAddInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementAddInt(const int *in0, const int *in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vaddq_s32(vin0, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] + input1[index]; + for (; index < size; index++) { + out[index] = in0[index] + in1[index]; } return NNACL_OK; } -int ElementAddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = input0[i] + input1[i]; - } - return NNACL_OK; -} - -int BroadcastAdd(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementAdd(tile_input0, tile_input1, output, element_size); -} - -int BroadcastAddInt8(const int8_t *input0, const int8_t *input1, int8_t *tile_input0, int8_t *tile_input1, - int8_t *output, int element_size, ArithmeticParameter *param) { - TileDimensionsInt8(input0, input1, tile_input0, tile_input1, param); - return ElementAddInt8(tile_input0, tile_input1, output, element_size); -} - -int ElementSub(const float *input0, const float *input1, float *output, const int element_size) { +int ElementSub(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vsubq_f32(vin0, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] - input1[index]; + for (; index < size; index++) { + out[index] = in0[index] - in1[index]; } return NNACL_OK; } -int ElementSubInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementSubInt(const int *in0, const int 
*in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vsubq_s32(vin0, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] - input1[index]; + for (; index < size; index++) { + out[index] = in0[index] - in1[index]; } return NNACL_OK; } -int ElementSubRelu(const float *input0, const float *input1, float *output, const int element_size) { +int ElementSubRelu(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vsubq_f32(vin0, vin1); vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - float res = input0[index] - input1[index]; - output[index] = res > 0 ? res : 0; + for (; index < size; index++) { + float res = in0[index] - in1[index]; + out[index] = res > 0 ? res : 0; } return NNACL_OK; } -int ElementSubRelu6(const float *input0, const float *input1, float *output, const int element_size) { +int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t zeros = vdupq_n_f32(0.0f); float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6); + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6); } return NNACL_OK; } -int BroadcastSub(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementSub(tile_input0, tile_input1, output, element_size); +int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementDiv(tile_in0, tile_in1, out, size); } -int ElementDiv(const float *input0, const float *input1, float *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = input0[i] / input1[i]; +int ElementDiv(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] / in1[i]; } return NNACL_OK; } -int ElementDivRelu(const float *input0, const float *input1, float *output, const
int element_size) { - for (int i = 0; i < element_size; i++) { - float res = input0[i] / input1[i]; - output[i] = res > 0 ? res : 0; +int ElementDivRelu(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + float res = in0[i] / in1[i]; + out[i] = res > 0 ? res : 0; } return NNACL_OK; } -int ElementDivRelu6(const float *input0, const float *input1, float *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = MSMIN(MSMAX(input0[i] / input1[i], 0), 6); +int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6); } return NNACL_OK; } -int BroadcastDiv(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementDiv(tile_input0, tile_input1, output, element_size); -} - -int ElementFloorMod(const float *input0, const float *input1, float *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i]; +int ElementFloorMod(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i]; } return NNACL_OK; } -int ElementFloorModInt(const int *input0, const int *input1, int *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = input0[i] - (input0[i] / input1[i]) * input1[i]; +int ElementFloorModInt(const int *in0, const int *in1, int *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] - (in0[i] / in1[i]) * in1[i]; } return NNACL_OK; } -int BroadcastFloorMod(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementFloorMod(tile_input0, tile_input1, output, element_size); -} - -int ElementMod(const float *input0, const float *input1, float *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = fmod(input0[i], input1[i]); +int ElementMod(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = fmod(in0[i], in1[i]); } return NNACL_OK; } -int ElementModInt(const int *input0, const int *input1, int *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = fmod(input0[i], input1[i]); +int ElementModInt(const int *in0, const int *in1, int *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = fmod(in0[i], in1[i]); } return NNACL_OK; } -int ElementOptMod(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptMod(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = fmod(input0[0], input1[index]); + for (int index = 0; index < size; index++) { + out[index] = fmod(in0[0], in1[index]); } } else { - for (int index = 0; index < element_size; index++) { - output[index] = fmod(input0[index], input1[0]); + for (int index = 0; index < size; index++) { + out[index] = fmod(in0[index], in1[0]); } } return NNACL_OK; } -int 
ElementOptModInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param) { +int ElementOptModInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { if (param->in_elements_num0_ == 1) { - for (int index = 0; index < element_size; index++) { - output[index] = fmod(input0[0], input1[index]); + for (int index = 0; index < size; index++) { + out[index] = in0[0] % in1[index]; } } else { - for (int index = 0; index < element_size; index++) { - output[index] = fmod(input0[index], input1[0]); + for (int index = 0; index < size; index++) { + out[index] = in0[index] % in1[0]; } } return NNACL_OK; } -int ElementFloorDiv(const float *input0, const float *input1, float *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = floorf(input0[i] / input1[i]); +int ElementFloorDiv(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = floorf(in0[i] / in1[i]); } return NNACL_OK; } -int ElementFloorDivInt(const int *input0, const int *input1, int *output, const int element_size) { - for (int i = 0; i < element_size; i++) { - output[i] = input0[i] / input1[i]; +int ElementFloorDivInt(const int *in0, const int *in1, int *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] / in1[i]; } return NNACL_OK; } -int BroadcastFloorDiv(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementFloorDiv(tile_input0, tile_input1, output, element_size); -} - -int ElementLogicalAnd(const float *input0, const float *input1, float *output, const int element_size) { +int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t vtrue = vdupq_n_f32(1); float32x4_t vfalse = vdupq_n_f32(0); uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1)); uint32x4_t zeros = vdupq_n_u32(0); - for (; index <= element_size - 4; index += C4NUM) { - uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0 + index)), mask); - uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1 + index)), mask); + for (; index <= size - 4; index += C4NUM) { + uint32x4_t vin0 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in0 + index)), mask); + uint32x4_t vin1 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in1 + index)), mask); float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = (float)((bool)(input0[index]) & (bool)(input1[index])); + for (; index < size; index++) { + out[index] = (float)((bool)(in0[index]) & (bool)(in1[index])); } return NNACL_OK; } -int ElementLogicalAndInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size) { int index = 0; - for (; index < element_size; index++) { - output[index] = (int)((int)(input0[index]) & (int)(input1[index])); + for (; index < size; index++) { + out[index] = (int)((int)(in0[index]) & (int)(in1[index])); } return NNACL_OK; } -int ElementLogicalAndBool(const bool *input0, const bool *input1, bool *output, const int element_size) { +int ElementLogicalAndBool(const bool *in0, const bool *in1, bool
*out, int size) { int index = 0; - for (; index < element_size; index++) { - output[index] = (bool)((bool)(input0[index]) & (bool)(input1[index])); + for (; index < size; index++) { + out[index] = (bool)((bool)(in0[index]) & (bool)(in1[index])); } return NNACL_OK; } -int ElementSquaredDifference(const float *input0, const float *input1, float *output, const int element_size) { - ElementSub(input0, input1, output, element_size); - return ElementMul(output, output, output, element_size); -} - -int BroadcastSquaredDifference(const float *input0, const float *input1, float *tile_input0, float *tile_input1, - float *output, int element_size, ArithmeticParameter *param) { - BroadcastSub(input0, input1, tile_input0, tile_input1, output, element_size, param); - return ElementMul(output, output, output, element_size); +int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) { + ElementSub(in0, in1, out, size); + return ElementMul(out, out, out, size); } -int BroadcastLogicalAnd(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementLogicalAnd(tile_input0, tile_input1, output, element_size); -} - -int ElementLogicalOr(const float *input0, const float *input1, float *output, const int element_size) { +int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON float32x4_t vtrue = vdupq_n_f32(1); float32x4_t vfalse = vdupq_n_f32(0); uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1)); uint32x4_t zeros = vdupq_n_u32(0); - for (; index <= element_size - 4; index += C4NUM) { - uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0 + index)), mask); - uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1 + index)), mask); + for (; index <= size - 4; index += C4NUM) { + uint32x4_t vin0 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in0 + index)), mask); + uint32x4_t vin1 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in1 + index)), mask); float32x4_t vout = vbslq_f32(vceqq_u32(vorrq_u32(vin0, vin1), zeros), vfalse, vtrue); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = (float)((bool)(input0[index]) | (bool)(input1[index])); + for (; index < size; index++) { + out[index] = (float)((bool)(in0[index]) | (bool)(in1[index])); } return NNACL_OK; } -int BroadcastLogicalOr(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementLogicalOr(tile_input0, tile_input1, output, element_size); -} - -int ElementMaximum(const float *input0, const float *input1, float *output, const int element_size) { +int ElementMaximum(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vmaxq_f32(vin0, vin1); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] >
input1[index] ? input0[index] : input1[index]; + for (; index < size; index++) { + out[index] = in0[index] > in1[index] ? in0[index] : in1[index]; } return NNACL_OK; } -int ElementMaximumInt(const int *input0, const int *input1, int *output, const int element_size) { +int ElementMaximumInt(const int *in0, const int *in1, int *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(input0 + index); - int32x4_t vin1 = vld1q_s32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); int32x4_t vout = vmaxq_s32(vin0, vin1); - vst1q_s32(output + index, vout); + vst1q_s32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = input0[index] > input1[index] ? input0[index] : input1[index]; + for (; index < size; index++) { + out[index] = in0[index] > in1[index] ? in0[index] : in1[index]; } return NNACL_OK; } -int BroadcastMaximum(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementMaximum(tile_input0, tile_input1, output, element_size); +int BroadcastMaximum(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementMaximum(tile_in0, tile_in1, out, size); } -int ElementMinimum(const float *input0, const float *input1, float *output, const int element_size) { +int ElementMinimum(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); float32x4_t vout = vminq_f32(vin0, vin1); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = input0[index] > input1[index] ? 
input1[index] : input0[index]; - } - return NNACL_OK; -} - -int BroadcastMinimum(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementMinimum(tile_input0, tile_input1, output, element_size); -} - -float FloatNotEqualCheck(float in0, float in1) { - float tmp = in0 - in1; - if (tmp <= ACCURACY_DATA && tmp >= -ACCURACY_DATA) { - return (float)false; - } - return (float)true; -} - -int ElementNotEqual(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vfalse, vtrue); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = (float)(fabsf(input0[index] - input1[index]) > FLT_EPSILON); - } - return NNACL_OK; -} - -int BroadcastNotEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementNotEqual(tile_input0, tile_input1, output, element_size); -} - -float FloatEqualCheck(float in0, float in1) { - float tmp = in0 - in1; - if (tmp <= ACCURACY_DATA && tmp >= -ACCURACY_DATA) { - return (float)true; - } - return (float)false; -} - -int ElementEqual(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vtrue, vfalse); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = (float)(fabsf(input0[index] - input1[index]) <= FLT_EPSILON); - } - return NNACL_OK; -} - -int BroadcastEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementEqual(tile_input0, tile_input1, output, element_size); -} - -int ElementLess(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vcltq_f32(vin0, vin1), vtrue, vfalse); - vst1q_f32(output + index, vout); + vst1q_f32(out + index, vout); } #endif - for (; index < element_size; index++) { - output[index] = (float)(input0[index] < input1[index]); + for (; index < size; index++) { + out[index] = in0[index] > in1[index] ? 
in1[index] ? in1[index] : in0[index]; } return NNACL_OK; } -int BroadcastLess(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementLess(tile_input0, tile_input1, output, element_size); -} - -int ElementLessEqual(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vcleq_f32(vin0, vin1), vtrue, vfalse); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = (float)(input0[index] <= input1[index]); - } - return NNACL_OK; -} - -int BroadcastLessEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementLessEqual(tile_input0, tile_input1, output, element_size); -} - -int ElementGreater(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vcgtq_f32(vin0, vin1), vtrue, vfalse); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = (float)(input0[index] > input1[index]); - } - return NNACL_OK; -} - -int BroadcastGreater(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementGreater(tile_input0, tile_input1, output, element_size); -} - -int ElementGreaterEqual(const float *input0, const float *input1, float *output, const int element_size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - for (; index <= element_size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(input0 + index); - float32x4_t vin1 = vld1q_f32(input1 + index); - float32x4_t vout = vbslq_f32(vcgeq_f32(vin0, vin1), vtrue, vfalse); - vst1q_f32(output + index, vout); - } -#endif - for (; index < element_size; index++) { - output[index] = (float)(input0[index] >= input1[index]); - } - return NNACL_OK; -} - -int BroadcastGreaterEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, - float *output, int element_size, ArithmeticParameter *param) { - TileDimensions(input0, input1, tile_input0, tile_input1, param); - return ElementGreaterEqual(tile_input0, tile_input1, output, element_size); -} - #undef ACCURACY_DATA #ifdef ENABLE_NNACL_INFER_SHAPE @@ -1304,3 +1069,30 @@ int ArithmeticInferShape(int **in_shape, size_t *dim_size, int *out_shape, int * return NNACL_OK; } #endif + +void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t ndim, const int *inShape, + const int *inStrides, const int *outStrides, const int *multiple) { + int srcDimSize = inShape[dim]; + if
(dim == ndim - 1) { + for (int i = 0; i < multiple[dim]; i++) { + memcpy(outData, inData, srcDimSize * sizeof(float)); + outData += srcDimSize; + } + return; + } + for (size_t i = 0; i < srcDimSize; i++) { + for (size_t j = 0; j < multiple[dim]; j++) { + TileOneDimensionFp32(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim, + inShape, inStrides, outStrides, multiple); + } + } +} + +void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1, + ArithmeticParameter *param) { + CalcMultiplesAndStrides(param); + TileOneDimensionFp32(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_, param->out_strides_, + param->multiples0_); + TileOneDimensionFp32(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_, param->out_strides_, + param->multiples1_); +} diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h index bf80d6a4d5..2d0496cfe5 100644 --- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h +++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h @@ -20,144 +20,97 @@ #include #endif #include "nnacl/op_base.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "nnacl/errorcode.h" #ifdef __cplusplus extern "C" { #endif -int ElementOptAdd(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptAddInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptAddRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptAddRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptSub(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptSubInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptSubRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptSubRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMul(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMulRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMulRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMulInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMulReluInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptMulRelu6Int(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptDiv(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptDivRelu(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int 
ElementOptDivRelu6(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptDivInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); -int ElementMul(const float *input0, const float *input1, float *output, const int element_size); -int ElementMulRelu(const float *input0, const float *input1, float *output, const int element_size); -int ElementMulRelu6(const float *input0, const float *input1, float *output, const int element_size); -int ElementMulInt(const int *input0, const int *input1, int *output, const int element_size); -int ElementMulReluInt(const int *input0, const int *input1, int *output, const int element_size); -int ElementMulRelu6Int(const int *input0, const int *input1, int *output, const int element_size); -int BroadcastMul(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementAdd(const float *input0, const float *input1, float *output, const int element_size); -int ElementAddRelu(const float *input0, const float *input1, float *output, const int element_size); -int ElementAddRelu6(const float *input0, const float *input1, float *output, const int element_size); -int ElementAddInt(const int *input0, const int *input1, int *output, const int element_size); -int BroadcastAdd(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); -int BroadcastAddInt8(const int8_t *input0, const int8_t *input1, int8_t *tile_input0, int8_t *tile_input1, - int8_t *output, int element_size, ArithmeticParameter *param); - -int ElementSub(const float *input0, const float *input1, float *output, const int element_size); -int ElementSubInt(const int *input0, const int *input1, int *output, const int element_size); -int ElementSubRelu(const float *input0, const float *input1, float *output, const int element_size); -int ElementSubRelu6(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastSub(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementDiv(const float *input0, const float *input1, float *output, const int element_size); -int ElementDivRelu(const float *input0, const float *input1, float *output, const int element_size); -int ElementDivRelu6(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastDiv(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementLogicalAnd(const float *input0, const float *input1, float *output, const int element_size); -int ElementLogicalAndInt(const int *input0, const int *input1, int *output, const int element_size); -int ElementLogicalAndBool(const bool *input0, const bool *input1, bool *output, const int element_size); -int BroadcastLogicalAnd(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementLogicalOr(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastLogicalOr(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter 
*param); - -int ElementMaximum(const float *input0, const float *input1, float *output, const int element_size); -int ElementMaximumInt(const int *input0, const int *input1, int *output, const int element_size); -int BroadcastMaximum(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementMinimum(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastMinimum(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementFloorDiv(const float *input0, const float *input1, float *output, const int element_size); -int ElementFloorDivInt(const int *input0, const int *input1, int *output, const int element_size); -int BroadcastFloorDiv(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementFloorMod(const float *input0, const float *input1, float *output, const int element_size); -int ElementFloorModInt(const int *input0, const int *input1, int *output, const int element_size); -int BroadcastFloorMod(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementMod(const float *input0, const float *input1, float *output, const int element_size); -int ElementModInt(const int *input0, const int *input1, int *output, const int element_size); -int ElementOptMod(const float *input0, const float *input1, float *output, const int element_size, - const ArithmeticParameter *param); -int ElementOptModInt(const int *input0, const int *input1, int *output, const int element_size, - const ArithmeticParameter *param); - -int ElementSquaredDifference(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastSquaredDifference(const float *input0, const float *input1, float *tile_input0, float *tile_input1, - float *output, int element_size, ArithmeticParameter *param); - -int ElementNotEqual(const float *input0, const float *input1, float *output, const int element_size); - -int BroadcastNotEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementEqual(const float *input0, const float *input1, float *output, const int element_size); - -int BroadcastEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementLess(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastLess(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementLessEqual(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastLessEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementGreater(const float *input0, const float *input1, float *output, const int element_size); -int BroadcastGreater(const float *input0, const float *input1, float *tile_input0, float *tile_input1, float *output, - int element_size, ArithmeticParameter *param); - -int ElementGreaterEqual(const 
float *input0, const float *input1, float *output, const int element_size); -int BroadcastGreaterEqual(const float *input0, const float *input1, float *tile_input0, float *tile_input1, - float *output, int element_size, ArithmeticParameter *param); +void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t ndim, const int *inShape, + const int *inStrides, const int *outStrides, const int *multiple); +void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1, + ArithmeticParameter *param); + +/* Mul */ +int ElementMul(const float *in0, const float *in1, float *out, int size); +int ElementMulRelu(const float *in0, const float *in1, float *out, int size); +int ElementMulRelu6(const float *in0, const float *in1, float *out, int size); +int ElementMulInt(const int *in0, const int *in1, int *out, int size); +int ElementMulReluInt(const int *in0, const int *in1, int *out, int size); +int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size); +int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +/* Add */ +int ElementAdd(const float *in0, const float *in1, float *out, int size); +int ElementAddRelu(const float *in0, const float *in1, float *out, int size); +int ElementAddRelu6(const float *in0, const float *in1, float *out, int size); +int ElementAddInt(const int *in0, const int *in1, int *out, int size); +int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +/* Sub */ +int ElementSub(const float *in0, const float *in1, float *out, int size); +int ElementSubInt(const int *in0, const int *in1, int *out, int size); +int ElementSubRelu(const float *in0, const float *in1, float *out, int size); +int ElementSubRelu6(const float *in0, const float *in1, float *out, int size); +int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); + +/* Div */ 
+int ElementDiv(const float *in0, const float *in1, float *out, int size); +int ElementDivRelu(const float *in0, const float *in1, float *out, int size); +int ElementDivRelu6(const float *in0, const float *in1, float *out, int size); +int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +/* logical and */ +int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size); +int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size); +int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size); + +/* logical or */ +int ElementLogicalOr(const float *in0, const float *in1, float *out, int size); + +/* Element Squared Difference */ +int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size); + +/* max min */ +int ElementMaximum(const float *in0, const float *in1, float *out, int size); +int ElementMinimum(const float *in0, const float *in1, float *out, int size); +int ElementMaximumInt(const int *in0, const int *in1, int *out, int size); +int BroadcastMaximum(const float *in0, const float *in1, float *tile_input0, float *tile_input1, float *out, int size, + ArithmeticParameter *param); + +/* floor div */ +int ElementFloorDiv(const float *in0, const float *in1, float *out, int size); +int ElementFloorDivInt(const int *in0, const int *in1, int *out, int size); + +/* floor mod */ +int ElementFloorMod(const float *in0, const float *in1, float *out, int size); +int ElementFloorModInt(const int *in0, const int *in1, int *out, int size); + +/* mod */ +int ElementMod(const float *in0, const float *in1, float *out, int size); +int ElementModInt(const int *in0, const int *in1, int *out, int size); +int ElementOptMod(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptModInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); #ifdef ENABLE_NNACL_INFER_SHAPE int ArithmeticInferShape(int **in_shape, size_t *dim_size, int *out_shape, int *in_format, int *out_format, diff --git a/mindspore/lite/nnacl/fp32/space_to_batch_fp32.c b/mindspore/lite/nnacl/fp32/space_to_batch_fp32.c index 3016b95eac..2f0077ba77 100644 --- a/mindspore/lite/nnacl/fp32/space_to_batch_fp32.c +++ b/mindspore/lite/nnacl/fp32/space_to_batch_fp32.c @@ -14,7 +14,6 @@ * limitations under the License. 
*/ #include "nnacl/fp32/space_to_batch_fp32.h" -#include "nnacl/arithmetic_common.h" void DoSpaceToBatch(const float *input, float *output, const int *in_shape, const int *out_shape, const int *in_stride, const int *out_stride, const int *blocks, const int *paddings, int thread, int task_id) { diff --git a/mindspore/lite/nnacl/fp32/space_to_batch_fp32.h b/mindspore/lite/nnacl/fp32/space_to_batch_fp32.h index d477ace457..84aa1e4007 100644 --- a/mindspore/lite/nnacl/fp32/space_to_batch_fp32.h +++ b/mindspore/lite/nnacl/fp32/space_to_batch_fp32.h @@ -15,6 +15,8 @@ */ #ifndef MINDSPORE_LITE_SRC_BACKEND_ARM_NNACL_FP32_SPACE_TO_BATCH_H_ #define MINDSPORE_LITE_SRC_BACKEND_ARM_NNACL_FP32_SPACE_TO_BATCH_H_ + +#include #include "nnacl/op_base.h" typedef struct SpaceToBatchParameter { diff --git a/mindspore/lite/nnacl/fp32/space_to_depth_fp32.c b/mindspore/lite/nnacl/fp32/space_to_depth_fp32.c index ceac8f7368..80f242ed8f 100644 --- a/mindspore/lite/nnacl/fp32/space_to_depth_fp32.c +++ b/mindspore/lite/nnacl/fp32/space_to_depth_fp32.c @@ -14,7 +14,7 @@ * limitations under the License. */ #include "nnacl/fp32/space_to_depth_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" #include "nnacl/errorcode.h" #include "nnacl/op_base.h" diff --git a/mindspore/lite/nnacl/fp32/stack_fp32.c b/mindspore/lite/nnacl/fp32/stack_fp32.c index b8ebad4b69..a35dec49c0 100644 --- a/mindspore/lite/nnacl/fp32/stack_fp32.c +++ b/mindspore/lite/nnacl/fp32/stack_fp32.c @@ -15,7 +15,7 @@ */ #include "nnacl/fp32/stack_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" size_t GetStackCopyNum(int axis, const int *in_shape, size_t shape_size) { size_t one_input_size = 1; diff --git a/mindspore/lite/nnacl/int8/add_int8.c b/mindspore/lite/nnacl/int8/add_int8.c index cd4ec4ce19..7b0c33067e 100644 --- a/mindspore/lite/nnacl/int8/add_int8.c +++ b/mindspore/lite/nnacl/int8/add_int8.c @@ -312,3 +312,16 @@ void AddOptInt8(const int8_t *ptr_in, const int8_t element_in, int8_t *output, i } return; } + +int ElementAddInt8(const int8_t *in0, const int8_t *in1, int8_t *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] + in1[i]; + } + return NNACL_OK; +} + +int BroadcastAddInt8(const int8_t *in0, const int8_t *in1, int8_t *tile_in0, int8_t *tile_in1, int8_t *out, int size, + ArithmeticParameter *param) { + TileDimensionsInt8(in0, in1, tile_in0, tile_in1, param); + return ElementAddInt8(tile_in0, tile_in1, out, size); +} diff --git a/mindspore/lite/nnacl/int8/add_int8.h b/mindspore/lite/nnacl/int8/add_int8.h index 15fdc03d7a..f5469b08b9 100644 --- a/mindspore/lite/nnacl/int8/add_int8.h +++ b/mindspore/lite/nnacl/int8/add_int8.h @@ -18,6 +18,9 @@ #define MINDSPORE_LITE_NNACL_ADD_INT8_H_ #include "nnacl/op_base.h" +#include "nnacl/errorcode.h" +#include "nnacl/arithmetic.h" +#include "nnacl/int8/arithmetic_int8.h" typedef struct AddQuantQrgs { int32_t zp_; @@ -48,6 +51,10 @@ void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int siz void AddOptInt8(const int8_t *ptr_in, const int8_t element_in, int8_t *output, int size, AddQuantParameter *params, AddQuantQrgs *ptr_args, AddQuantQrgs *ele_args); +int ElementAddInt8(const int8_t *in0, const int8_t *in1, int8_t *out, int size); +int BroadcastAddInt8(const int8_t *in0, const int8_t *in1, int8_t *tile_in0, int8_t *tile_in1, int8_t *out, int size, + ArithmeticParameter *param); + #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/int8/arithmetic_int8.c b/mindspore/lite/nnacl/int8/arithmetic_int8.c 
index 3685d61b84..e6e320f70f 100644 --- a/mindspore/lite/nnacl/int8/arithmetic_int8.c +++ b/mindspore/lite/nnacl/int8/arithmetic_int8.c @@ -20,6 +20,33 @@ #endif #include "nnacl/errorcode.h" +void TileOneDimensionInt8(const int8_t *inData, int8_t *outData, int dim, size_t ndim, const int *inShape, + const int *inStrides, const int *outStrides, const int *multiple) { + int srcDimSize = inShape[dim]; + if (dim == ndim - 1) { + for (int i = 0; i < multiple[dim]; i++) { + memcpy(outData, inData, srcDimSize * sizeof(int8_t)); + outData += srcDimSize; + } + return; + } + for (size_t i = 0; i < srcDimSize; i++) { + for (size_t j = 0; j < multiple[dim]; j++) { + TileOneDimensionInt8(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim, + inShape, inStrides, outStrides, multiple); + } + } +} + +void TileDimensionsInt8(const int8_t *data0, const int8_t *data1, int8_t *tile_data0, int8_t *tile_data1, + ArithmeticParameter *param) { + CalcMultiplesAndStrides(param); + TileOneDimensionInt8(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_, param->out_strides_, + param->multiples0_); + TileOneDimensionInt8(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_, param->out_strides_, + param->multiples1_); +} + #define ACCURACY_DATA 0.00000001 int ElementNotEqualInt8(int8_t *input0, int8_t *input1, uint8_t *output, int element_size, diff --git a/mindspore/lite/nnacl/int8/arithmetic_int8.h b/mindspore/lite/nnacl/int8/arithmetic_int8.h index 3c1cf6e5b6..98f3f27ac4 100644 --- a/mindspore/lite/nnacl/int8/arithmetic_int8.h +++ b/mindspore/lite/nnacl/int8/arithmetic_int8.h @@ -17,11 +17,17 @@ #define MINDSPORE_LITE_NNACL_INT8_ARITHMETIC_INT8_H_ #include "nnacl/op_base.h" +#include "nnacl/arithmetic.h" #include "nnacl/quantization/quantize.h" #ifdef __cplusplus extern "C" { #endif +void TileOneDimensionInt8(const int8_t *inData, int8_t *outData, int dim, size_t ndim, const int *inShape, + const int *inStrides, const int *outStrides, const int *multiple); +void TileDimensionsInt8(const int8_t *data0, const int8_t *data1, int8_t *tile_data0, int8_t *tile_data1, + ArithmeticParameter *param); + int ElementNotEqualInt8(int8_t *input0, int8_t *input1, uint8_t *output, int element_size, ArithmeticQuantArg *quant_arg); diff --git a/mindspore/lite/nnacl/int8/space_to_batch_int8.c b/mindspore/lite/nnacl/int8/space_to_batch_int8.c index df3aa2cfc6..5bdb2dd316 100644 --- a/mindspore/lite/nnacl/int8/space_to_batch_int8.c +++ b/mindspore/lite/nnacl/int8/space_to_batch_int8.c @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #include "nnacl/int8/space_to_batch_int8.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" void DoSpaceToBatchNHWCInt8(const int8_t *input, int8_t *output, const int *block_sizes, const int *in_shape, const int *out_shape) { diff --git a/mindspore/lite/nnacl/reverse_sequence.c b/mindspore/lite/nnacl/reverse_sequence.c index 78e4cb8757..5b0757b728 100644 --- a/mindspore/lite/nnacl/reverse_sequence.c +++ b/mindspore/lite/nnacl/reverse_sequence.c @@ -16,7 +16,7 @@ #include "nnacl/reverse_sequence.h" #include -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" void ReverseSequence(float *input0, const void *input1, float *output, ReverseSequenceParameter *para) { (void)memcpy(output, input0, para->total_data_size_); diff --git a/mindspore/lite/src/ops/arithmetic.h b/mindspore/lite/src/ops/arithmetic.h index d7f4d46165..6c5c6f807a 100644 --- a/mindspore/lite/src/ops/arithmetic.h +++ b/mindspore/lite/src/ops/arithmetic.h @@ -21,7 +21,7 @@ #include #include #include "src/ops/primitive_c.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" namespace mindspore { namespace lite { diff --git a/mindspore/lite/src/ops/populate/add_populate.cc b/mindspore/lite/src/ops/populate/add_populate.cc index 05119f7b3d..e2722ff084 100644 --- a/mindspore/lite/src/ops/populate/add_populate.cc +++ b/mindspore/lite/src/ops/populate/add_populate.cc @@ -17,7 +17,7 @@ #include "src/ops/add.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "src/ops/populate/arithmetic_populate.h" namespace mindspore { diff --git a/mindspore/lite/src/ops/populate/bias_add_populate.cc b/mindspore/lite/src/ops/populate/bias_add_populate.cc index 953c6fdbc3..f4875a5fd4 100644 --- a/mindspore/lite/src/ops/populate/bias_add_populate.cc +++ b/mindspore/lite/src/ops/populate/bias_add_populate.cc @@ -16,7 +16,7 @@ #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" namespace mindspore { namespace lite { diff --git a/mindspore/lite/src/ops/populate/bias_grad_populate.cc b/mindspore/lite/src/ops/populate/bias_grad_populate.cc index d19a8a2278..0bb338d3e3 100644 --- a/mindspore/lite/src/ops/populate/bias_grad_populate.cc +++ b/mindspore/lite/src/ops/populate/bias_grad_populate.cc @@ -16,7 +16,7 @@ #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" namespace mindspore { namespace lite { diff --git a/mindspore/lite/src/ops/populate/flatten_populate.cc b/mindspore/lite/src/ops/populate/flatten_populate.cc index 52d5877320..6905ad3176 100644 --- a/mindspore/lite/src/ops/populate/flatten_populate.cc +++ b/mindspore/lite/src/ops/populate/flatten_populate.cc @@ -16,18 +16,17 @@ #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/flatten.h" namespace mindspore { namespace lite { OpParameter *PopulateFlattenParameter(const mindspore::lite::PrimitiveC *primitive) { - FlattenParameter *flatten_param = reinterpret_cast(malloc(sizeof(FlattenParameter))); + OpParameter *flatten_param = reinterpret_cast(malloc(sizeof(OpParameter))); if (flatten_param == nullptr) { MS_LOG(ERROR) << "malloc FlattenParameter failed."; return nullptr; } - memset(flatten_param, 0, sizeof(FlattenParameter)); - flatten_param->op_parameter_.type_ = primitive->Type(); + 
memset(flatten_param, 0, sizeof(OpParameter)); + flatten_param->type_ = primitive->Type(); return reinterpret_cast<OpParameter *>(flatten_param); } diff --git a/mindspore/lite/src/ops/populate/mul_populate.cc b/mindspore/lite/src/ops/populate/mul_populate.cc index f33dc24ff1..1d7b709eda 100644 --- a/mindspore/lite/src/ops/populate/mul_populate.cc +++ b/mindspore/lite/src/ops/populate/mul_populate.cc @@ -15,7 +15,7 @@ */ #include "src/ops/mul.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" #include "src/ops/populate/arithmetic_populate.h" diff --git a/mindspore/lite/src/ops/populate/squeeze_populate.cc b/mindspore/lite/src/ops/populate/squeeze_populate.cc index d270fc829e..fe2c3fd32c 100644 --- a/mindspore/lite/src/ops/populate/squeeze_populate.cc +++ b/mindspore/lite/src/ops/populate/squeeze_populate.cc @@ -23,13 +23,13 @@ namespace mindspore { namespace lite { OpParameter *PopulateSqueezeParameter(const mindspore::lite::PrimitiveC *primitive) { - SqueezeParameter *squeeze_param = reinterpret_cast<SqueezeParameter *>(malloc(sizeof(SqueezeParameter))); + OpParameter *squeeze_param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter))); if (squeeze_param == nullptr) { MS_LOG(ERROR) << "malloc SqueezeParameter failed."; return nullptr; } - memset(squeeze_param, 0, sizeof(SqueezeParameter)); - squeeze_param->op_parameter_.type_ = primitive->Type(); + memset(squeeze_param, 0, sizeof(OpParameter)); + squeeze_param->type_ = primitive->Type(); return reinterpret_cast<OpParameter *>(squeeze_param); } Registry SqueezeParameterRegistry(schema::PrimitiveType_Squeeze, PopulateSqueezeParameter); diff --git a/mindspore/lite/src/ops/populate/sub_populate.cc b/mindspore/lite/src/ops/populate/sub_populate.cc index b3d38a3776..5685851953 100644 --- a/mindspore/lite/src/ops/populate/sub_populate.cc +++ b/mindspore/lite/src/ops/populate/sub_populate.cc @@ -17,7 +17,7 @@ #include "src/ops/sub.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "src/ops/populate/arithmetic_populate.h" namespace mindspore { diff --git a/mindspore/lite/src/runtime/kernel/arm/base/depth_to_space_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/depth_to_space_base.cc index 8a95daf30e..1e01125c9e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/depth_to_space_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/depth_to_space_base.cc @@ -16,7 +16,7 @@ #include "src/runtime/kernel/arm/base/depth_to_space_base.h" #include "nnacl/depth_to_space.h" #include "src/runtime/kernel/arm/fp32/depth_to_space_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc index 78aa4c5280..d40f7caac3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc @@ -39,7 +39,7 @@ ARITHMETIC_COMPARE_FUNC_INFO_FP16 arithmetic_cp_fun_table_fp16[] = { {PrimitiveType_NotEqual, schema::ActivationType_NO_ACTIVATION, ElementNotEqualFp16, ElementOptNotEqualFp16}, {PrimitiveType_Equal, schema::ActivationType_NO_ACTIVATION, ElementEqualFp16, ElementOptEqualFp16}, {PrimitiveType_Less, schema::ActivationType_NO_ACTIVATION,
ElementLessFp16, ElementOptLessFp16}, - {PrimitiveType_LessEqual, schema::ActivationType_NO_ACTIVATION, ElementLessEqual, ElementOptLessEqualFp16}, + {PrimitiveType_LessEqual, schema::ActivationType_NO_ACTIVATION, ElementLessEqualFp16, ElementOptLessEqualFp16}, {PrimitiveType_Greater, schema::ActivationType_NO_ACTIVATION, ElementGreaterFp16, ElementOptGreaterFp16}, {PrimitiveType_GreaterEqual, schema::ActivationType_NO_ACTIVATION, ElementGreaterEqualFp16, ElementOptGreaterEqualFp16}}; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/argminmax_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/argminmax_fp32.h index 9e8a12efe7..f9dc051443 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/argminmax_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/argminmax_fp32.h @@ -19,7 +19,7 @@ #include #include "include/errorcode.h" #include "nnacl/fp32/arg_min_max_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" #include "src/lite_kernel.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc index b7c322fb8c..8d44c2d7d6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc @@ -29,7 +29,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { - ArithmeticCPUKernel::~ArithmeticCPUKernel() { FreeTmpPtr(); return; @@ -72,9 +71,10 @@ int ArithmeticCPUKernel::InitBroadCastCase() { if (input0_ptr_ == nullptr) { return RET_ERROR; } - TileOneDimension(reinterpret_cast<float *>(in_tensors_[0]->data_c()), reinterpret_cast<float *>(input0_ptr_), 0, - arithmeticParameter_->ndim_, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_strides0_, - arithmeticParameter_->out_strides_, arithmeticParameter_->multiples0_); + TileOneDimensionFp32(reinterpret_cast<float *>(in_tensors_[0]->data_c()), reinterpret_cast<float *>(input0_ptr_), 0, + arithmeticParameter_->ndim_, arithmeticParameter_->in_shape0_, + arithmeticParameter_->in_strides0_, arithmeticParameter_->out_strides_, + arithmeticParameter_->multiples0_); arithmeticParameter_->broadcasting_ = false; input0_broadcast_ = true; } @@ -85,9 +85,10 @@ int ArithmeticCPUKernel::InitBroadCastCase() { FreeTmpPtr(); return RET_ERROR; } - TileOneDimension(reinterpret_cast<float *>(in_tensors_[1]->data_c()), reinterpret_cast<float *>(input1_ptr_), 0, - arithmeticParameter_->ndim_, arithmeticParameter_->in_shape1_, arithmeticParameter_->in_strides1_, - arithmeticParameter_->out_strides_, arithmeticParameter_->multiples1_); + TileOneDimensionFp32(reinterpret_cast<float *>(in_tensors_[1]->data_c()), reinterpret_cast<float *>(input1_ptr_), 0, + arithmeticParameter_->ndim_, arithmeticParameter_->in_shape1_, + arithmeticParameter_->in_strides1_, arithmeticParameter_->out_strides_, + arithmeticParameter_->multiples1_); arithmeticParameter_->broadcasting_ = false; input1_broadcast_ = true; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/depth_to_space_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/depth_to_space_fp32.h index 99a9acf7a6..c2d8c2b0d4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/depth_to_space_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/depth_to_space_fp32.h @@ -18,7 +18,6 @@ #include #include "include/errorcode.h" -#include "nnacl/arithmetic_common.h" #include "nnacl/depth_to_space.h" #include "src/runtime/kernel/arm/base/depth_to_space_base.h" diff --git
a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.cc index 875530aa85..1f7bbbc0de 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.cc @@ -17,7 +17,6 @@ #include "src/runtime/kernel/arm/fp32/flatten_fp32.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" -#include "nnacl/flatten.h" #include "include/errorcode.h" using mindspore::kernel::KERNEL_ARCH::kCPU; @@ -34,19 +33,12 @@ int FlattenCPUKernel::Init() { return ReSize(); } -int FlattenCPUKernel::ReSize() { - auto output_shape = out_tensors_.at(0)->shape(); - flatten_param_->size = sizeof(float); - for (size_t i = 0; i < output_shape.size(); i++) { - flatten_param_->size *= output_shape.at(i); - } - return RET_OK; -} +int FlattenCPUKernel::ReSize() { return RET_OK; } int FlattenCPUKernel::Run() { - auto input = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData()); - auto output = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); - Flatten(input, output, flatten_param_); + auto input = in_tensors_.at(0); + auto output = out_tensors_.at(0); + memcpy(output->data_c(), input->data_c(), output->Size()); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.h index c476f35931..e3274771ba 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten_fp32.h @@ -18,9 +18,7 @@ #include #include "src/lite_kernel.h" - #include "include/context.h" -#include "nnacl/flatten.h" using mindspore::lite::InnerContext; @@ -30,17 +28,12 @@ class FlattenCPUKernel : public LiteKernel { public: FlattenCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive) { - flatten_param_ = reinterpret_cast<FlattenParameter *>(parameter); - } + : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} ~FlattenCPUKernel() override = default; int Init() override; int ReSize() override; int Run() override; - - private: - FlattenParameter *flatten_param_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.h index 41e7c742ea..a778ef63ae 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.h @@ -19,7 +19,7 @@ #include #include "src/lite_kernel.h" #include "nnacl/fp32/space_to_batch_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" namespace mindspore::kernel { class SpaceToBatchCPUKernel : public LiteKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc index 36119aed1f..9fd0251e12 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc @@ -15,9 +15,6 @@ */ #include "src/runtime/kernel/arm/int8/add_int8.h" -#include -#include -#include "nnacl/arithmetic_common.h" #include "nnacl/quantization/quantize.h" #include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h index 8834387949..7075dfc0e0
100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h @@ -17,9 +17,11 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ADD_INT8_H_ #include +#include +#include #include "src/lite_kernel.h" #include "nnacl/int8/add_int8.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "src/runtime/runtime_api.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h index d8831c4cff..9dbc601ff2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h @@ -19,7 +19,7 @@ #include #include "nnacl/quantization/quantize.h" #include "nnacl/int8/arg_min_max_int8.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/common_func.h" #include "include/errorcode.h" #include "src/lite_kernel.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc index a618882b7e..a86cc1c15d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc @@ -17,7 +17,7 @@ #include "src/runtime/kernel/arm/int8/arithmetic_int8.h" #include "src/runtime/kernel/arm/int8/add_int8.h" #include "src/runtime/kernel/arm/int8/mul_int8.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "src/runtime/runtime_api.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/bias_add_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/bias_add_int8.h index 3f58fc0d00..016bf13ad7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/bias_add_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/bias_add_int8.h @@ -18,8 +18,9 @@ #include #include "src/lite_kernel.h" -#include "nnacl/fp32/unique_fp32.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" +#include "nnacl/int8/add_int8.h" +#include "nnacl/int8/arithmetic_int8.h" namespace mindspore::kernel { class BiasAddInt8CPUKernel : public LiteKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h index 15990b904f..558cfd7de8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h @@ -27,7 +27,6 @@ #include "nnacl/int8/matmul_int8.h" #include "src/runtime/kernel/arm/base/layout_transform.h" #include "src/runtime/kernel/arm/base/convolution_base.h" -#include "nnacl/arithmetic_common.h" namespace mindspore::kernel { class DeConvInt8CPUKernel : public ConvolutionBaseCPUKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc index 08ffc52306..53ef549398 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc @@ -17,7 +17,7 @@ #include "src/runtime/kernel/arm/int8/div_int8.h" #include #include -#include "nnacl/arithmetic_common.h" +#include "nnacl/int8/arithmetic_int8.h" #include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -114,9 +114,9 @@ int DivInt8CPUKernel::Run() { tile1_data_ = nullptr; return RET_ERROR; } - 
TileDimensionsUint8(static_cast<uint8_t *>(in_tensors_.at(0)->MutableData()), - static_cast<uint8_t *>(in_tensors_.at(1)->MutableData()), - reinterpret_cast<uint8_t *>(tile0_data_), reinterpret_cast<uint8_t *>(tile1_data_), &tile_para); + TileDimensionsInt8(static_cast<int8_t *>(in_tensors_.at(0)->MutableData()), + static_cast<int8_t *>(in_tensors_.at(1)->MutableData()), reinterpret_cast<int8_t *>(tile0_data_), + reinterpret_cast<int8_t *>(tile1_data_), &tile_para); } auto ret = ParallelLaunch(this->context_->thread_pool_, DivInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc index 709d3de4b9..d2df7b3753 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc @@ -15,10 +15,6 @@ */ #include "src/runtime/kernel/arm/int8/mul_int8.h" -#include -#include -#include "nnacl/arithmetic_common.h" -#include "nnacl/int8/mul_int8.h" #include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" #include "include/errorcode.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h index 1c3dc18ae7..08a3a3d23a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h @@ -17,9 +17,12 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MUL_INT8_H_ #include +#include +#include #include "src/lite_kernel.h" #include "nnacl/mul_parameter.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/int8/mul_int8.h" +#include "nnacl/int8/arithmetic_int8.h" #include "src/runtime/runtime_api.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.cc index b4c4da6344..f06005471a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.cc @@ -15,11 +15,6 @@ */ #include "src/runtime/kernel/arm/int8/scale_int8.h" - -#include -#include -#include "nnacl/int8/scale_int8.h" -#include "nnacl/arithmetic_common.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -66,9 +61,9 @@ int ScaleInt8CPUKernel::InitScaleOffset() { return RET_ERROR; } malloced_scale_ = true; - TileOneDimensionUint8(reinterpret_cast<uint8_t *>(in_tensors_.at(1)->data_c()), - reinterpret_cast<uint8_t *>(input1_data_), 0, tile_para->ndim_, tile_para->in_shape1_, - tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); + TileOneDimensionInt8(reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()), + reinterpret_cast<int8_t *>(input1_data_), 0, tile_para->ndim_, tile_para->in_shape1_, + tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); } } @@ -93,9 +88,9 @@ int ScaleInt8CPUKernel::InitScaleOffset() { return RET_ERROR; } malloced_offset_ = true; - TileOneDimensionUint8(reinterpret_cast<uint8_t *>(in_tensors_.at(2)->data_c()), - reinterpret_cast<uint8_t *>(input2_data_), 0, tile_para->ndim_, tile_para->in_shape1_, - tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); + TileOneDimensionInt8(reinterpret_cast<int8_t *>(in_tensors_.at(2)->data_c()), + reinterpret_cast<int8_t *>(input2_data_), 0, tile_para->ndim_, tile_para->in_shape1_, + tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); } } } @@ -305,9 +300,9 @@ int ScaleInt8CPUKernel::Run() { MS_LOG(ERROR) << "malloc input1_data_ failed."; return RET_ERROR; } -
TileOneDimensionUint8(reinterpret_cast<uint8_t *>(in_tensors_.at(1)->data_c()), - reinterpret_cast<uint8_t *>(input1_data_), 0, tile_para->ndim_, tile_para->in_shape1_, - tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); + TileOneDimensionInt8(reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()), + reinterpret_cast<int8_t *>(input1_data_), 0, tile_para->ndim_, tile_para->in_shape1_, + tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); } // If has bias, bias is passed by previous node case, need do broadcasting online @@ -319,9 +314,9 @@ int ScaleInt8CPUKernel::Run() { input1_data_ = nullptr; return RET_ERROR; } - TileOneDimensionUint8(reinterpret_cast<uint8_t *>(in_tensors_.at(2)->data_c()), - reinterpret_cast<uint8_t *>(input2_data_), 0, tile_para->ndim_, tile_para->in_shape1_, - tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); + TileOneDimensionInt8(reinterpret_cast<int8_t *>(in_tensors_.at(2)->data_c()), + reinterpret_cast<int8_t *>(input2_data_), 0, tile_para->ndim_, tile_para->in_shape1_, + tile_para->in_strides1_, tile_para->out_strides_, tile_para->multiples1_); } auto ret = ParallelLaunch(this->context_->thread_pool_, ScaleRunInt8, this, op_parameter_->thread_num_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.h index e66d9055ef..c7e07207c5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/scale_int8.h @@ -17,14 +17,15 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_SCALE_INT8_H_ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_SCALE_INT8_H_ +#include #include #include "src/lite_kernel.h" #include "nnacl/scale.h" #include "nnacl/quantization/quantize.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/int8/arithmetic_int8.h" +#include "nnacl/int8/scale_int8.h" namespace mindspore::kernel { - class ScaleInt8CPUKernel : public LiteKernel { public: ScaleInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc index 2c3e3ead17..3d2765fc2d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc @@ -15,10 +15,6 @@ */ #include "src/runtime/kernel/arm/int8/sub_int8.h" -#include -#include -#include "nnacl/arithmetic_common.h" -#include "nnacl/quantization/quantize.h" #include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -29,7 +25,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Sub; namespace mindspore::kernel { - int SubInt8CPUKernel::Init() { lite::Tensor *input0 = in_tensors_.at(0); lite::Tensor *input1 = in_tensors_.at(1); @@ -142,9 +137,9 @@ int SubInt8CPUKernel::Run() { context_->allocator->Free(tile0_data_); return RET_ERROR; } - TileDimensionsUint8(static_cast<uint8_t *>(in_tensors_.at(0)->MutableData()), - static_cast<uint8_t *>(in_tensors_.at(1)->MutableData()), - reinterpret_cast<uint8_t *>(tile0_data_), reinterpret_cast<uint8_t *>(tile1_data_), &tile_para); + TileDimensionsInt8(static_cast<int8_t *>(in_tensors_.at(0)->data_c()), + static_cast<int8_t *>(in_tensors_.at(1)->data_c()), reinterpret_cast<int8_t *>(tile0_data_), + reinterpret_cast<int8_t *>(tile1_data_), &tile_para); } auto ret = ParallelLaunch(this->context_->thread_pool_, SubInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h index
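// What the TileOneDimension*/TileDimensions* calls above are doing, as a sketch
// (this mirrors the pattern of the nnacl tiling helpers, not their exact code):
// before a broadcast int8 op runs, each operand is expanded to the full output
// shape by repeating it along every axis whose multiple (out_dim / in_dim)
// exceeds 1, so the elementwise kernel can then work on two same-shaped buffers.
#include <cstddef>
#include <cstdint>
#include <cstring>
void TileOneDimensionSketch(const int8_t *in, int8_t *out, int dim, size_t ndim,
                            const int *in_shape, const int *in_strides,
                            const int *out_strides, const int *multiple) {
  if (dim == static_cast<int>(ndim) - 1) {
    // Innermost axis: emit the source row once per repetition.
    for (int m = 0; m < multiple[dim]; ++m) {
      memcpy(out + m * in_shape[dim], in, in_shape[dim] * sizeof(int8_t));
    }
    return;
  }
  for (int i = 0; i < in_shape[dim]; ++i) {
    for (int m = 0; m < multiple[dim]; ++m) {
      // Recurse per slice; the output index interleaves the i-th input slice
      // with its m-th repetition along this axis.
      TileOneDimensionSketch(in + in_strides[dim] * i,
                             out + out_strides[dim] * (i + m * in_shape[dim]),
                             dim + 1, ndim, in_shape, in_strides, out_strides, multiple);
    }
  }
}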
1a1e632dc0..728ced359a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h @@ -17,6 +17,10 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_SUB_INT8_H_ #include +#include +#include +#include "nnacl/int8/arithmetic_int8.h" +#include "nnacl/quantization/quantize.h" #include "src/lite_kernel.h" #include "nnacl/int8/sub_int8.h" #include "src/runtime/runtime_api.h" diff --git a/mindspore/lite/src/runtime/kernel/npu/eltwise_npu.h b/mindspore/lite/src/runtime/kernel/npu/eltwise_npu.h index 90df10b7bf..ce0a55141b 100644 --- a/mindspore/lite/src/runtime/kernel/npu/eltwise_npu.h +++ b/mindspore/lite/src/runtime/kernel/npu/eltwise_npu.h @@ -18,7 +18,6 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_ELTWISE_NPU_H_ #include #include "src/ops/eltwise.h" -#include "nnacl/arithmetic_common.h" #include "src/runtime/kernel/npu/npu_kernel.h" #include "include/graph/op/all_ops.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/npu/resize_npu.h b/mindspore/lite/src/runtime/kernel/npu/resize_npu.h index 80bade7352..f59e27c498 100644 --- a/mindspore/lite/src/runtime/kernel/npu/resize_npu.h +++ b/mindspore/lite/src/runtime/kernel/npu/resize_npu.h @@ -19,7 +19,6 @@ #include #include "nnacl/resize_parameter.h" #include "src/ops/resize.h" -#include "nnacl/arithmetic_common.h" #include "src/runtime/kernel/npu/npu_kernel.h" #include "include/graph/op/all_ops.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/train/train_populate_parameter.cc b/mindspore/lite/src/train/train_populate_parameter.cc index fef48e67be..892a47dffd 100644 --- a/mindspore/lite/src/train/train_populate_parameter.cc +++ b/mindspore/lite/src/train/train_populate_parameter.cc @@ -30,7 +30,7 @@ #include "src/ops/power_grad.h" #include "nnacl/power_parameter.h" #include "src/ops/bias_grad.h" -#include "nnacl/arithmetic_common.h" +#include "nnacl/arithmetic.h" #include "nnacl/fp32_grad/optimizer.h" #include "src/ops/apply_momentum.h" #include "src/ops/sgd.h" diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_fp32_tests.cc deleted file mode 100644 index 53da47b5f6..0000000000 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_fp32_tests.cc +++ /dev/null @@ -1,1350 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include "src/common/log_adapter.h" -#include "common/common_test.h" -#include "src/common/file_utils.h" -#include "mindspore/lite/nnacl/fp32/arithmetic_fp32.h" -#include "mindspore/lite/src/kernel_registry.h" -#include "mindspore/lite/src/lite_kernel.h" -#include "include/errorcode.h" - -namespace mindspore { - -class TestArithmeticTestFp32 : public mindspore::CommonTest { - public: - TestArithmeticTestFp32() {} - void PrepareInt(const std::vector<int> &input0_shape, const std::vector<int> &input1_shape, bool broadcast, - const std::vector<int> &output_shape, int *input0_data, int *input1_data, int *output_data, int type, - int act_type, const int thread_num); - void TearDown() override; - - public: - float err_tol = 1e-5; - lite::Tensor in_tensor_0_; - lite::Tensor in_tensor_1_; - lite::Tensor out_tensor_; - std::vector<lite::Tensor *> inputs_{&in_tensor_0_, &in_tensor_1_}; - std::vector<lite::Tensor *> outputs_{&out_tensor_}; - ArithmeticParameter param_; - kernel::KernelKey desc_ = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt, schema::PrimitiveType_Eltwise}; - lite::InnerContext ctx_ = lite::InnerContext(); - kernel::KernelCreator creator_ = nullptr; - kernel::LiteKernel *kernel_ = nullptr; -}; - -void TestArithmeticTestFp32::PrepareInt(const std::vector<int> &input0_shape, const std::vector<int> &input1_shape, - bool broadcast, const std::vector<int> &output_shape, int *input0_data, - int *input1_data, int *output_data, int type, int act_type, - const int thread_num) { - param_.broadcasting_ = true; - param_.op_parameter_.type_ = type; - param_.ndim_ = input0_shape.size(); - param_.activation_type_ = act_type; - param_.broadcasting_ = broadcast; - for (size_t i = 0; i < input0_shape.size(); ++i) { - param_.in_shape0_[i] = input0_shape[i]; - } - for (size_t i = 0; i < input1_shape.size(); ++i) { - param_.in_shape1_[i] = input1_shape[i]; - } - for (size_t i = 0; i < output_shape.size(); ++i) { - param_.out_shape_[i] = output_shape[i]; - } - - in_tensor_0_.set_data_type(kNumberTypeInt); - in_tensor_0_.set_data(input0_data); - in_tensor_0_.set_shape(input0_shape); - in_tensor_1_.set_data(input1_data); - in_tensor_1_.set_shape(input1_shape); - out_tensor_.set_data(output_data); - out_tensor_.set_shape(output_shape); - - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc_); - ASSERT_NE(creator, nullptr); - ctx_.thread_num_ = thread_num; - ASSERT_EQ(lite::RET_OK, ctx_.Init()); - kernel_ = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc_, nullptr); - ASSERT_NE(kernel_, nullptr); -} - -void TestArithmeticTestFp32::TearDown() { - in_tensor_0_.set_data(nullptr); - in_tensor_1_.set_data(nullptr); - out_tensor_.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, AddTest) { - auto add_param = new ArithmeticParameter(); - add_param->ndim_ = 4; - add_param->in_shape0_[0] = 1; - add_param->in_shape0_[1] = 2; - add_param->in_shape0_[2] = 3; - add_param->in_shape0_[3] = 4; - add_param->in_shape1_[0] = 1; - add_param->in_shape1_[1] = 1; - add_param->in_shape1_[2] = 1; - add_param->in_shape1_[3] = 4; - add_param->out_shape_[0] = 1; - add_param->out_shape_[1] = 2; - add_param->out_shape_[2] = 3; - add_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector<float> in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - auto in_ptr = in.data(); - std::vector<float> add = {0.9035316, 0.022212252, 0.3038014,
0.3478275}; - auto add_ptr = add.data(); - std::vector correct_out = {13.119816, 3.368904, 15.631221, 5.5827856, 1.7079077, 9.9744, - 15.031756, -7.7328877, 14.617362, 8.078041, 6.888335, -8.904492, - -3.3416586, 11.572254, 9.565813, 1.6258626, 7.629906, -3.9079323, - 4.0682936, -8.254251, -2.4522753, 13.641247, -2.365638, 3.548678}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastAdd(in_ptr, add_ptr, tile_data0, tile_data1, out, size, add_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete add_param; -} - -TEST_F(TestArithmeticTestFp32, MulTest) { - auto mul_param = new ArithmeticParameter(); - mul_param->ndim_ = 4; - mul_param->in_shape0_[0] = 1; - mul_param->in_shape0_[1] = 2; - mul_param->in_shape0_[2] = 3; - mul_param->in_shape0_[3] = 4; - mul_param->in_shape1_[0] = 1; - mul_param->in_shape1_[1] = 1; - mul_param->in_shape1_[2] = 1; - mul_param->in_shape1_[3] = 4; - mul_param->out_shape_[0] = 1; - mul_param->out_shape_[1] = 2; - mul_param->out_shape_[2] = 3; - mul_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - auto in_ptr = in.data(); - std::vector add = {0.16771512, 0.7336843, 0.6768286, 0.4453379}; - auto add_ptr = add.data(); - std::vector correct_out = {2.0488555, 2.4554152, 10.374036, 2.3313253, 0.13490601, 7.3017635, - 9.968302, -3.5986485, 2.3000166, 5.910435, 4.4566007, -4.120409, - -0.71198255, 8.474085, 6.2687945, 0.5691575, 1.1281147, -2.8834853, - 2.547916, -3.8308315, -0.56281954, 9.992072, -1.8067529, 1.42546}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastMul(in_ptr, add_ptr, tile_data0, tile_data1, out, size, mul_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete mul_param; -} - -TEST_F(TestArithmeticTestFp32, DivTest) { - auto div_param = new ArithmeticParameter(); - div_param->ndim_ = 4; - div_param->in_shape0_[0] = 1; - div_param->in_shape0_[1] = 2; - div_param->in_shape0_[2] = 3; - div_param->in_shape0_[3] = 4; - div_param->in_shape1_[0] = 1; - div_param->in_shape1_[1] = 1; - div_param->in_shape1_[2] = 1; - div_param->in_shape1_[3] = 4; - div_param->out_shape_[0] = 1; - div_param->out_shape_[1] = 2; - div_param->out_shape_[2] = 3; - div_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - auto in_ptr = in.data(); - std::vector add = {1.6771512, -7.336843, 0.6768286, 4.453379}; - auto add_ptr = add.data(); - std::vector correct_out = {7.28394912, -0.45614875, 22.64593872, 1.17550247, 0.47960852, -1.35646735, - 21.76024329, -1.8145132, 8.17685967, -1.09799665, 9.72850985, -2.07759546, - -2.53119099, -1.5742523, 13.68442764, 
0.28698101, 4.01059523, 0.53567243, - 5.56195764, -1.93158453, -2.000897, -1.85625275, -3.94404034, 0.71874648}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastDiv(in_ptr, add_ptr, tile_data0, tile_data1, out, size, div_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete div_param; -} - -TEST_F(TestArithmeticTestFp32, DivTest2) { - std::vector<float> in0 = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}; - std::vector<float> in1 = {5, 10, 2, 8, 2, 3, 7, 80, 45, 20}; - std::vector<float> correct_out = {2, 2, 15, 5, 25, 20, 10, 1, 2, 5}; - constexpr int kOutSize = 10; - float out[kOutSize]; - ElementDiv(in0.data(), in1.data(), out, kOutSize); - std::cout << "out: "; - for (int i = 0; i < kOutSize; ++i) { - std::cout << out[i] << " "; - } - std::cout << "\n"; - ASSERT_EQ(0, CompareOutputData(out, correct_out.data(), kOutSize, 0.00001)); -} - -TEST_F(TestArithmeticTestFp32, FloorDivTest) { - auto fdiv_param = new ArithmeticParameter(); - fdiv_param->ndim_ = 4; - fdiv_param->in_shape0_[0] = 1; - fdiv_param->in_shape0_[1] = 1; - fdiv_param->in_shape0_[2] = 3; - fdiv_param->in_shape0_[3] = 4; - fdiv_param->in_shape1_[0] = 1; - fdiv_param->in_shape1_[1] = 1; - fdiv_param->in_shape1_[2] = 1; - fdiv_param->in_shape1_[3] = 4; - fdiv_param->out_shape_[0] = 1; - fdiv_param->out_shape_[1] = 1; - fdiv_param->out_shape_[2] = 3; - fdiv_param->out_shape_[3] = 4; - - /* 1x1x3x4 NHWC */ - std::vector<float> in = {1.1, -1.1, 3.123, -5.432, 0.1234, -0.0312, 12.1, 21.1, 9.1, 9.0, -100, 0.1}; - auto in_ptr = in.data(); - std::vector<float> add = {1, 3, 2, 0.3}; - auto add_ptr = add.data(); - std::vector<float> correct_out = {1, -1, 1, -19, 0, -1, 6, 70, 9, 3, -50, 0}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 1 * 3 * 4; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - int ret = BroadcastFloorDiv(in_ptr, add_ptr, tile_data0, tile_data1, out, size, fdiv_param); - EXPECT_EQ(ret, 0); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete fdiv_param; -} - -TEST_F(TestArithmeticTestFp32, FloorModTest) { - auto fmod_param = new ArithmeticParameter(); - fmod_param->ndim_ = 4; - fmod_param->in_shape0_[0] = 1; - fmod_param->in_shape0_[1] = 1; - fmod_param->in_shape0_[2] = 3; - fmod_param->in_shape0_[3] = 4; - fmod_param->in_shape1_[0] = 1; - fmod_param->in_shape1_[1] = 1; - fmod_param->in_shape1_[2] = 1; - fmod_param->in_shape1_[3] = 4; - fmod_param->out_shape_[0] = 1; - fmod_param->out_shape_[1] = 1; - fmod_param->out_shape_[2] = 3; - fmod_param->out_shape_[3] = 4; - - /* 1x1x3x4 NHWC */ - std::vector<float> in = {1.1, -1.1, 3.123, -5.432, 0.1234, -0.0312, 12.1, 21.1, 9.1, 9.0, -100, 0.1}; - auto in_ptr = in.data(); - std::vector<float> add = {1, 3, 2, 0.3}; - auto add_ptr = add.data(); - std::vector<float> correct_out = {0.100000, 1.900000, 1.123000, 0.268000, 0.123400, 2.968800, - 0.100000, 0.100000, 0.100000, 0.000000, 0.000000, 0.100000}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 1 * 3 * 4; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - int ret = BroadcastFloorMod(in_ptr, add_ptr, tile_data0, tile_data1, out, size, fmod_param); - EXPECT_EQ(ret, 0); - ASSERT_EQ(0, CompareOutputData(out,
correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete fmod_param; -} - -TEST_F(TestArithmeticTestFp32, LogicalAndTest) { - auto logical_and_param = new ArithmeticParameter(); - logical_and_param->ndim_ = 4; - logical_and_param->in_shape0_[0] = 1; - logical_and_param->in_shape0_[1] = 2; - logical_and_param->in_shape0_[2] = 3; - logical_and_param->in_shape0_[3] = 4; - logical_and_param->in_shape1_[0] = 1; - logical_and_param->in_shape1_[1] = 1; - logical_and_param->in_shape1_[2] = 1; - logical_and_param->in_shape1_[3] = 4; - logical_and_param->out_shape_[0] = 1; - logical_and_param->out_shape_[1] = 2; - logical_and_param->out_shape_[2] = 3; - logical_and_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0, 9.952188, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - auto in_ptr = in.data(); - std::vector add = {1.6771512, -7.336843, 0, 4.453379}; - auto add_ptr = add.data(); - std::vector correct_out = {1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1}; - auto correct_out_ptr = correct_out.data(); - int size = 1 * 2 * 3 * 4; - - auto out = new float[size]; - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastLogicalAnd(in_ptr, add_ptr, tile_data0, tile_data1, out, size, logical_and_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete logical_and_param; -} - -TEST_F(TestArithmeticTestFp32, LogicalOrTest) { - auto logical_or_param = new ArithmeticParameter(); - logical_or_param->ndim_ = 4; - logical_or_param->in_shape0_[0] = 1; - logical_or_param->in_shape0_[1] = 2; - logical_or_param->in_shape0_[2] = 3; - logical_or_param->in_shape0_[3] = 4; - logical_or_param->in_shape1_[0] = 1; - logical_or_param->in_shape1_[1] = 1; - logical_or_param->in_shape1_[2] = 1; - logical_or_param->in_shape1_[3] = 4; - logical_or_param->out_shape_[0] = 1; - logical_or_param->out_shape_[1] = 2; - logical_or_param->out_shape_[2] = 3; - logical_or_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 0, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, 0, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - - auto in_ptr = in.data(); - std::vector add = {1.6771512, 0, 0.6768286, 0}; - auto add_ptr = add.data(); - std::vector correct_out = {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - - auto out = new float[size]; - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastLogicalOr(in_ptr, add_ptr, tile_data0, tile_data1, out, size, logical_or_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete logical_or_param; -} - -TEST_F(TestArithmeticTestFp32, MaximumTest) { - auto maximum_param = new ArithmeticParameter(); - maximum_param->ndim_ = 4; - maximum_param->in_shape0_[0] = 1; - maximum_param->in_shape0_[1] = 2; - maximum_param->in_shape0_[2] = 3; - maximum_param->in_shape0_[3] = 4; - maximum_param->in_shape1_[0] = 1; - 
maximum_param->in_shape1_[1] = 1; - maximum_param->in_shape1_[2] = 1; - maximum_param->in_shape1_[3] = 4; - maximum_param->out_shape_[0] = 1; - maximum_param->out_shape_[1] = 2; - maximum_param->out_shape_[2] = 3; - maximum_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 0, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, 0, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - - auto in_ptr = in.data(); - std::vector add = {1.6771512, 6.34876, 3.6768286, 2.936284}; - auto add_ptr = add.data(); - std::vector correct_out = {12.216284, 6.34876, 15.327419, 5.234958, 1.6771512, 6.34876, - 14.727955, 2.936284, 13.71383, 8.055829, 6.5845337, 2.936284, - 1.6771512, 11.550042, 9.262012, 2.936284, 6.7263746, 6.34876, - 3.764492, 2.93628, 1.6771512, 13.619035, 3.6768286, 3.2008505}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - - auto out = new float[size]; - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastMaximum(in_ptr, add_ptr, tile_data0, tile_data1, out, size, maximum_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete maximum_param; -} - -TEST_F(TestArithmeticTestFp32, MinimumTest) { - auto minimum_param = new ArithmeticParameter(); - minimum_param->ndim_ = 4; - minimum_param->in_shape0_[0] = 1; - minimum_param->in_shape0_[1] = 2; - minimum_param->in_shape0_[2] = 3; - minimum_param->in_shape0_[3] = 4; - minimum_param->in_shape1_[0] = 1; - minimum_param->in_shape1_[1] = 1; - minimum_param->in_shape1_[2] = 1; - minimum_param->in_shape1_[3] = 4; - minimum_param->out_shape_[0] = 1; - minimum_param->out_shape_[1] = 2; - minimum_param->out_shape_[2] = 3; - minimum_param->out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector in = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 0, 14.727955, -8.080715, - 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, - 6.7263746, -3.9301445, 3.764492, 0, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - - auto in_ptr = in.data(); - std::vector add = {1.6771512, 6.34876, 3.6768286, 2.936284}; - auto add_ptr = add.data(); - std::vector correct_out = {1.6771512, 3.3466918, 3.6768286, 2.936284, 0.804376, 0, - 3.6768286, -8.080715, 1.6771512, 6.34876, 3.6768286, -9.25232, - -4.24519, 6.34876, 3.6768286, 1.2780352, 1.6771512, -3.9301445, - 3.6768286, 0, -3.3558068, 6.34876, -2.6694393, 2.936284}; - auto correct_out_ptr = correct_out.data(); - - int size = 1 * 2 * 3 * 4; - - auto out = new float[size]; - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastMinimum(in_ptr, add_ptr, tile_data0, tile_data1, out, size, minimum_param); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete minimum_param; -} - -TEST_F(TestArithmeticTestFp32, SquaredDifferenceTest) { - auto add_param = new ArithmeticParameter(); - add_param->ndim_ = 3; - add_param->in_shape0_[0] = 2; - add_param->in_shape0_[1] = 3; - add_param->in_shape0_[2] = 2; - add_param->in_shape1_[0] = 2; - add_param->in_shape1_[1] = 1; - add_param->in_shape1_[2] = 2; - add_param->out_shape_[0] = 2; - add_param->out_shape_[1] = 3; - add_param->out_shape_[2] = 2; - - /* 1x2x3x4 NHWC */ - std::vector in = {10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 
24, 25}; - auto in_ptr = in.data(); - std::vector<float> add = {30, 31, 32, 33}; - auto add_ptr = add.data(); - std::vector<float> correct_out = {400, 400, 324, 324, 256, 256, 144, 144, 100, 100, 64, 64}; - auto correct_out_ptr = correct_out.data(); - - int size = 2 * 3 * 2; - auto out = new float[size]; - - auto tile_data0 = new float[size]; - auto tile_data1 = new float[size]; - BroadcastSub(in_ptr, add_ptr, tile_data0, tile_data1, out, size, add_param); - ElementMul(out, out, out, size); - ASSERT_EQ(0, CompareOutputData(out, correct_out_ptr, size, 0.00001)); - - delete[] out; - delete[] tile_data0; - delete[] tile_data1; - delete add_param; -} - -TEST_F(TestArithmeticTestFp32, MulFp32) { - std::vector<lite::Tensor *> inputs_tensor; - std::vector<lite::Tensor *> outputs_tensor; - - ArithmeticParameter mul_param; - mul_param.broadcasting_ = true; - mul_param.op_parameter_.type_ = schema::PrimitiveType_Mul; - mul_param.ndim_ = 4; - mul_param.in_shape0_[0] = 1; - mul_param.in_shape0_[1] = 2; - mul_param.in_shape0_[2] = 3; - mul_param.in_shape0_[3] = 4; - mul_param.in_shape1_[0] = 1; - mul_param.in_shape1_[1] = 1; - mul_param.in_shape1_[2] = 1; - mul_param.in_shape1_[3] = 4; - mul_param.out_shape_[0] = 1; - mul_param.out_shape_[1] = 2; - mul_param.out_shape_[2] = 3; - mul_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector<float> input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector<int> input0_shape = {1, 2, 3, 4}; - std::vector<float> input1 = {0.16771512, 0.7336843, 0.6768286, 0.4453379}; - std::vector<int> input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector<float> output(24); - std::vector<int> output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&mul_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector<float> correct_out = {2.0488555, 2.4554152, 10.374036, 2.3313253, 0.13490601, 7.3017635, - 9.968302, -3.5986485, 2.3000166, 5.910435, 4.4566007, -4.120409, - -0.71198255, 8.474085, 6.2687945, 0.5691575, 1.1281147, -2.8834853, - 2.547916, -3.8308315, -0.56281954, 9.992072, -1.8067529, 1.42546}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, MulReluFp32) { - std::vector<lite::Tensor *> inputs_tensor; - std::vector<lite::Tensor *> outputs_tensor; - - ArithmeticParameter mul_param; -
mul_param.broadcasting_ = true; - mul_param.op_parameter_.type_ = schema::PrimitiveType_Mul; - mul_param.ndim_ = 4; - mul_param.activation_type_ = schema::ActivationType_RELU; - mul_param.in_shape0_[0] = 1; - mul_param.in_shape0_[1] = 2; - mul_param.in_shape0_[2] = 3; - mul_param.in_shape0_[3] = 4; - mul_param.in_shape1_[0] = 1; - mul_param.in_shape1_[1] = 1; - mul_param.in_shape1_[2] = 1; - mul_param.in_shape1_[3] = 4; - mul_param.out_shape_[0] = 1; - mul_param.out_shape_[1] = 2; - mul_param.out_shape_[2] = 3; - mul_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {0.16771512, 0.7336843, 0.6768286, 0.4453379}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&mul_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = {2.0488555, 2.4554152, 10.374036, 2.3313253, 0.13490601, 7.3017635, - 9.968302, 0, 2.3000166, 5.910435, 4.4566007, 0, - 0, 8.474085, 6.2687945, 0.5691575, 1.1281147, 0, - 2.547916, 0, 0, 9.992072, 0, 1.42546}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, MulRelu6Fp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter mul_param; - mul_param.broadcasting_ = true; - mul_param.op_parameter_.type_ = schema::PrimitiveType_Mul; - mul_param.ndim_ = 4; - mul_param.activation_type_ = schema::ActivationType_RELU6; - mul_param.in_shape0_[0] = 1; - mul_param.in_shape0_[1] = 2; - mul_param.in_shape0_[2] = 3; - mul_param.in_shape0_[3] = 4; - mul_param.in_shape1_[0] = 1; - mul_param.in_shape1_[1] = 1; - mul_param.in_shape1_[2] = 1; - mul_param.in_shape1_[3] = 4; - mul_param.out_shape_[0] = 1; - mul_param.out_shape_[1] = 2; - mul_param.out_shape_[2] = 3; - mul_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 
3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {0.16771512, 0.7336843, 0.6768286, 0.4453379}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&mul_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = {2.0488555, 2.4554152, 6, 2.3313253, 0.13490601, 6, 6, 0, - 2.3000166, 5.910435, 4.4566007, 0, 0, 6, 6, 0.5691575, - 1.1281147, 0, 2.547916, 0, 0, 6, 0, 1.42546}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, MulInt0) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 1, 1, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int in1_data[3] = {3, 2, 1}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_NO_ACTIVATION; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 2, 2, 9, 8, 5, 18, 14, 8, 27, 20, 11}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulInt1) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int in1_data[1] = {2}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_NO_ACTIVATION; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulInt2) { - std::vector input0_shape{1}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[1] = {2}; - int in1_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int 
out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_NO_ACTIVATION; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulInt3) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = false; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; - int in1_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_NO_ACTIVATION; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulReluInt0) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 1, 1, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int in1_data[3] = {-1, 1, 1}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 1, 2, 0, 4, 5, 0, 7, 8, 0, 10, 11}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulReluInt1) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int in1_data[1] = {1}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 11}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulReluInt2) { - std::vector input0_shape{1}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[1] = {1}; - int in1_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 11}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulReluInt3) { - std::vector 
input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = false; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - int in1_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 11}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulRelu6Int0) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 1, 1, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int in1_data[3] = {-1, 1, 1}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU6; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 1, 2, 0, 4, 5, 0, 6, 6, 0, 6, 6}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulRelu6Int1) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int in1_data[1] = {1}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU6; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulRelu6Int2) { - std::vector input0_shape{1}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = true; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[1] = {1}; - int in1_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU6; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, MulRelu6Int3) { - std::vector input0_shape{1, 2, 2, 3}; - std::vector input1_shape{1, 2, 2, 3}; - bool broadcast = false; - std::vector output_shape{1, 2, 2, 3}; - int in0_data[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - int in1_data[12] = {0, -1, -2, -3, -4, -5, 6, 7, 8, 9, 10, 11}; - int out_data[12] = {0}; - schema::PrimitiveType type = schema::PrimitiveType_Mul; - int act_type = schema::ActivationType_RELU6; - int thread_num = 2; - desc_.type = type; - PrepareInt(input0_shape, input1_shape, broadcast, output_shape, in0_data, in1_data, out_data, type, act_type, - 
thread_num); - kernel_->Run(); - - int correct_data[12] = {0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6}; - - ASSERT_EQ(0, CompareOutputData(out_data, correct_data, 12, err_tol)); -} - -TEST_F(TestArithmeticTestFp32, AddReluFp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter add_param; - add_param.broadcasting_ = true; - add_param.op_parameter_.type_ = schema::PrimitiveType_Add; - add_param.ndim_ = 4; - add_param.activation_type_ = schema::ActivationType_RELU; - add_param.in_shape0_[0] = 1; - add_param.in_shape0_[1] = 2; - add_param.in_shape0_[2] = 3; - add_param.in_shape0_[3] = 4; - add_param.in_shape1_[0] = 1; - add_param.in_shape1_[1] = 1; - add_param.in_shape1_[2] = 1; - add_param.in_shape1_[3] = 4; - add_param.out_shape_[0] = 1; - add_param.out_shape_[1] = 2; - add_param.out_shape_[2] = 3; - add_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {0.9035316, 0.022212252, 0.3038014, 0.3478275}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&add_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = { - 13.119816, 3.368904, 15.631221, 5.5827856, 1.7079077, 9.9744, 15.031756, 0, 14.617362, 8.078041, 6.888335, 0, 0, - 11.572254, 9.565813, 1.6258626, 7.629906, 0, 4.0682936, 0, 0, 13.641247, 0, 3.548678}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, AddRelu6Fp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter add_param; - add_param.broadcasting_ = true; - add_param.op_parameter_.type_ = schema::PrimitiveType_Add; - add_param.ndim_ = 4; - add_param.activation_type_ = schema::ActivationType_RELU6; - add_param.in_shape0_[0] = 1; - add_param.in_shape0_[1] = 2; - add_param.in_shape0_[2] = 3; - add_param.in_shape0_[3] = 4; - add_param.in_shape1_[0] = 1; - add_param.in_shape1_[1] = 1; - add_param.in_shape1_[2] = 1; - add_param.in_shape1_[3] = 4; - add_param.out_shape_[0] = 1; - 
add_param.out_shape_[1] = 2; - add_param.out_shape_[2] = 3; - add_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {0.9035316, 0.022212252, 0.3038014, 0.3478275}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&add_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = {6, 3.368904, 6, 5.5827856, 1.7079077, 6, 6, 0, 6, 6, 6, 0, - 0, 6, 6, 1.6258626, 6, 0, 4.0682936, 0, 0, 6, 0, 3.548678}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, DivReluFp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter div_param; - div_param.broadcasting_ = true; - div_param.op_parameter_.type_ = schema::PrimitiveType_Div; - div_param.ndim_ = 4; - div_param.activation_type_ = schema::ActivationType_RELU; - div_param.in_shape0_[0] = 1; - div_param.in_shape0_[1] = 2; - div_param.in_shape0_[2] = 3; - div_param.in_shape0_[3] = 4; - div_param.in_shape1_[0] = 1; - div_param.in_shape1_[1] = 1; - div_param.in_shape1_[2] = 1; - div_param.in_shape1_[3] = 4; - div_param.out_shape_[0] = 1; - div_param.out_shape_[1] = 2; - div_param.out_shape_[2] = 3; - div_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {1.6771512, -7.336843, 0.6768286, 4.453379}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - 
inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&div_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = {7.28394912, 0, 22.64593872, 1.17550247, 0.47960852, 0, - 21.76024329, 0, 8.17685967, 0, 9.72850985, 0, - 0, 0, 13.68442764, 0.28698101, 4.01059523, 0.53567243, - 5.56195764, 0, 0, 0, 0, 0.71874648}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, DivRelu6Fp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter div_param; - div_param.broadcasting_ = true; - div_param.op_parameter_.type_ = schema::PrimitiveType_Div; - div_param.ndim_ = 4; - div_param.activation_type_ = schema::ActivationType_RELU6; - div_param.in_shape0_[0] = 1; - div_param.in_shape0_[1] = 2; - div_param.in_shape0_[2] = 3; - div_param.in_shape0_[3] = 4; - div_param.in_shape1_[0] = 1; - div_param.in_shape1_[1] = 1; - div_param.in_shape1_[2] = 1; - div_param.in_shape1_[3] = 4; - div_param.out_shape_[0] = 1; - div_param.out_shape_[1] = 2; - div_param.out_shape_[2] = 3; - div_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {1.6771512, -7.336843, 0.6768286, 4.453379}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&div_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - 
std::vector correct_out = {6, 0, 6, 1.17550247, 0.47960852, 0, 6, 0, 6, 0, 6, 0, - 0, 0, 6, 0.28698101, 4.01059523, 0.53567243, 5.56195764, 0, 0, 0, 0, 0.71874648}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} - -TEST_F(TestArithmeticTestFp32, EqualFp32) { - std::vector inputs_tensor; - std::vector outputs_tensor; - - ArithmeticParameter equal_param; - equal_param.broadcasting_ = true; - equal_param.op_parameter_.type_ = schema::PrimitiveType_Equal; - equal_param.ndim_ = 4; - equal_param.in_shape0_[0] = 1; - equal_param.in_shape0_[1] = 2; - equal_param.in_shape0_[2] = 3; - equal_param.in_shape0_[3] = 4; - equal_param.in_shape1_[0] = 1; - equal_param.in_shape1_[1] = 1; - equal_param.in_shape1_[2] = 1; - equal_param.in_shape1_[3] = 4; - equal_param.out_shape_[0] = 1; - equal_param.out_shape_[1] = 2; - equal_param.out_shape_[2] = 3; - equal_param.out_shape_[3] = 4; - - /* 1x2x3x4 NHWC */ - std::vector input0 = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, - 14.727955, -8.080715, 13.71383, 8.055829, 6.5845337, -9.25232, - -4.24519, 11.550042, 9.262012, 1.2780352, 6.7263746, -3.9301445, - 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; - std::vector input0_shape = {1, 2, 3, 4}; - std::vector input1 = {0.16771512, 3.3466918, 0.6768286, 3.2008505}; - std::vector input1_shape = {1, 1, 1, 4}; - - lite::Tensor input0_tensor; - lite::Tensor input1_tensor; - input0_tensor.set_data_type(kNumberTypeFloat32); - input0_tensor.set_data(input0.data()); - input1_tensor.set_data(input1.data()); - input0_tensor.set_shape(input0_shape); - input1_tensor.set_shape(input1_shape); - inputs_tensor.push_back(&input0_tensor); - inputs_tensor.push_back(&input1_tensor); - - std::vector output(24); - std::vector output_shape = {1, 2, 3, 4}; - - lite::Tensor output0_tensor; - outputs_tensor.push_back(&output0_tensor); - output0_tensor.set_data(output.data()); - output0_tensor.set_shape(output_shape); - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Eltwise}; - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - ASSERT_NE(creator, nullptr); - lite::InnerContext ctx; - ctx.thread_num_ = 3; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - kernel::LiteKernel *kernel = - creator(inputs_tensor, outputs_tensor, reinterpret_cast(&equal_param), &ctx, desc, nullptr); - ASSERT_NE(kernel, nullptr); - auto output_tensor_shape = output0_tensor.shape(); - kernel->Run(); - - std::vector correct_out = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; - auto correct_out_ptr = correct_out.data(); - - ASSERT_EQ(0, CompareOutputData(output.data(), correct_out_ptr, 24, 0.00001)); - - input0_tensor.set_data(nullptr); - input1_tensor.set_data(nullptr); - output0_tensor.set_data(nullptr); -} -} // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc index 4e73ad62d8..4e52063652 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc @@ -16,7 +16,7 @@ #include "src/common/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/nnacl/batch_to_space.h" -#include 
"mindspore/lite/nnacl/arithmetic_common.h" +#include "mindspore/lite/nnacl/common_func.h" namespace mindspore { diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc index e280b04a19..0578b2bd6d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc @@ -16,7 +16,7 @@ #include "src/common/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/nnacl/depth_to_space.h" -#include "mindspore/lite/nnacl/arithmetic_common.h" +#include "mindspore/lite/nnacl/common_func.h" namespace mindspore { diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/bias_add_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/bias_add_int8_tests.cc deleted file mode 100644 index e1ac3b57f9..0000000000 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/bias_add_int8_tests.cc +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include "schema/inner/model_generated.h" -#include "common/common_test.h" -#include "mindspore/lite/src/runtime/kernel/arm/int8/bias_add_int8.h" -#include "mindspore/lite/src/kernel_registry.h" - -using mindspore::lite::DeviceType; - -namespace mindspore { -class TestBiasAddInt8 : public mindspore::CommonTest { - public: - TestBiasAddInt8() {} -}; - -TEST_F(TestBiasAddInt8, BiasAdd) { - lite::Tensor in_tensor0(kNumberTypeInt8, {1, 2, 3, 2}); - lite::Tensor in_tensor1(kNumberTypeInt8, {2}); - lite::Tensor out_tensor(kNumberTypeInt8, {1, 2, 3, 2}); - int8_t input_data0[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - int8_t input_data1[] = {1, 1}; - int8_t output_data[12] = {0}; - in_tensor0.set_data(input_data0); - in_tensor1.set_data(input_data1); - out_tensor.set_data(output_data); - std::vector inputs = {&in_tensor0, &in_tensor1}; - std::vector outputs = {&out_tensor}; - - ArithmeticParameter parameter = {}; - int dims[] = {1, 2, 3, 4}; - parameter.ndim_ = 4; - for (int i = 0; i < 4; i++) { - parameter.in_shape0_[i] = dims[i]; - parameter.in_shape1_[i] = 1; - parameter.out_shape_[i] = dims[i]; - } - parameter.in_shape1_[3] = dims[3]; - - kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_BiasAdd}; - - auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); - EXPECT_NE(creator, nullptr); - - auto ctx = std::make_shared(); - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto kernel = creator(inputs, outputs, reinterpret_cast(¶meter), ctx.get(), desc, nullptr); - EXPECT_NE(kernel, nullptr); - - auto ret = kernel->Run(); - EXPECT_EQ(0, ret); - - float expect[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}; - for (int i = 0; i < 12; ++i) { - EXPECT_EQ(output_data[i], expect[i]); - } - - in_tensor0.set_data(nullptr); - in_tensor1.set_data(nullptr); - 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
index ef924c9b12..87bad0fdf2 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "ut/src/runtime/kernel/opencl/common.h"
-#include "nnacl/arithmetic_common.h"
+#include "nnacl/arithmetic.h"
 
 namespace mindspore::lite::opencl::test {