!11956 [MSLITE]remove mul/add/div/sub from arithmetic_fp32

From: @wangchengyuan Reviewed-by: @zhang_xue_tong Signed-off-by: @zhang_xue_tong
4 years ago · 2575310c12
parent 65f1d90509 fe90be3bb4
commit 2575310c12
13 changed files with 1127 additions and 864 deletions
--- a/mindspore/lite/nnacl/fp32/add_fp32.c
+++ b/mindspore/lite/nnacl/fp32/add_fp32.c
@ -0,0 +1,225 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/add_fp32.h"
+#include "nnacl/fp32/arithmetic_fp32.h"
+
+/*
+ * Float addition where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] + in1[i]; otherwise
+ * out[i] = in0[i] + in1[0]. Writes `size` results and always returns NNACL_OK.
+ */
+int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single value. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      vst1q_f32(out + i, vaddq_f32(vld1q_f32(in0 + i), rhs_splat));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = in0[i] + in1[0];
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single value. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    vst1q_f32(out + i, vaddq_f32(lhs_splat, vld1q_f32(in1 + i)));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[0] + in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Integer addition where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] + in1[i]; otherwise
+ * out[i] = in0[i] + in1[0]. Writes `size` results and always returns NNACL_OK.
+ */
+int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
+  int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single value. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      vst1q_s32(out + i, vaddq_s32(vld1q_s32(in0 + i), rhs_splat));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = in0[i] + in1[0];
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single value. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    vst1q_s32(out + i, vaddq_s32(lhs_splat, vld1q_s32(in1 + i)));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[0] + in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float addition with one single-value operand, followed by a clamp at zero.
+ *
+ * If param->in_elements_num0_ == 1 the sum is in0[0] + in1[i]; otherwise it is
+ * in0[i] + in1[0]. Each sum is passed through max(sum, 0) before being stored.
+ * Always returns NNACL_OK.
+ */
+int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single value. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
+      vst1q_f32(out + i, vmaxq_f32(sum, floor_v));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = MSMAX(in0[i] + in1[0], 0);
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single value. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
+    vst1q_f32(out + i, vmaxq_f32(sum, floor_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMAX(in0[0] + in1[i], 0);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float addition with one single-value operand, followed by a clamp to [0, 6].
+ *
+ * If param->in_elements_num0_ == 1 the sum is in0[0] + in1[i]; otherwise it is
+ * in0[i] + in1[0]. Each sum is passed through min(max(sum, 0), 6) before being
+ * stored. Always returns NNACL_OK.
+ */
+int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  float32x4_t ceil_v = vdupq_n_f32(6.0f);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single value. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
+      vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, floor_v), ceil_v));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = MSMIN(MSMAX(in0[i] + in1[0], 0), 6);
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single value. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
+    vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, floor_v), ceil_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[0] + in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Broadcasting float addition.
+ *
+ * Hands in0/in1 to TileDimensionsFp32 together with the scratch buffers
+ * tile_in0/tile_in1 and `param`, then adds the two scratch buffers
+ * element-wise into `out` and returns ElementAdd's status.
+ * NOTE(review): TileDimensionsFp32 is defined outside this file; presumably it
+ * expands both operands to the broadcast shape - confirm against its definition.
+ */
+int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param) {
+  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
+  const int status = ElementAdd(tile_in0, tile_in1, out, size);
+  return status;
+}
+
+/* Element-wise float addition: out[i] = in0[i] + in1[i] for i in [0, size). Always returns NNACL_OK. */
+int ElementAdd(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    vst1q_f32(out + i, vaddq_f32(lhs, rhs));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[i] + in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float addition followed by a clamp at zero:
+ * out[i] = (in0[i] + in1[i]) if that sum is > 0, else 0. Always returns NNACL_OK.
+ */
+int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    float32x4_t sum = vaddq_f32(lhs, rhs);
+    /* Keep lanes whose sum compares greater than zero; every other lane becomes zero. */
+    uint32x4_t is_positive = vcgtq_f32(sum, floor_v);
+    vst1q_f32(out + i, vbslq_f32(is_positive, sum, floor_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    float sum = in0[i] + in1[i];
+    if (sum > 0) {
+      out[i] = sum;
+    } else {
+      out[i] = 0;
+    }
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float addition followed by a clamp to [0, 6]:
+ * out[i] = min(max(in0[i] + in1[i], 0), 6). Always returns NNACL_OK.
+ */
+int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  float32x4_t ceil_v = vdupq_n_f32(6.0f);
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    float32x4_t sum = vaddq_f32(lhs, rhs);
+    vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, floor_v), ceil_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[i] + in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/* Element-wise integer addition: out[i] = in0[i] + in1[i] for i in [0, size). Always returns NNACL_OK. */
+int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    int32x4_t lhs = vld1q_s32(in0 + i);
+    int32x4_t rhs = vld1q_s32(in1 + i);
+    vst1q_s32(out + i, vaddq_s32(lhs, rhs));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[i] + in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
--- a/mindspore/lite/nnacl/fp32/add_fp32.h
+++ b/mindspore/lite/nnacl/fp32/add_fp32.h
@ -0,0 +1,45 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_
+#define MINDSPORE_LITE_NNACL_FP32_ADD_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementAdd(const float *in0, const float *in1, float *out, int size);
+int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
+int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementAddInt(const int *in0, const int *in1, int *out, int size);
+int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_ADD_H_
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
@ -22,6 +22,11 @@
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/fp32/add_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
+#include "nnacl/fp32/div_fp32.h"
+#include "nnacl/fp32/sub_fp32.h"
+#include "nnacl/fp32/squared_difference.h"

 #ifdef __cplusplus
 extern "C" {
@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n
                          const int *inStrides, const int *outStrides, const int *multiple);
 void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1,
                        ArithmeticParameter *param);
-
-/* Mul */
-int ElementMul(const float *in0, const float *in1, float *out, int size);
-int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
-int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementMulInt(const int *in0, const int *in1, int *out, int size);
-int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
-int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
-int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
-/* Add */
-int ElementAdd(const float *in0, const float *in1, float *out, int size);
-int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
-int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementAddInt(const int *in0, const int *in1, int *out, int size);
-int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
-/* Sub */
-int ElementSub(const float *in0, const float *in1, float *out, int size);
-int ElementSubInt(const int *in0, const int *in1, int *out, int size);
-int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
-int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-
-/* Div */
-int ElementDiv(const float *in0, const float *in1, float *out, int size);
-int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
-int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
 /* logical and */
 int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size);
 int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size);
@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
 /* logical or */
 int ElementLogicalOr(const float *in0, const float *in1, float *out, int size);

-/* Element Squared Difference */
-int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);
-
 /* max min */
 int ElementMaximum(const float *in0, const float *in1, float *out, int size);
 int ElementMinimum(const float *in0, const float *in1, float *out, int size);
--- a/mindspore/lite/nnacl/fp32/div_fp32.c
+++ b/mindspore/lite/nnacl/fp32/div_fp32.c
@ -0,0 +1,107 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/div_fp32.h"
+#include <math.h>
+#include "nnacl/fp32/arithmetic_fp32.h"
+
+/*
+ * Float division where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] / in1[i] and the per-element
+ * divisors are not checked. Otherwise out[i] = in0[i] / in1[0], and
+ * NNACL_ERRCODE_DIVISOR_ZERO is returned without writing anything when
+ * in1[0] == 0. Returns NNACL_OK in every other case.
+ */
+int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    /* in0 supplies the single numerator. */
+    int i = 0;
+    while (i < size) {
+      out[i] = in0[0] / in1[i];
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in1 supplies the single divisor; reject a zero divisor before touching out. */
+  if (in1[0] == 0) {
+    return NNACL_ERRCODE_DIVISOR_ZERO;
+  }
+  int i = 0;
+  while (i < size) {
+    out[i] = in0[i] / in1[0];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float division with one single-value operand, followed by a clamp at zero.
+ *
+ * If param->in_elements_num0_ == 1 the quotient is in0[0] / in1[i]; otherwise it
+ * is in0[i] / in1[0]. The quotient is first stored to out[i], then out[i] is
+ * replaced by zero unless it compares greater than zero. No divisor is checked
+ * for zero here. Always returns NNACL_OK.
+ */
+int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single divisor. */
+    int i = 0;
+    while (i < size) {
+      out[i] = in0[i] / in1[0];
+      if (!(out[i] > 0)) {
+        out[i] = 0;
+      }
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single numerator. */
+  int i = 0;
+  while (i < size) {
+    out[i] = in0[0] / in1[i];
+    if (!(out[i] > 0)) {
+      out[i] = 0;
+    }
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float division with one single-value operand, followed by a clamp to [0, 6].
+ *
+ * If param->in_elements_num0_ == 1 the quotient is in0[0] / in1[i]; otherwise it
+ * is in0[i] / in1[0]. Each quotient is passed through MSMIN(MSMAX(q, 0), 6).
+ * No divisor is checked for zero here. Always returns NNACL_OK.
+ */
+int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single divisor. */
+    int i = 0;
+    while (i < size) {
+      out[i] = MSMIN(MSMAX(in0[i] / in1[0], 0), 6);
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single numerator. */
+  int i = 0;
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[0] / in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Integer division where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] / in1[i]; otherwise
+ * out[i] = in0[i] / in1[0].
+ *
+ * Returns NNACL_ERRCODE_DIVISOR_ZERO as soon as a zero divisor is encountered.
+ * In the single-numerator branch the elements preceding the zero divisor have
+ * already been written to `out`; in the single-divisor branch nothing is
+ * written. Returns NNACL_OK when every division was performed.
+ */
+int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    for (int index = 0; index < size; index++) {
+      /* Integer division by zero is undefined behaviour, so reject it exactly like the branch below does. */
+      if (in1[index] == 0) {
+        return NNACL_ERRCODE_DIVISOR_ZERO;
+      }
+      out[index] = in0[0] / in1[index];
+    }
+  } else {
+    /* Single divisor: validate it once before the loop. */
+    if (in1[0] == 0) {
+      return NNACL_ERRCODE_DIVISOR_ZERO;
+    }
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[index] / in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Broadcasting float division.
+ *
+ * Hands in0/in1 to TileDimensionsFp32 together with the scratch buffers
+ * tile_in0/tile_in1 and `param`, then divides the two scratch buffers
+ * element-wise into `out` and returns ElementDiv's status.
+ * NOTE(review): TileDimensionsFp32 is defined outside this file; presumably it
+ * expands both operands to the broadcast shape - confirm against its definition.
+ */
+int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param) {
+  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
+  const int status = ElementDiv(tile_in0, tile_in1, out, size);
+  return status;
+}
+
+/*
+ * Element-wise float division: out[i] = in0[i] / in1[i] for i in [0, size).
+ * Divisors are not checked for zero. Always returns NNACL_OK.
+ */
+int ElementDiv(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+  while (i < size) {
+    out[i] = in0[i] / in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float division followed by a clamp at zero:
+ * out[i] = (in0[i] / in1[i]) if that quotient is > 0, else 0.
+ * Divisors are not checked for zero. Always returns NNACL_OK.
+ */
+int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+  while (i < size) {
+    float quotient = in0[i] / in1[i];
+    if (quotient > 0) {
+      out[i] = quotient;
+    } else {
+      out[i] = 0;
+    }
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float division followed by a clamp to [0, 6]:
+ * out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6).
+ * Divisors are not checked for zero. Always returns NNACL_OK.
+ */
+int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
--- a/mindspore/lite/nnacl/fp32/div_fp32.h
+++ b/mindspore/lite/nnacl/fp32/div_fp32.h
@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_
+#define MINDSPORE_LITE_NNACL_FP32_DIV_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int ElementDiv(const float *in0, const float *in1, float *out, int size);
+int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
+int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_DIV_H_
--- a/mindspore/lite/nnacl/fp32/lstm_fp32.c
+++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c
@ -19,6 +19,7 @@
 #include <float.h>
 #include "nnacl/fp32/activation_fp32.h"
 #include "nnacl/fp32/arithmetic_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"

 void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) {
  int gate_offest = 0;
--- a/mindspore/lite/nnacl/fp32/mul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.c
--- a/mindspore/lite/nnacl/fp32/mul_fp32.h
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.h
@ -0,0 +1,49 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_
+#define MINDSPORE_LITE_NNACL_FP32_MUL_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementMul(const float *in0, const float *in1, float *out, int size);
+int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
+int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementMulInt(const int *in0, const int *in1, int *out, int size);
+int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
+int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
+int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_MUL_H_
--- a/mindspore/lite/nnacl/fp32/squared_difference.c
+++ b/mindspore/lite/nnacl/fp32/squared_difference.c
@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * No include guard here: this is a translation unit, not a header. Defining
+ * MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ before the include below would make
+ * squared_difference.h expand to nothing, so the definition would never be
+ * checked against its own prototype.
+ */
+#include "nnacl/fp32/squared_difference.h"
+#include "nnacl/fp32/sub_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
+
+/*
+ * Element-wise squared difference: out[i] = (in0[i] - in1[i])^2 for i in [0, size).
+ *
+ * `out` is used as scratch: the difference is written to it first and then
+ * multiplied by itself in place. Returns the status of the failing step if the
+ * subtraction does not report NNACL_OK, otherwise the status of ElementMul.
+ */
+int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
+  int ret = ElementSub(in0, in1, out, size);
+  if (ret != NNACL_OK) {
+    /* Do not square a buffer the subtraction step failed to fill. */
+    return ret;
+  }
+  return ElementMul(out, out, out, size);
+}
--- a/mindspore/lite/nnacl/fp32/squared_difference.h
+++ b/mindspore/lite/nnacl/fp32/squared_difference.h
@ -0,0 +1,37 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Element Squared Difference */
+int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
--- a/mindspore/lite/nnacl/fp32/sub_fp32.c
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.c
@ -0,0 +1,217 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp32/sub_fp32.h"
+
+/*
+ * Float subtraction where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] - in1[i]; otherwise
+ * out[i] = in0[i] - in1[0]. Writes `size` results and always returns NNACL_OK.
+ */
+int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single subtrahend. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      vst1q_f32(out + i, vsubq_f32(vld1q_f32(in0 + i), rhs_splat));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = in0[i] - in1[0];
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single minuend. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    vst1q_f32(out + i, vsubq_f32(lhs_splat, vld1q_f32(in1 + i)));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[0] - in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Integer subtraction where one operand contributes a single value.
+ *
+ * If param->in_elements_num0_ == 1, out[i] = in0[0] - in1[i]; otherwise
+ * out[i] = in0[i] - in1[0]. Writes `size` results and always returns NNACL_OK.
+ */
+int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
+  int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single subtrahend. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      vst1q_s32(out + i, vsubq_s32(vld1q_s32(in0 + i), rhs_splat));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = in0[i] - in1[0];
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single minuend. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    vst1q_s32(out + i, vsubq_s32(lhs_splat, vld1q_s32(in1 + i)));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[0] - in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float subtraction with one single-value operand, followed by a clamp at zero.
+ *
+ * If param->in_elements_num0_ == 1 the difference is in0[0] - in1[i]; otherwise
+ * it is in0[i] - in1[0]. Each difference is passed through max(diff, 0) before
+ * being stored. Always returns NNACL_OK.
+ */
+int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single subtrahend. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      float32x4_t diff = vsubq_f32(vld1q_f32(in0 + i), rhs_splat);
+      vst1q_f32(out + i, vmaxq_f32(diff, floor_v));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = MSMAX(in0[i] - in1[0], 0);
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single minuend. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    float32x4_t diff = vsubq_f32(lhs_splat, vld1q_f32(in1 + i));
+    vst1q_f32(out + i, vmaxq_f32(diff, floor_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMAX(in0[0] - in1[i], 0);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Float subtraction with one single-value operand, followed by a clamp to [0, 6].
+ *
+ * If param->in_elements_num0_ == 1 the difference is in0[0] - in1[i]; otherwise
+ * it is in0[i] - in1[0]. Each difference is passed through min(max(diff, 0), 6)
+ * before being stored. Always returns NNACL_OK.
+ */
+int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  /* Both first elements are broadcast up front, whichever branch is taken. */
+  float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
+  float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  float32x4_t ceil_v = vdupq_n_f32(6.0f);
+#endif
+  int i = 0;
+  if (param->in_elements_num0_ != 1) {
+    /* in1 supplies the single subtrahend. */
+#ifdef ENABLE_NEON
+    /* Vector body; the scalar loop below finishes the tail. */
+    while (i <= size - 4) {
+      float32x4_t diff = vsubq_f32(vld1q_f32(in0 + i), rhs_splat);
+      vst1q_f32(out + i, vminq_f32(vmaxq_f32(diff, floor_v), ceil_v));
+      i += C4NUM;
+    }
+#endif
+    while (i < size) {
+      out[i] = MSMIN(MSMAX(in0[i] - in1[0], 0), 6);
+      ++i;
+    }
+    return NNACL_OK;
+  }
+
+  /* in0 supplies the single minuend. */
+#ifdef ENABLE_NEON
+  while (i <= size - 4) {
+    float32x4_t diff = vsubq_f32(lhs_splat, vld1q_f32(in1 + i));
+    vst1q_f32(out + i, vminq_f32(vmaxq_f32(diff, floor_v), ceil_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[0] - in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/* Element-wise float subtraction: out[i] = in0[i] - in1[i] for i in [0, size). Always returns NNACL_OK. */
+int ElementSub(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    vst1q_f32(out + i, vsubq_f32(lhs, rhs));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[i] - in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/* Element-wise integer subtraction: out[i] = in0[i] - in1[i] for i in [0, size). Always returns NNACL_OK. */
+int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    int32x4_t lhs = vld1q_s32(in0 + i);
+    int32x4_t rhs = vld1q_s32(in1 + i);
+    vst1q_s32(out + i, vsubq_s32(lhs, rhs));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = in0[i] - in1[i];
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float subtraction followed by a clamp at zero:
+ * out[i] = (in0[i] - in1[i]) if that difference is > 0, else 0. Always returns NNACL_OK.
+ */
+int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    float32x4_t diff = vsubq_f32(lhs, rhs);
+    /* Keep lanes whose difference compares greater than zero; every other lane becomes zero. */
+    uint32x4_t is_positive = vcgtq_f32(diff, floor_v);
+    vst1q_f32(out + i, vbslq_f32(is_positive, diff, floor_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    float diff = in0[i] - in1[i];
+    if (diff > 0) {
+      out[i] = diff;
+    } else {
+      out[i] = 0;
+    }
+    ++i;
+  }
+  return NNACL_OK;
+}
+
+/*
+ * Element-wise float subtraction followed by a clamp to [0, 6]:
+ * out[i] = min(max(in0[i] - in1[i], 0), 6). Always returns NNACL_OK.
+ */
+int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float32x4_t floor_v = vdupq_n_f32(0.0f);
+  float32x4_t ceil_v = vdupq_n_f32(6.0f);
+  /* Vector body; the scalar loop below finishes the tail. */
+  while (i <= size - 4) {
+    float32x4_t lhs = vld1q_f32(in0 + i);
+    float32x4_t rhs = vld1q_f32(in1 + i);
+    float32x4_t diff = vsubq_f32(lhs, rhs);
+    vst1q_f32(out + i, vminq_f32(vmaxq_f32(diff, floor_v), ceil_v));
+    i += C4NUM;
+  }
+#endif
+  while (i < size) {
+    out[i] = MSMIN(MSMAX(in0[i] - in1[i], 0), 6);
+    ++i;
+  }
+  return NNACL_OK;
+}
--- a/mindspore/lite/nnacl/fp32/sub_fp32.h
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.h
@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_
+#define MINDSPORE_LITE_NNACL_SUB_FP32_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementSub(const float *in0, const float *in1, float *out, int size);
+int ElementSubInt(const int *in0, const int *in1, int *out, int size);
+int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
+int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_SUB_FP32_H_