diff --git a/mindspore/lite/nnacl/common_func.c b/mindspore/lite/nnacl/common_func.c
index 6dfbb1c480..a6e3f26593 100644
--- a/mindspore/lite/nnacl/common_func.c
+++ b/mindspore/lite/nnacl/common_func.c
@@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
 int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); }
 
 void ReluFp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    relu_data = vmaxq_f32(relu_data, zero_data);
-    vst1q_f32(dst + index, relu_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-#endif
-  }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int index = 0;
+#ifdef ENABLE_AVX
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    relu_data = MS_MAX256_F32(relu_data, zero_data);
+    MS_ST256_F32(dst + index, relu_data);
   }
-}
-
-void Relu6Fp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu6_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    float32x4_t six_data = vdupq_n_f32(6);
-    relu6_data = vmaxq_f32(relu6_data, zero_data);
-    relu6_data = vminq_f32(relu6_data, six_data);
-    vst1q_f32(dst + index, relu6_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
 #endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    relu_data = MS_MAXQ_F32(relu_data, zero_data);
+    MS_STQ_F32(dst + index, relu_data);
   }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
   }
 }
 
+void Relu6Fp32(float *data, float *dst, int ele_num) {
+  int index = 0;
 #ifdef ENABLE_AVX
-#ifdef WIN32
-void ReluFp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-  }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f);
+    relu6_data = MS_MAX256_F32(relu6_data, zero_data);
+    relu6_data = MS_MIN256_F32(relu6_data, six_data);
+    MS_ST256_F32(dst + index, relu6_data);
   }
-}
+#endif
 
-void Relu6Fp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-    data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7];
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f);
+    relu6_data = MS_MAXQ_F32(relu6_data, zero_data);
+    relu6_data = MS_MINQ_F32(relu6_data, six_data);
+    MS_STQ_F32(dst + index, relu6_data);
   }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
+    data[index] = data[index] > 6.0f ? 6.0f : data[index];
   }
 }
-#endif
-#endif
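The rewrite above replaces the old UP_DIV-based four-at-a-time loops (and the Windows-only scalar C8 fallbacks) with a tiered dispatch: an 8-lane AVX loop, then a 4-lane NEON/SSE loop, then a scalar tail, all sharing one running index so each tier picks up exactly where the previous one stopped. Below is a minimal sketch of that structure, using raw x86 intrinsics in place of MindSpore's MS_* wrapper macros; relu_fp32_sketch is a hypothetical name, and the __AVX__/__SSE__ compiler guards stand in for the ENABLE_AVX/ENABLE_SSE build flags, so this illustrates the pattern rather than the library code itself.

// Tiered SIMD ReLU sketch: widest vectors first, scalar tail last.
#include <immintrin.h>

void relu_fp32_sketch(const float *data, float *dst, int ele_num) {
  int index = 0;
#ifdef __AVX__
  // Round ele_num down to a multiple of 8, which is what
  // DOWN_DIV(ele_num, C8NUM) * C8NUM computes, and clamp 8 floats per step.
  int c8_block = (ele_num / 8) * 8;
  for (; index < c8_block; index += 8) {
    __m256 v = _mm256_loadu_ps(data + index);
    _mm256_storeu_ps(dst + index, _mm256_max_ps(v, _mm256_setzero_ps()));
  }
#endif
#ifdef __SSE__
  // The 4-wide tier starts at the shared index, handling what the
  // 8-wide loop left behind, 4 floats per step.
  int c4_block = (ele_num / 4) * 4;
  for (; index < c4_block; index += 4) {
    __m128 v = _mm_loadu_ps(data + index);
    _mm_storeu_ps(dst + index, _mm_max_ps(v, _mm_setzero_ps()));
  }
#endif
  // Scalar tail for the final 0-3 elements; nothing is over-read,
  // unlike the old UP_DIV loop, which rounded the block count up.
  for (; index < ele_num; ++index) {
    dst[index] = data[index] < 0.0f ? 0.0f : data[index];
  }
}

Relu6 follows the identical structure with an extra min-against-6 in every tier.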
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
similarity index 71%
rename from mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
index 7b97a6d1dc..f7f9bcea67 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
@@ -17,7 +17,7 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "schema/model_generated.h"
-#include "src/runtime/kernel/arm/fp16/bias_fp16.h"
+#include "src/runtime/kernel/arm/fp16/biasadd_fp16.h"
 #include "src/kernel_registry.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_BiasAdd;
 
 namespace mindspore::kernel {
 
-int BiasCPUFp16Kernel::ReSize() {
+int BiasAddCPUFp16Kernel::ReSize() {
   auto dims = in_tensors_.at(0)->shape();
   bias_param_->ndim_ = dims.size();
   if (bias_param_->ndim_ < 1 || bias_param_->ndim_ > 5) {
@@ -45,13 +45,20 @@
   return RET_OK;
 }
 
-int BiasCPUFp16Kernel::Run() {
+int BiasAddCPUFp16Kernel::Run() {
+  if (bias_data_ == nullptr) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData failed in Run!";
+      return ret;
+    }
+  }
   auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
   auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
   size_t data_size = in_tensors_.at(0)->ElementsNum();
   MS_ASSERT(context_->allocator != nullptr);
-  auto *tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
-  auto *tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
   if (tile_in == nullptr || tile_bias == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
     context_->allocator->Free(tile_in);
@@ -64,43 +71,54 @@
   return RET_OK;
 }
 
-BiasCPUFp16Kernel::~BiasCPUFp16Kernel() {
+BiasAddCPUFp16Kernel::~BiasAddCPUFp16Kernel() {
   if ((bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) && bias_data_ != nullptr) {
     free(bias_data_);
     bias_data_ = nullptr;
   }
 }
 
-int BiasCPUFp16Kernel::Init() {
-  auto bias_tensor = in_tensors_.at(1);
-  MS_ASSERT(bias_tensor != nullptr);
-  bias_data_type_ = bias_tensor->data_type();
+int BiasAddCPUFp16Kernel::GetBiasData() {
+  bias_data_type_ = bias_tensor_->data_type();
   if (bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) {
-    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor->ElementsNum() * sizeof(float16_t)));
+    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor_->ElementsNum() * sizeof(float16_t)));
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
-    auto *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+    auto bias = reinterpret_cast<float *>(bias_tensor_->MutableData());
    if (bias == nullptr) {
      MS_LOG(ERROR) << "bias is nullptr!";
       return RET_NULL_PTR;
     }
-    for (int i = 0; i < bias_tensor->ElementsNum(); ++i) {
+    for (int i = 0; i < bias_tensor_->ElementsNum(); ++i) {
       bias_data_[i] = (float16_t)(bias[i]);
     }
   } else {
-    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
+    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor_->MutableData());
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
   }
+  return RET_OK;
+}
+
+int BiasAddCPUFp16Kernel::Init() {
+  bias_tensor_ = in_tensors_.at(1);
+  MS_ASSERT(bias_tensor_ != nullptr);
+  if (bias_tensor_->IsConst()) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData failed in Init!";
+      return ret;
+    }
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasCPUFp16Kernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasAddCPUFp16Kernel>)
 }  // namespace mindspore::kernel
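Beyond the BiasCPUFp16Kernel to BiasAddCPUFp16Kernel rename, the .cc change above moves bias preparation out of Init() into a new GetBiasData() helper: a constant bias tensor is converted to fp16 once at Init(), while a non-const bias (produced at runtime by an upstream node, so its data does not yet exist at Init()) is converted lazily on the first Run(). A toy sketch of that pattern follows; ToyTensor and ToyKernel are hypothetical stand-ins, not MindSpore types, and the fp16 conversion is elided to keep the sketch self-contained.

#include <cstdlib>

struct ToyTensor {
  bool is_const = false;
  int elements = 0;
  const float *data = nullptr;  // fp32 payload in this sketch
};

struct ToyKernel {
  ToyTensor *bias_tensor = nullptr;
  float *bias_copy = nullptr;  // stands in for the kernel's float16_t buffer

  // Mirrors GetBiasData(): allocate a kernel-owned copy and convert into it.
  int GetBiasData() {
    bias_copy = static_cast<float *>(std::malloc(bias_tensor->elements * sizeof(float)));
    if (bias_copy == nullptr) return -1;
    for (int i = 0; i < bias_tensor->elements; ++i) {
      bias_copy[i] = bias_tensor->data[i];  // the real kernel casts to float16_t here
    }
    return 0;
  }

  // Constant bias: its data already exists, so convert eagerly at Init().
  int Init() { return bias_tensor->is_const ? GetBiasData() : 0; }

  // Non-const bias: data exists only after upstream kernels have run,
  // so convert on first use.
  int Run() {
    if (bias_copy == nullptr && GetBiasData() != 0) return -1;
    // ... broadcast-add bias_copy onto the input tensor here ...
    return 0;
  }

  ~ToyKernel() { std::free(bias_copy); }
};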
<< "bias_data_ is nullptr"; return RET_NULL_PTR; } } + return RET_OK; +} + +int BiasAddCPUFp16Kernel::Init() { + bias_tensor_ = in_tensors_.at(1); + MS_ASSERT(bias_tensor_ != nullptr); + if (bias_tensor_->IsConst()) { + auto ret = GetBiasData(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "GetBiasData is error in Init()!"; + return ret; + } + } if (!InferShapeDone()) { return RET_OK; } return ReSize(); } -REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h similarity index 67% rename from mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h rename to mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h index 6d00b75082..20f8a91eb4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h @@ -14,31 +14,33 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_ #include #include "src/lite_kernel.h" #include "nnacl/fp16/arithmetic_fp16.h" namespace mindspore::kernel { -class BiasCPUFp16Kernel : public LiteKernel { +class BiasAddCPUFp16Kernel : public LiteKernel { public: - BiasCPUFp16Kernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs, const lite::InnerContext *ctx) + BiasAddCPUFp16Kernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) : LiteKernel(parameter, inputs, outputs, ctx) { bias_param_ = reinterpret_cast(parameter); } - ~BiasCPUFp16Kernel() override; + ~BiasAddCPUFp16Kernel() override; int Init() override; int ReSize() override; int Run() override; private: + int GetBiasData(); ArithmeticParameter *bias_param_ = nullptr; float16_t *bias_data_ = nullptr; + lite::Tensor *bias_tensor_ = nullptr; TypeId bias_data_type_; }; } // namespace mindspore::kernel -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_