diff --git a/mindspore/lite/nnacl/common_func.c b/mindspore/lite/nnacl/common_func.c
index 6dfbb1c480..a6e3f26593 100644
--- a/mindspore/lite/nnacl/common_func.c
+++ b/mindspore/lite/nnacl/common_func.c
@@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
 int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); }
 
 void ReluFp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    relu_data = vmaxq_f32(relu_data, zero_data);
-    vst1q_f32(dst + index, relu_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-#endif
-  }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int index = 0;
+#ifdef ENABLE_AVX
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    relu_data = MS_MAX256_F32(relu_data, zero_data);
+    MS_ST256_F32(dst + index, relu_data);
   }
-}
-
-void Relu6Fp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu6_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    float32x4_t six_data = vdupq_n_f32(6);
-    relu6_data = vmaxq_f32(relu6_data, zero_data);
-    relu6_data = vminq_f32(relu6_data, six_data);
-    vst1q_f32(dst + index, relu6_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
 #endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    relu_data = MS_MAXQ_F32(relu_data, zero_data);
+    MS_STQ_F32(dst + index, relu_data);
   }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
   }
 }
 
+void Relu6Fp32(float *data, float *dst, int ele_num) {
+  int index = 0;
 #ifdef ENABLE_AVX
-#ifdef WIN32
-void ReluFp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-  }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f);
+    relu6_data = MS_MAX256_F32(relu6_data, zero_data);
+    relu6_data = MS_MIN256_F32(relu6_data, six_data);
+    MS_ST256_F32(dst + index, relu6_data);
   }
-}
+#endif
 
-void Relu6Fp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-    data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7];
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f);
+    relu6_data = MS_MAXQ_F32(relu6_data, zero_data);
+    relu6_data = MS_MINQ_F32(relu6_data, six_data);
+    MS_STQ_F32(dst + index, relu6_data);
   }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
+    data[index] = data[index] > 6.0f ? 6.0f : data[index];
   }
 }
-#endif
-#endif
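The rewrite above replaces the old UP_DIV-based four-at-a-time loops (and the Windows-only scalar C8 fallbacks) with a tiered dispatch: an 8-lane AVX loop, then a 4-lane NEON/SSE loop, then a scalar tail, all sharing one running index so each tier picks up exactly where the previous one stopped. Below is a minimal sketch of that structure, using raw x86 intrinsics in place of MindSpore's MS_* wrapper macros; relu_fp32_sketch is a hypothetical name, and the __AVX__/__SSE__ compiler guards stand in for the ENABLE_AVX/ENABLE_SSE build flags, so this illustrates the pattern rather than the library code itself.

// Tiered SIMD ReLU sketch: widest vectors first, scalar tail last.
#include <immintrin.h>

void relu_fp32_sketch(const float *data, float *dst, int ele_num) {
  int index = 0;
#ifdef __AVX__
  // Round ele_num down to a multiple of 8, which is what
  // DOWN_DIV(ele_num, C8NUM) * C8NUM computes, and clamp 8 floats per step.
  int c8_block = (ele_num / 8) * 8;
  for (; index < c8_block; index += 8) {
    __m256 v = _mm256_loadu_ps(data + index);
    _mm256_storeu_ps(dst + index, _mm256_max_ps(v, _mm256_setzero_ps()));
  }
#endif
#ifdef __SSE__
  // The 4-wide tier starts at the shared index, handling what the
  // 8-wide loop left behind, 4 floats per step.
  int c4_block = (ele_num / 4) * 4;
  for (; index < c4_block; index += 4) {
    __m128 v = _mm_loadu_ps(data + index);
    _mm_storeu_ps(dst + index, _mm_max_ps(v, _mm_setzero_ps()));
  }
#endif
  // Scalar tail for the final 0-3 elements; nothing is over-read,
  // unlike the old UP_DIV loop, which rounded the block count up.
  for (; index < ele_num; ++index) {
    dst[index] = data[index] < 0.0f ? 0.0f : data[index];
  }
}

Relu6 follows the identical structure with an extra min-against-6 in every tier.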
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
similarity index 71%
rename from mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
index 7b97a6d1dc..f7f9bcea67 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
@@ -17,7 +17,7 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "schema/model_generated.h"
-#include "src/runtime/kernel/arm/fp16/bias_fp16.h"
+#include "src/runtime/kernel/arm/fp16/biasadd_fp16.h"
 #include "src/kernel_registry.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_BiasAdd;
 
 namespace mindspore::kernel {
 
-int BiasCPUFp16Kernel::ReSize() {
+int BiasAddCPUFp16Kernel::ReSize() {
   auto dims = in_tensors_.at(0)->shape();
   bias_param_->ndim_ = dims.size();
   if (bias_param_->ndim_ < 1 || bias_param_->ndim_ > 5) {
@@ -45,13 +45,20 @@
   return RET_OK;
 }
 
-int BiasCPUFp16Kernel::Run() {
+int BiasAddCPUFp16Kernel::Run() {
+  if (bias_data_ == nullptr) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData failed in Run!";
+      return ret;
+    }
+  }
   auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
   auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
   size_t data_size = in_tensors_.at(0)->ElementsNum();
   MS_ASSERT(context_->allocator != nullptr);
-  auto *tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
-  auto *tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
   if (tile_in == nullptr || tile_bias == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
     context_->allocator->Free(tile_in);
@@ -64,43 +71,54 @@
   return RET_OK;
 }
 
-BiasCPUFp16Kernel::~BiasCPUFp16Kernel() {
+BiasAddCPUFp16Kernel::~BiasAddCPUFp16Kernel() {
   if ((bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) && bias_data_ != nullptr) {
     free(bias_data_);
     bias_data_ = nullptr;
   }
 }
 
-int BiasCPUFp16Kernel::Init() {
-  auto bias_tensor = in_tensors_.at(1);
-  MS_ASSERT(bias_tensor != nullptr);
-  bias_data_type_ = bias_tensor->data_type();
+int BiasAddCPUFp16Kernel::GetBiasData() {
+  bias_data_type_ = bias_tensor_->data_type();
   if (bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) {
-    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor->ElementsNum() * sizeof(float16_t)));
+    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor_->ElementsNum() * sizeof(float16_t)));
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
-    auto *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+    auto bias = reinterpret_cast<float *>(bias_tensor_->MutableData());
    if (bias == nullptr) {
      MS_LOG(ERROR) << "bias is nullptr!";
       return RET_NULL_PTR;
     }
-    for (int i = 0; i < bias_tensor->ElementsNum(); ++i) {
+    for (int i = 0; i < bias_tensor_->ElementsNum(); ++i) {
       bias_data_[i] = (float16_t)(bias[i]);
     }
   } else {
-    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
+    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor_->MutableData());
     if (bias_data_ == nullptr) {
       MS_LOG(ERROR) << "bias_data_ is nullptr";
       return RET_NULL_PTR;
     }
   }
+  return RET_OK;
+}
+
+int BiasAddCPUFp16Kernel::Init() {
+  bias_tensor_ = in_tensors_.at(1);
+  MS_ASSERT(bias_tensor_ != nullptr);
+  if (bias_tensor_->IsConst()) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData failed in Init!";
+      return ret;
+    }
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasCPUFp16Kernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasAddCPUFp16Kernel>)
 }  // namespace mindspore::kernel
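Beyond the BiasCPUFp16Kernel to BiasAddCPUFp16Kernel rename, the .cc change above moves bias preparation out of Init() into a new GetBiasData() helper: a constant bias tensor is converted to fp16 once at Init(), while a non-const bias (produced at runtime by an upstream node, so its data does not yet exist at Init()) is converted lazily on the first Run(). A toy sketch of that pattern follows; ToyTensor and ToyKernel are hypothetical stand-ins, not MindSpore types, and the fp16 conversion is elided to keep the sketch self-contained.

#include <cstdlib>

struct ToyTensor {
  bool is_const = false;
  int elements = 0;
  const float *data = nullptr;  // fp32 payload in this sketch
};

struct ToyKernel {
  ToyTensor *bias_tensor = nullptr;
  float *bias_copy = nullptr;  // stands in for the kernel's float16_t buffer

  // Mirrors GetBiasData(): allocate a kernel-owned copy and convert into it.
  int GetBiasData() {
    bias_copy = static_cast<float *>(std::malloc(bias_tensor->elements * sizeof(float)));
    if (bias_copy == nullptr) return -1;
    for (int i = 0; i < bias_tensor->elements; ++i) {
      bias_copy[i] = bias_tensor->data[i];  // the real kernel casts to float16_t here
    }
    return 0;
  }

  // Constant bias: its data already exists, so convert eagerly at Init().
  int Init() { return bias_tensor->is_const ? GetBiasData() : 0; }

  // Non-const bias: data exists only after upstream kernels have run,
  // so convert on first use.
  int Run() {
    if (bias_copy == nullptr && GetBiasData() != 0) return -1;
    // ... broadcast-add bias_copy onto the input tensor here ...
    return 0;
  }

  ~ToyKernel() { std::free(bias_copy); }
};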
<< "bias_data_ is nullptr"; return RET_NULL_PTR; } } + return RET_OK; +} + +int BiasAddCPUFp16Kernel::Init() { + bias_tensor_ = in_tensors_.at(1); + MS_ASSERT(bias_tensor_ != nullptr); + if (bias_tensor_->IsConst()) { + auto ret = GetBiasData(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "GetBiasData is error in Init()!"; + return ret; + } + } if (!InferShapeDone()) { return RET_OK; } return ReSize(); } -REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h similarity index 67% rename from mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h rename to mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h index 6d00b75082..20f8a91eb4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/bias_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h @@ -14,31 +14,33 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_ #include #include "src/lite_kernel.h" #include "nnacl/fp16/arithmetic_fp16.h" namespace mindspore::kernel { -class BiasCPUFp16Kernel : public LiteKernel { +class BiasAddCPUFp16Kernel : public LiteKernel { public: - BiasCPUFp16Kernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs, const lite::InnerContext *ctx) + BiasAddCPUFp16Kernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) : LiteKernel(parameter, inputs, outputs, ctx) { bias_param_ = reinterpret_cast(parameter); } - ~BiasCPUFp16Kernel() override; + ~BiasAddCPUFp16Kernel() override; int Init() override; int ReSize() override; int Run() override; private: + int GetBiasData(); ArithmeticParameter *bias_param_ = nullptr; float16_t *bias_data_ = nullptr; + lite::Tensor *bias_tensor_ = nullptr; TypeId bias_data_type_; }; } // namespace mindspore::kernel -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_