diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc index 4ee99d86ca..3263ebab04 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc @@ -28,7 +28,21 @@ using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Add; +using mindspore::schema::PrimitiveType_Div; +using mindspore::schema::PrimitiveType_Equal; +using mindspore::schema::PrimitiveType_FloorDiv; +using mindspore::schema::PrimitiveType_FloorMod; +using mindspore::schema::PrimitiveType_Greater; +using mindspore::schema::PrimitiveType_GreaterEqual; +using mindspore::schema::PrimitiveType_Less; +using mindspore::schema::PrimitiveType_LessEqual; +using mindspore::schema::PrimitiveType_LogicalAnd; +using mindspore::schema::PrimitiveType_LogicalOr; +using mindspore::schema::PrimitiveType_Maximum; +using mindspore::schema::PrimitiveType_Minimum; using mindspore::schema::PrimitiveType_Mul; +using mindspore::schema::PrimitiveType_NotEqual; +using mindspore::schema::PrimitiveType_SquaredDifference; using mindspore::schema::PrimitiveType_Sub; namespace mindspore::kernel { @@ -97,7 +111,44 @@ int ArithmeticFP16CPUKernel::Init() { arithmetic_run_ = ElementSubFp16; break; } - break; + case PrimitiveType_Div: + switch (arithmeticParameter_->activation_type_) { + case schema::ActivationType_RELU: + arithmetic_run_ = ElementDivReluFp16; + break; + case schema::ActivationType_RELU6: + arithmetic_run_ = ElementDivRelu6Fp16; + break; + default: + arithmetic_run_ = ElementDivFp16; + break; + } + case PrimitiveType_FloorMod: + arithmetic_run_ = ElementFloorModFp16; + case PrimitiveType_FloorDiv: + arithmetic_run_ = ElementFloorDivFp16; + case PrimitiveType_LogicalAnd: + arithmetic_run_ = ElementLogicalAndFp16; + case PrimitiveType_LogicalOr: + arithmetic_run_ = ElementLogicalOrFp16; + case PrimitiveType_SquaredDifference: + arithmetic_run_ = ElementSquaredDifferenceFp16; + case PrimitiveType_Maximum: + arithmetic_run_ = ElementMaximumFp16; + case PrimitiveType_Minimum: + arithmetic_run_ = ElementMinimumFp16; + case PrimitiveType_NotEqual: + arithmetic_run_ = ElementNotEqualFp16; + case PrimitiveType_Equal: + arithmetic_run_ = ElementEqualFp16; + case PrimitiveType_Less: + arithmetic_run_ = ElementLessFp16; + case PrimitiveType_LessEqual: + arithmetic_run_ = ElementLessEqual; + case PrimitiveType_Greater: + arithmetic_run_ = ElementGreaterFp16; + case PrimitiveType_GreaterEqual: + arithmetic_run_ = ElementGreaterEqualFp16; default: MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; arithmetic_run_ = nullptr; @@ -115,8 +166,8 @@ int ArithmeticFP16CPUKernel::ReSize() { arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum(); arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum(); if (in_tensors_[0]->data_type() == kNumberTypeFloat32 || in_tensors_[0]->data_type() == kNumberTypeFloat) { - input0_fp16_ = reinterpret_cast(context_->allocator->Malloc( - arithmeticParameter_->in_elements_num0_ * sizeof(float16_t))); + input0_fp16_ = reinterpret_cast( + context_->allocator->Malloc(arithmeticParameter_->in_elements_num0_ * sizeof(float16_t))); if (input0_fp16_ == nullptr) { MS_LOG(ERROR) << "malloc data fail!"; return RET_ERROR; @@ -125,8 +176,8 @@ int ArithmeticFP16CPUKernel::ReSize() { arithmeticParameter_->in_elements_num0_); } if (in_tensors_[1]->data_type() == kNumberTypeFloat32 || in_tensors_[1]->data_type() == kNumberTypeFloat) { - input1_fp16_ = reinterpret_cast(context_->allocator->Malloc( - arithmeticParameter_->in_elements_num1_ * sizeof(float16_t))); + input1_fp16_ = reinterpret_cast( + context_->allocator->Malloc(arithmeticParameter_->in_elements_num1_ * sizeof(float16_t))); if (input0_fp16_ == nullptr) { MS_LOG(ERROR) << "malloc data fail!"; return RET_ERROR; @@ -135,8 +186,8 @@ int ArithmeticFP16CPUKernel::ReSize() { arithmeticParameter_->in_elements_num1_); } if (out_tensors_[0]->data_type() == kNumberTypeFloat32 || out_tensors_[0]->data_type() == kNumberTypeFloat) { - output_fp16_ = reinterpret_cast(context_->allocator->Malloc( - arithmeticParameter_->out_elements_num_ * sizeof(float16_t))); + output_fp16_ = reinterpret_cast( + context_->allocator->Malloc(arithmeticParameter_->out_elements_num_ * sizeof(float16_t))); if (output_fp16_ == nullptr) { MS_LOG(ERROR) << "malloc data fail!"; return RET_ERROR; @@ -197,22 +248,22 @@ int ArithmeticFP16CPUKernel::DoArithmetic(int task_id) { int error_code = RET_OK; if (arithmeticParameter_->broadcasting_) { - error_code = arithmetic_run_(tile_data0_ + thread_stride, tile_data1_ + thread_stride, - output_data + thread_stride, count); + error_code = + arithmetic_run_(tile_data0_ + thread_stride, tile_data1_ + thread_stride, output_data + thread_stride, count); } else if (arithmetic_opt_run_ != nullptr) { if (arithmeticParameter_->in_elements_num0_ == 1) { - error_code = arithmetic_opt_run_(input0_data, input1_data1 + thread_stride, output_data + thread_stride, - count, arithmeticParameter_); + error_code = arithmetic_opt_run_(input0_data, input1_data1 + thread_stride, output_data + thread_stride, count, + arithmeticParameter_); } else if (arithmeticParameter_->in_elements_num1_ == 1) { - error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1, output_data + thread_stride, - count, arithmeticParameter_); + error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1, output_data + thread_stride, count, + arithmeticParameter_); } else { error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1 + thread_stride, output_data + thread_stride, count, arithmeticParameter_); } } else { - error_code = arithmetic_run_(input0_data + thread_stride, input1_data1 + thread_stride, - output_data + thread_stride, count); + error_code = + arithmetic_run_(input0_data + thread_stride, input1_data1 + thread_stride, output_data + thread_stride, count); } if (error_code != RET_OK) { return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.c index e058cf7293..964122057c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.c @@ -104,10 +104,16 @@ int ElementMulFp16(float16_t *input0, float16_t *input1, float16_t *output, int int block_c8 = element_size - block_mod; for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = input0[0] * input1[0]; - output[1] = input0[1] * input1[1]; - output[2] = input0[2] * input1[2]; - output[3] = input0[3] * input1[3]; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] * input1[i]; + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -123,15 +129,24 @@ int ElementMulReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { - float16_t res = input0[0] * input1[0]; - output[0] = res > 0 ? res : 0; - res = input0[1] * input1[1]; - output[1] = res > 0 ? res : 0; - res = input0[2] * input1[2]; - output[2] = res > 0 ? res : 0; - res = input0[3] * input1[3]; - output[3] = res > 0 ? res : 0; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + float16_t res; + for (int i = 0; i < C8NUM; ++i) { + res = input[i] * input1[i]; + output[i] = res > 0 ? res : 0; + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -148,11 +163,23 @@ int ElementMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = MSMIN(MSMAX(input0[0] * input1[0], 0), 6); - output[1] = MSMIN(MSMAX(input0[1] * input1[1], 0), 6); - output[2] = MSMIN(MSMAX(input0[2] * input1[2], 0), 6); - output[3] = MSMIN(MSMAX(input0[3] * input1[3], 0), 6); +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmulq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] * input1[i], 0), 6); + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -169,10 +196,16 @@ int ElementAddFp16(float16_t *input0, float16_t *input1, float16_t *output, int int block_c8 = element_size - block_mod; for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = input0[0] + input1[0]; - output[1] = input0[1] + input1[1]; - output[2] = input0[2] + input1[2]; - output[3] = input0[3] + input1[3]; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] + input1[i]; + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -187,15 +220,22 @@ int ElementAddReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { - float16_t res = input0[0] + input1[0]; - output[0] = res > 0 ? res : 0; - res = input0[1] + input1[1]; - output[1] = res > 0 ? res : 0; - res = input0[2] + input1[2]; - output[2] = res > 0 ? res : 0; - res = input0[3] + input1[3]; - output[3] = res > 0 ? res : 0; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(input0[i] + input1[i], 0); + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -211,11 +251,23 @@ int ElementAddRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = MSMIN(MSMAX(input0[0] + input1[0], 0), 6); - output[1] = MSMIN(MSMAX(input0[1] + input1[1], 0), 6); - output[2] = MSMIN(MSMAX(input0[2] + input1[2], 0), 6); - output[3] = MSMIN(MSMAX(input0[3] + input1[3], 0), 6); +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vaddq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] + input1[i], 0), 6); + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -232,10 +284,16 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int int block_c8 = element_size - block_mod; for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = input0[0] - input1[0]; - output[1] = input0[1] - input1[1]; - output[2] = input0[2] - input1[2]; - output[3] = input0[3] - input1[3]; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] - input1[i]; + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -249,16 +307,21 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; - +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif for (int index = 0; index < block_c8; index += C8NUM) { - float16_t res = input0[0] - input1[0]; - output[0] = res > 0 ? res : 0; - res = input0[1] - input1[1]; - output[1] = res > 0 ? res : 0; - res = input0[2] - input1[2]; - output[2] = res > 0 ? res : 0; - res = input0[3] - input1[3]; - output[3] = res > 0 ? res : 0; +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(input0[i] - input1[i], 0); + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; @@ -273,19 +336,439 @@ int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { int block_mod = element_size % C8NUM; int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6); + } + return NNACL_OK; +} + +int ElementDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; + + for (int index = 0; index < block_c8; index += C8NUM) { + for (int i = 0; i < C8NUM; ++i) { + if (input1[i] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + } +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = input0[i] / input1[i]; + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + if (input1[index] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + output[index] = input0[index] / input1[index]; + } + return NNACL_OK; +} + +int ElementDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif for (int index = 0; index < block_c8; index += C8NUM) { - output[0] = MSMIN(MSMAX(input0[0] - input1[0], 0), 6); - output[1] = MSMIN(MSMAX(input0[1] - input1[1], 0), 6); - output[2] = MSMIN(MSMAX(input0[2] - input1[2], 0), 6); - output[3] = MSMIN(MSMAX(input0[3] - input1[3], 0), 6); + for (int i = 0; i < C8NUM; ++i) { + if (input1[i] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + } +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vout = vmaxq_f16(vout, zeros); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(input0[i] - input1[i], 0); + } +#endif input0 += C8NUM; input1 += C8NUM; output += C8NUM; } for (int index = 0; index < block_mod; ++index) { + if (input1[index] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + float16_t res = input0[index] - input1[index]; + output[index] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; + float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { + for (int i = 0; i < C8NUM; ++i) { + if (input1[i] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + } +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vsubq_f16(vin0, vin1); + vout = vminq_f16(vmaxq_f16(vout, zeros), bounds); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + if (input1[index] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6); } + return NNACL_OK; +} + +int ElementFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + for (int i = 0; i < element_size; ++i) { + if (input1[i] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i]; + } + return NNACL_OK; +} + +int ElementFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + for (int i = 0; i < element_size; ++i) { + if (input1[i] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + output[i] = floorf(input0[i] / input1[i]); + } + return NNACL_OK; +} + +int ElementLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; + +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; + uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1)); + uint16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0)), mask); + uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1)), mask); + float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)((bool)(input0[i]) & (bool)(input1[i])); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[index])); + } + return NNACL_OK; +} +int ElementLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; + +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; + uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1)); + uint16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0)), mask); + uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1)), mask); + float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)((bool)(input0[i]) | (bool)(input1[i])); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)((bool)(input0[index]) | (bool)(input1[index])); + } + return NNACL_OK; +} + +int ElementSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + ElementSubFp16(input0, input1, output, element_size); + return ElementMulFp16(output, output, output, element_size); +} + +int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vmaxq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMAX(input0[i], input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMAX(input0[index], input1[index]); + } + return NNACL_OK; +} + +int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vminq_f16(vin0, vin1); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = MSMIN(input0[i], input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = MSMIN(input0[index], input1[index]); + } + return NNACL_OK; +} + +int ElementNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vfalse, vtrue); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] != input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] != input1[index]); + } + return NNACL_OK; +} + +int ElementEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vtrue, vfalse); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] == input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] == input1[index]); + } + return NNACL_OK; +} + +int ElementLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vcltq_f16(vin0, vin1), vtrue, vfalse); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] < input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] < input1[index]); + } + return NNACL_OK; +} + +int ElementLessEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vcleq_f16(vin0, vin1), vtrue, vfalse); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] <= input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] <= input1[index]); + } + return NNACL_OK; +} + +int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1), vtrue, vfalse); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] > input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] > input1[index]); + } + return NNACL_OK; +} + +int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { + int block_mod = element_size % C8NUM; + int block_c8 = element_size - block_mod; +#ifdef ENABLE_NEON + float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; + float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; +#endif + for (int index = 0; index < block_c8; index += C8NUM) { +#ifdef ENABLE_NEON + float16x8_t vin0 = vld1q_f16(input0); + float16x8_t vin1 = vld1q_f16(input1); + float16x8_t vout = vbslq_f16(vcgeq_f16(vin0, vin1), vtrue, vfalse); + vst1q_f16(output, vout); +#else + for (int i = 0; i < C8NUM; ++i) { + output[i] = (float16_t)(input0[i] >= input1[i]); + } +#endif + input0 += C8NUM; + input1 += C8NUM; + output += C8NUM; + } + for (int index = 0; index < block_mod; ++index) { + output[index] = (float16_t)(input0[index] >= input1[index]); + } return NNACL_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.h index 58fa13528c..0cbaa9e6c8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/arithmetic_fp16.h @@ -44,6 +44,28 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + +int ElementFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + +int ElementLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + +int ElementSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + +int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + +int ElementNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementLessEqual(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); +int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size); + void TileDimensionsFp16(float16_t *data0, float16_t *data1, float16_t *tile_data0, float16_t *tile_data1, ArithmeticParameter *param); #ifdef __cplusplus