|
|
|
@ -99,6 +99,8 @@ int TanhFp16(const float16_t *src, float16_t *dst, int ele_num) {
|
|
|
|
|
{28.0f, 28.0f, 28.0f, 28.0f},
|
|
|
|
|
{3150.0f, 3150.0f, 3150.0f, 3150.0f},
|
|
|
|
|
{62370.0f, 62370.0f, 62370.0f, 62370.0f}};
|
|
|
|
|
float32x4_t neg_one = {-1.0f, -1.0f, -1.0f, -1.0f};
|
|
|
|
|
float32x4_t pos_one = {1.0f, 1.0f, 1.0f, 1.0f};
|
|
|
|
|
int count = (ele_num / C4NUM) * C4NUM;
|
|
|
|
|
for (; i < count; i += C4NUM) {
|
|
|
|
|
float32x4_t input = vcvt_f32_f16(vld1_f16(src + i));
|
|
|
|
@ -109,7 +111,7 @@ int TanhFp16(const float16_t *src, float16_t *dst, int ele_num) {
|
|
|
|
|
float32x4_t b = vaddq_f32(
|
|
|
|
|
vmulq_f32(vaddq_f32(vmulq_f32(vaddq_f32(vmulq_f32(paramv[3], square), paramv[4]), square), paramv[5]), square),
|
|
|
|
|
paramv[2]);
|
|
|
|
|
vst1_f16(dst + i, vcvt_f16_f32(vdivq_f32(a, b)));
|
|
|
|
|
vst1_f16(dst + i, vcvt_f16_f32(vminq_f32(vmaxq_f32(vdivq_f32(a, b), neg_one), pos_one)));
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
for (; i < ele_num; ++i) {
|
|
|
|
@ -118,6 +120,8 @@ int TanhFp16(const float16_t *src, float16_t *dst, int ele_num) {
|
|
|
|
|
float a = (((square + 378.0f) * square + 17325.0f) * square + 135135.0f) * input;
|
|
|
|
|
float b = ((28.0f * square + 3150.0f) * square + 62370.0f) * square + 135135.0f;
|
|
|
|
|
dst[i] = a / b;
|
|
|
|
|
dst[i] = MSMAX(dst[i], -1);
|
|
|
|
|
dst[i] = MSMIN(dst[i], 1);
|
|
|
|
|
}
|
|
|
|
|
return NNACL_OK;
|
|
|
|
|
}
|
|
|
|
|