|
|
|
@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
|
|
|
|
|
/* Branch-free maximum of two signed 8-bit integers.
 * `pick_b` is all ones when a < b and all zeros otherwise, so exactly one of
 * the two operands survives the masking below. */
int8_t MaxInt8(int8_t a, int8_t b) {
  const int pick_b = -(a < b);
  return (int8_t)((a & ~pick_b) | (b & pick_b));
}
|
|
|
|
|
|
|
|
|
|
void ReluFp32(float *data, float *dst, int ele_num) {
|
|
|
|
|
int four_block = UP_DIV(ele_num, C4NUM);
|
|
|
|
|
for (int i = 0; i < four_block - 1; i++) {
|
|
|
|
|
int index = i * C4NUM;
|
|
|
|
|
#ifdef ENABLE_NEON
|
|
|
|
|
float32x4_t relu_data = vld1q_f32(data + index);
|
|
|
|
|
float32x4_t zero_data = vdupq_n_f32(0);
|
|
|
|
|
relu_data = vmaxq_f32(relu_data, zero_data);
|
|
|
|
|
vst1q_f32(dst + index, relu_data);
|
|
|
|
|
#else
|
|
|
|
|
data[index] = data[index] < 0 ? 0 : data[index];
|
|
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
|
|
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
|
|
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
|
|
|
|
|
data[j] = data[j] < 0 ? 0 : data[j];
|
|
|
|
|
int index = 0;
|
|
|
|
|
#ifdef ENABLE_AVX
|
|
|
|
|
int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
|
|
|
|
|
for (; index < c8_block; index += C8NUM) {
|
|
|
|
|
MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index);
|
|
|
|
|
MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
|
|
|
|
|
relu_data = MS_MAX256_F32(relu_data, zero_data);
|
|
|
|
|
MS_ST256_F32(dst + index, relu_data);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void Relu6Fp32(float *data, float *dst, int ele_num) {
|
|
|
|
|
int four_block = UP_DIV(ele_num, C4NUM);
|
|
|
|
|
for (int i = 0; i < four_block - 1; i++) {
|
|
|
|
|
int index = i * C4NUM;
|
|
|
|
|
#ifdef ENABLE_NEON
|
|
|
|
|
float32x4_t relu6_data = vld1q_f32(data + index);
|
|
|
|
|
float32x4_t zero_data = vdupq_n_f32(0);
|
|
|
|
|
float32x4_t six_data = vdupq_n_f32(6);
|
|
|
|
|
relu6_data = vmaxq_f32(relu6_data, zero_data);
|
|
|
|
|
relu6_data = vminq_f32(relu6_data, six_data);
|
|
|
|
|
vst1q_f32(dst + index, relu6_data);
|
|
|
|
|
#else
|
|
|
|
|
data[index] = data[index] < 0 ? 0 : data[index];
|
|
|
|
|
data[index] = data[index] > 6 ? 6 : data[index];
|
|
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
|
|
|
|
|
data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
|
|
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
|
|
|
|
|
data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
|
|
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
|
|
|
|
|
data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
|
|
|
|
|
#endif
|
|
|
|
|
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
|
|
|
|
|
int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
|
|
|
|
|
for (; index < c4_block; index += C4NUM) {
|
|
|
|
|
MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index);
|
|
|
|
|
MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
|
|
|
|
|
relu_data = MS_MAXQ_F32(relu_data, zero_data);
|
|
|
|
|
MS_STQ_F32(dst + index, relu_data);
|
|
|
|
|
}
|
|
|
|
|
for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
|
|
|
|
|
data[j] = data[j] < 0 ? 0 : data[j];
|
|
|
|
|
data[j] = data[j] > 6 ? 6 : data[j];
|
|
|
|
|
#endif
|
|
|
|
|
for (; index < ele_num; ++index) {
|
|
|
|
|
data[index] = data[index] < 0.0f ? 0.0f : data[index];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void Relu6Fp32(float *data, float *dst, int ele_num) {
|
|
|
|
|
int index = 0;
|
|
|
|
|
#ifdef ENABLE_AVX
|
|
|
|
|
#ifdef WIN32
|
|
|
|
|
void ReluFp32C8(float *data, float *dst, int ele_num) {
|
|
|
|
|
int four_block = UP_DIV(ele_num, C8NUM);
|
|
|
|
|
for (int i = 0; i < four_block - 1; i++) {
|
|
|
|
|
int index = i * C8NUM;
|
|
|
|
|
data[index] = data[index] < 0 ? 0 : data[index];
|
|
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
|
|
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
|
|
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
|
|
|
|
|
data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
|
|
|
|
|
data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
|
|
|
|
|
data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
|
|
|
|
|
data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
|
|
|
|
|
}
|
|
|
|
|
for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
|
|
|
|
|
data[j] = data[j] < 0 ? 0 : data[j];
|
|
|
|
|
int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
|
|
|
|
|
for (; index < c8_block; index += C8NUM) {
|
|
|
|
|
MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index);
|
|
|
|
|
MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
|
|
|
|
|
MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f);
|
|
|
|
|
relu6_data = MS_MAX256_F32(relu6_data, zero_data);
|
|
|
|
|
relu6_data = MS_MIN256_F32(relu6_data, six_data);
|
|
|
|
|
MS_ST256_F32(dst + index, relu6_data);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
void Relu6Fp32C8(float *data, float *dst, int ele_num) {
|
|
|
|
|
int four_block = UP_DIV(ele_num, C8NUM);
|
|
|
|
|
for (int i = 0; i < four_block - 1; i++) {
|
|
|
|
|
int index = i * C8NUM;
|
|
|
|
|
data[index] = data[index] < 0 ? 0 : data[index];
|
|
|
|
|
data[index] = data[index] > 6 ? 6 : data[index];
|
|
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
|
|
|
|
|
data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
|
|
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
|
|
|
|
|
data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
|
|
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
|
|
|
|
|
data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
|
|
|
|
|
data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
|
|
|
|
|
data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4];
|
|
|
|
|
data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
|
|
|
|
|
data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5];
|
|
|
|
|
data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
|
|
|
|
|
data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6];
|
|
|
|
|
data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
|
|
|
|
|
data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7];
|
|
|
|
|
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
|
|
|
|
|
int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
|
|
|
|
|
for (; index < c4_block; index += C4NUM) {
|
|
|
|
|
MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index);
|
|
|
|
|
MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
|
|
|
|
|
MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f);
|
|
|
|
|
relu6_data = MS_MAXQ_F32(relu6_data, zero_data);
|
|
|
|
|
relu6_data = MS_MINQ_F32(relu6_data, six_data);
|
|
|
|
|
MS_STQ_F32(dst + index, relu6_data);
|
|
|
|
|
}
|
|
|
|
|
for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
|
|
|
|
|
data[j] = data[j] < 0 ? 0 : data[j];
|
|
|
|
|
data[j] = data[j] > 6 ? 6 : data[j];
|
|
|
|
|
#endif
|
|
|
|
|
for (; index < ele_num; ++index) {
|
|
|
|
|
data[index] = data[index] < 0.0f ? 0.0f : data[index];
|
|
|
|
|
data[index] = data[index] > 6.0f ? 6.0f : data[index];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
#endif
|
|
|
|
|