add avx int8 add

pull/10317/head
fuzhiye 4 years ago
parent 52953f16fc
commit ca84fa1a00

@@ -9,6 +9,10 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
endif()
endif ()
if ("${X86_64_SIMD}" STREQUAL "avx")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -mavx -mavx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -mavx -mavx2")
endif ()
########################### files ###########################
file(GLOB KERNEL_SRC
@@ -39,6 +43,7 @@ endif()
if ("${X86_64_SIMD}" STREQUAL "avx")
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/x86_64_sse/*.c
${NNACL_DIR}/x86_64_avx/*.c
${NNACL_DIR}/assembly/avx/*.S)
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
endif()

File diff suppressed because it is too large

@@ -17,6 +17,9 @@
#ifndef MINDSPORE_LITE_NNACL_ADD_INT8_H_
#define MINDSPORE_LITE_NNACL_ADD_INT8_H_
#ifdef ENABLE_AVX
#include <x86intrin.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/errorcode.h"
#include "nnacl/arithmetic.h"
@@ -48,13 +51,21 @@ extern "C" {
#endif
void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params);
void AddOptInt8(const int8_t *ptr_in, const int8_t element_in, int8_t *output, int size, AddQuantParameter *params,
AddQuantQrgs *ptr_args, AddQuantQrgs *ele_args);
int ElementAddInt8(const int8_t *in0, const int8_t *in1, int8_t *out, int size);
int BroadcastAddInt8(const int8_t *in0, const int8_t *in1, int8_t *tile_in0, int8_t *tile_in1, int8_t *out, int size,
ArithmeticParameter *param);
#ifdef ENABLE_AVX
void AddInt8_AVX2(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params);
void AddOptInt8_AVX2(const int8_t *ptr_in, const int8_t element_in, int8_t *output, int size, AddQuantParameter *params,
AddQuantQrgs *ptr_args, AddQuantQrgs *ele_args);
#endif
#ifdef __cplusplus
}
#endif
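The new _AVX2 entry points mirror the existing AddInt8/AddOptInt8 kernels declared above, which implement a standard int8 quantized add: subtract each input's zero point, rescale with a fixed-point multiplier and shifts, sum, rescale the sum, add the output zero point, and clamp. A simplified scalar sketch of that arithmetic follows; the struct LaneArgs and the helpers Rescale/QuantAddLane are illustrative only and do not claim to match the real AddQuantParameter/AddQuantQrgs layout.

#include <stdint.h>

/* Illustrative per-lane quantized add; field names and layout are hypothetical. */
typedef struct {
  int32_t zp_;          /* zero point of this operand */
  int32_t left_shift_;  /* pre-shift into a higher-precision domain */
  int32_t multiplier_;  /* Q31 fixed-point rescale multiplier */
  int32_t right_shift_; /* post-shift back down */
} LaneArgs;

static int32_t Rescale(int32_t v, const LaneArgs *q) {
  int64_t x = (int64_t)v * ((int64_t)1 << q->left_shift_);  /* widen and pre-shift */
  x = (x * q->multiplier_ + (1ll << 30)) >> 31;             /* Q31 multiply with rounding */
  int64_t round = q->right_shift_ > 0 ? (int64_t)1 << (q->right_shift_ - 1) : 0;
  return (int32_t)((x + round) >> q->right_shift_);         /* rounding right shift */
}

static int8_t QuantAddLane(int8_t in0, int8_t in1, const LaneArgs *q0, const LaneArgs *q1,
                           const LaneArgs *qo, int32_t out_zp, int32_t out_min, int32_t out_max) {
  int32_t sum = Rescale(in0 - q0->zp_, q0) + Rescale(in1 - q1->zp_, q1);
  int32_t out = Rescale(sum, qo) + out_zp;
  if (out < out_min) out = out_min;  /* clamp to the quantized activation range */
  if (out > out_max) out = out_max;
  return (int8_t)out;
}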

@@ -0,0 +1,70 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/x86_64_avx/common_utils.h"
#ifdef WIN32
#ifdef ENABLE_AVX
#include <stdint.h>
#endif
#endif
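// Saturating signed 32-bit add: the plain add can wrap, so overflow lanes are detected
// from the operand sign bits and blended with INT32_MIN / INT32_MAX below.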
__m128i _mm_adds_epi32(__m128i a, __m128i b) {
__m128i int_min = _mm_set1_epi32(0x80000000);
__m128i int_max = _mm_set1_epi32(0x7FFFFFFF);
const __m128i res = _mm_add_epi32(a, b);
const __m128i sign_and = _mm_and_si128(a, b);
const __m128i sign_or = _mm_or_si128(a, b);
const __m128i min_sat_mask = _mm_andnot_si128(res, sign_and);
const __m128i max_sat_mask = _mm_andnot_si128(sign_or, res);
const __m128 res_temp =
_mm_blendv_ps(_mm_castsi128_ps(res), _mm_castsi128_ps(int_min), _mm_castsi128_ps(min_sat_mask));
return _mm_castps_si128(_mm_blendv_ps(res_temp, _mm_castsi128_ps(int_max), _mm_castsi128_ps(max_sat_mask)));
}
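// Signed right shift emulated through the magnitude: take |a|, logical-shift it right,
// then XOR the shifted value with the sign mask for lanes that were negative.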
__m128i _mm_rshr_epi32(__m128i a, int shift) {
const __m128i vmask = _mm_cmpgt_epi32(_mm_setzero_si128(), a);
const __m128i vabs_a = _mm_sub_epi32(_mm_xor_si128(a, vmask), vmask);
const __m128i tmp_res = _mm_srli_epi32(vabs_a, shift);
return _mm_xor_si128(tmp_res, vmask);
}
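// Fixed-point multiply-high with rounding: widen each lane to 64 bits, multiply,
// add the 2^30 rounding bias, shift the magnitude right by 31 with the sign restored,
// and clamp each lane back to the int32 range.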
__m128i _mm_qrdmulh_epi32(__m128i a, __m128i b) {
const __m128i tmp_a_lo = _mm_unpacklo_epi32(a, _mm_setzero_si128());
const __m128i tmp_a_hi = _mm_unpackhi_epi32(a, _mm_setzero_si128());
const __m256i tmp_a_256 = _mm256_set_m128i(tmp_a_hi, tmp_a_lo);
const __m128i tmp_b_lo = _mm_unpacklo_epi32(b, _mm_setzero_si128());
const __m128i tmp_b_hi = _mm_unpackhi_epi32(b, _mm_setzero_si128());
const __m256i tmp_b_256 = _mm256_set_m128i(tmp_b_hi, tmp_b_lo);
__m256i tmp_out = _mm256_mul_epi32(tmp_a_256, tmp_b_256);
tmp_out = _mm256_add_epi64(tmp_out, _mm256_set1_epi64x(1ll << 30));
const __m256i vmask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), tmp_out);
const __m256i vabs_tmp_out = _mm256_sub_epi64(_mm256_xor_si256(tmp_out, vmask), vmask);
tmp_out = _mm256_srli_epi64(vabs_tmp_out, 31);
const __m256i vtmp_out = _mm256_sub_epi64(_mm256_xor_si256(tmp_out, vmask), vmask);
const int32_t max_32bit = (1ll << 31) - 1;
const int32_t min_32bit = -(1ll << 31);
int64_t *tmp_out_ptr = (int64_t *)(&vtmp_out);
int32_t r1 = tmp_out_ptr[0] > max_32bit ? max_32bit : tmp_out_ptr[0];
r1 = r1 < min_32bit ? min_32bit : r1;
int32_t r2 = tmp_out_ptr[1] > max_32bit ? max_32bit : tmp_out_ptr[1];
r2 = r2 < min_32bit ? min_32bit : r2;
int32_t r3 = tmp_out_ptr[2] > max_32bit ? max_32bit : tmp_out_ptr[2];
r3 = r3 < min_32bit ? min_32bit : r3;
int32_t r4 = tmp_out_ptr[3] > max_32bit ? max_32bit : tmp_out_ptr[3];
r4 = r4 < min_32bit ? min_32bit : r4;
return _mm_set_epi32(r4, r3, r2, r1);
}
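A hedged scalar reference for what _mm_qrdmulh_epi32 computes per 32-bit lane may make the vector code above easier to follow; the helper name QrdmulhLane is illustrative and not part of the commit.

#include <stdint.h>

/* Illustrative scalar model of one lane of _mm_qrdmulh_epi32 above. */
static int32_t QrdmulhLane(int32_t a, int32_t b) {
  int64_t prod = (int64_t)a * (int64_t)b + (1ll << 30); /* widen multiply, rounding bias */
  int64_t mag = prod < 0 ? -prod : prod;                /* work on the magnitude */
  int64_t shifted = mag >> 31;                          /* drop the low 31 bits */
  if (prod < 0) shifted = -shifted;                     /* restore the sign */
  if (shifted > INT32_MAX) shifted = INT32_MAX;         /* saturate, matching r1..r4 above */
  if (shifted < INT32_MIN) shifted = INT32_MIN;
  return (int32_t)shifted;
}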

@@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_X86_64_AVX_COMMON_UTILS_H_
#define MINDSPORE_LITE_NNACL_X86_64_AVX_COMMON_UTILS_H_
#include <x86intrin.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __GNUC__
#if __GNUC__ < 8
#define _mm256_set_m128i(xmm1, xmm2) \
_mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
#define _mm256_set_m128f(xmm1, xmm2) \
_mm256_permute2f128_ps(_mm256_castps128_ps256(xmm1), _mm256_castps128_ps256(xmm2), 2)
#endif
#endif
// Signed saturating Add
__m128i _mm_adds_epi32(__m128i a, __m128i b);
// Signed rounding shift right
__m128i _mm_rshr_epi32(__m128i a, int shift);
// Signed saturating Rounding Doubling Multiply return High half
__m128i _mm_qrdmulh_epi32(__m128i a, __m128i b);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_NNACL_X86_64_AVX_COMMON_UTILS_H_
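Since the add_int8.c diff itself is suppressed above as too large, here is a hedged sketch of how these three helpers typically combine in a fixed-point requantization step; RequantizeFourLanes, multiplier, and right_shift are assumed names, not identifiers from this commit.

#include "nnacl/x86_64_avx/common_utils.h"

/* Hypothetical usage sketch: rescale four int32 accumulators by a Q31 fixed-point
 * multiplier and then apply a signed right shift, the usual requantization pattern
 * in int8 kernels. Parameter names are assumptions, not taken from add_int8.c. */
static inline __m128i RequantizeFourLanes(__m128i acc, int32_t multiplier, int right_shift) {
  __m128i scaled = _mm_qrdmulh_epi32(acc, _mm_set1_epi32(multiplier));
  return _mm_rshr_epi32(scaled, right_shift);
}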

@@ -153,7 +153,11 @@ void QuantizedAddCPUKernel::BroadcastRun(int task_id) {
cur_in1 = input1_data_ + task_id * stride * in_size_ + i * in_size_;
cur_out = output_data_ + task_id * stride * in_size_ + i * in_size_;
}
#ifdef ENABLE_AVX
AddInt8_AVX2(cur_in0, cur_in1, cur_out, in_size_, &para_);
#else
AddInt8(cur_in0, cur_in1, cur_out, in_size_, &para_);
#endif
}
return;
}
@@ -180,9 +184,17 @@ int QuantizedAddCPUKernel::DoExecute(int task_id) {
int8_t element_in = arith_para_->in_elements_num0_ == 1 ? input0_data_[0] : input1_data_[0];
AddQuantQrgs *ptr_args = arith_para_->in_elements_num0_ == 1 ? &para_.in1_args_ : &para_.in0_args_;
AddQuantQrgs *ele_args = arith_para_->in_elements_num0_ == 1 ? &para_.in0_args_ : &para_.in1_args_;
#ifdef ENABLE_AVX
AddOptInt8_AVX2(ptr_in, element_in, cur_out, rest_count, &para_, ptr_args, ele_args);
#else
AddOptInt8(ptr_in, element_in, cur_out, rest_count, &para_, ptr_args, ele_args);
#endif
} else {
#ifdef ENABLE_AVX
AddInt8_AVX2(cur_in0, cur_in1, cur_out, rest_count, &para_);
#else
AddInt8(cur_in0, cur_in1, cur_out, rest_count, &para_);
#endif
}
return RET_OK;

@@ -75,7 +75,10 @@ if ("${X86_64_SIMD}" STREQUAL "sse")
endif()
if ("${X86_64_SIMD}" STREQUAL "avx")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -mavx -mavx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -mavx -mavx2")
file(GLOB TEST_ASSEMBLY_SRC ${LITE_DIR}/nnacl/x86_64_sse/*.c
${LITE_DIR}/nnacl/x86_64_avx/*.c
${LITE_DIR}/nnacl/assembly/avx/*.S)
set_property(SOURCE ${TEST_ASSEMBLY_SRC} PROPERTY LANGUAGE C)
set(KERNEL_OP_SRC
