Set the simd-related kernels used under arm toolchains.

8 years ago · 2a601e025e
parent 909cc6f0c5
commit 2a601e025e
7 changed files with 490 additions and 5 deletions
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@ -17,7 +17,11 @@ limitations under the License. */

 #include <stdio.h>
 #include "hl_base.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "hl_neon_matrix_kernel.cuh"
+#else
 #include "hl_sse_matrix_kernel.cuh"
+#endif

 /**
 * @brief   cpu element wise unary operator.
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+// Same NEON detection as hl_cpu_matrix_kernel.cuh, so both headers pick the
+// NEON implementation under the same toolchains.
+#include "hl_matrix_base_neon.cuh"
 #else
 #include "hl_matrix_base_sse.cuh"
 #endif
--- a/paddle/cuda/include/hl_matrix_base_neon.cuh
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_MATRIX_BASE_NEON_CUH_
+#define HL_MATRIX_BASE_NEON_CUH_
+
+namespace aggregate {
+/**
+ * @brief   NEON aggregate operator: lane-wise sum of two float32x4_t vectors.
+ */
+class SSESum {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t sum = vaddq_f32(a, b);
+    return sum;
+  }
+};
+
+/**
+ * @brief   NEON aggregate operator: lane-wise maximum of two float32x4_t
+ *          vectors.
+ */
+class SSEMax {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t larger = vmaxq_f32(a, b);
+    return larger;
+  }
+};
+
+/**
+ * @brief   NEON aggregate operator: lane-wise minimum of two float32x4_t
+ *          vectors.
+ */
+class SSEMin {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t smaller = vminq_f32(a, b);
+    return smaller;
+  }
+};
+}  // namespace aggregate
+
+namespace base {
+namespace unary {
+/**
+ * @brief   NEON unary operator that returns its input vector unchanged.
+ */
+class SSEIdentity {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a) const {
+    const float32x4_t passthrough = a;
+    return passthrough;
+  }
+};
+}  // namespace unary
+
+namespace binary {
+/**
+ * @brief   NEON binary operator: lane-wise a + b on float32x4_t vectors.
+ */
+class SSEAdd {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t sum = vaddq_f32(a, b);
+    return sum;
+  }
+};
+
+/**
+ * @brief   NEON binary operator: lane-wise p1 * a + p2 * b on float32x4_t
+ *          vectors.
+ *
+ * The two scalar weights are broadcast into vectors once, at construction.
+ */
+class SSEAdd2 {
+public:
+  static const bool sse = true;
+  const real p1;    // scalar weight applied to the first operand
+  const real p2;    // scalar weight applied to the second operand
+  float32x4_t mp1;  // p1 replicated into all four lanes
+  float32x4_t mp2;  // p2 replicated into all four lanes
+
+  SSEAdd2(const real s1, const real s2)
+      : p1(s1), p2(s2), mp1(vdupq_n_f32(s1)), mp2(vdupq_n_f32(s2)) {}
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t scaledA = vmulq_f32(mp1, a);
+    const float32x4_t scaledB = vmulq_f32(mp2, b);
+    return vaddq_f32(scaledA, scaledB);
+  }
+};
+
+/**
+ * @brief   NEON binary operator: lane-wise a - b on float32x4_t vectors.
+ */
+class SSESub {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t difference = vsubq_f32(a, b);
+    return difference;
+  }
+};
+
+/**
+ * @brief   NEON binary operator: lane-wise a * b on float32x4_t vectors.
+ */
+class SSEMul {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t product = vmulq_f32(a, b);
+    return product;
+  }
+};
+
+class SSEDiv {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    float32x4_t tmp;
+    tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+
+/**
+ * @brief   NEON binary operator: lane-wise (a - b)^2 on float32x4_t vectors.
+ */
+class SSESquaredDiff {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    const float32x4_t diff = vsubq_f32(a, b);
+    return vmulq_f32(diff, diff);
+  }
+};
+
+/**
+ * @brief   NEON binary operator that selects the first operand.
+ */
+class SSEFirst {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    (void)b;  // the second operand is intentionally ignored
+    return a;
+  }
+};
+
+/**
+ * @brief   NEON binary operator that selects the second operand.
+ */
+class SSESecond {
+public:
+  // NOTE(review): presumably read by the kernel templates to select the
+  // vectorized path; the "sse" name is kept even though this is NEON code.
+  static const bool sse = true;
+
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    (void)a;  // the first operand is intentionally ignored
+    return b;
+  }
+};
+
+/**
+ * @brief   NEON binary operator: per lane, yields 1.0f when exactly one of
+ *          a and b is greater than the threshold p, and 0.0f otherwise.
+ */
+class SSEClassificationError {
+public:
+  static const bool sse = true;
+  const real p;       // scalar threshold
+  float32x4_t mp;     // threshold replicated into all four lanes
+  uint32x4_t result;  // integer 1 in every lane; masks the XOR down to 0/1
+
+  explicit SSEClassificationError(const real s)
+      : p(s), mp(vdupq_n_f32(s)), result(vdupq_n_u32(1)) {}
+
+  // TODO: verify this against the SSE implementation.
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    // vcgtq_f32 sets a lane to all ones where the comparison holds.
+    const uint32x4_t aAbove = vcgtq_f32(a, mp);
+    const uint32x4_t bAbove = vcgtq_f32(b, mp);
+    // Lanes where the two comparisons disagree.
+    const uint32x4_t disagree = veorq_u32(aAbove, bAbove);
+    // Reduce each mask lane to integer 0 or 1, then convert to float.
+    const uint32x4_t asBit = vandq_u32(disagree, result);
+    return vcvtq_f32_u32(asBit);
+  }
+};
+}  // namespace binary
+}  // namespace base
+
+#endif /* HL_MATRIX_BASE_NEON_CUH_ */
--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
--- a/paddle/math/SIMDFunctions.cpp
+++ b/paddle/math/SIMDFunctions.cpp
@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "SIMDFunctions.h"
+#ifdef __SSE__
 #include <immintrin.h>
+#endif
 #include <algorithm>

-#ifndef __AVX__
+// AVX builds also define __SSE__, so the SSE kernels must be excluded
+// explicitly; otherwise the AVX branch below is never compiled.
+#if defined(__SSE__) && !defined(__AVX__)
 static void addto_sse(float* a, const float* b, size_t len) {
  int offset = len % 16;
  __m128 ma0, ma1, ma2, ma3;
@ -125,7 +127,8 @@ static void col_max_sse(float* result,
  }
 }

-#else
+#elif defined(__AVX__)
+
 static void addto_avx(float* a, const float* b, size_t len) {
  int offset = len % 32;

@ -357,15 +360,16 @@ static void decayL1_avx(

 #endif

-#ifndef __AVX__
+// Must mirror the kernel selection above: _sse only without AVX, else _avx.
+#if defined(__SSE__) && !defined(__AVX__)
 #define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
-#else
+#elif defined(__AVX__)
 #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
 #endif

 namespace paddle {
 namespace simd {
 namespace internal {
+#ifdef __SSE__
 void addToImpl(float* a, const float* b, size_t len) {
  SIMD_INVOKE(addto, a, b, len);
 }
@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
 void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
  SIMD_INVOKE(col_max, result, data, dim, numSamples);
 }
+#endif

 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@ -128,17 +128,29 @@ void decayL1AvxImpl(

 template <>
 inline void addTo(float* a, const float* b, size_t len) {
+  // float specialization: forward to the SIMD implementation when the
+  // compiler targets SSE, otherwise to the naive:: version.
+#ifdef __SSE__
  internal::addToImpl(a, b, len);
+#else
+  naive::addTo(a, b, len);
+#endif
 }

 template <>
 inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
+  // float specialization: forward to the SIMD implementation when the
+  // compiler targets SSE, otherwise to the naive:: version.
+#ifdef __SSE__
  internal::batchAddToImpl(a, b, batch, len);
+#else
+  naive::batchAddTo(a, b, batch, len);
+#endif
 }

 template <>
 inline void colMax(float* result, const float* data, int dim, int numSamples) {
+  // float specialization: forward to the SIMD implementation when the
+  // compiler targets SSE, otherwise to the naive:: version.
+#ifdef __SSE__
  internal::colMaxImpl(result, data, dim, numSamples);
+#else
+  naive::colMax(result, data, dim, numSamples);
+#endif
 }

 template <>
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@ -18,7 +18,8 @@ limitations under the License. */
 using namespace paddle;  // NOLINT

 TEST(SIMDFlags, gccTest) {
-#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
+#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
+    !defined(__arm__)
  // clang-format off
  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
+  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
 }