Set the simd-related kernels used under arm toolchains.

release/0.10.0
Liu Yiqun 8 years ago
parent 909cc6f0c5
commit 2a601e025e

@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#endif
/**
* @brief cpu element wise unary operator.

@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef BaseOp SSEFirst;
typedef BaseOp SSESecond;
typedef BaseOp SSEClassificationError;
// The NEON feature macro is spelled __ARM_NEON__ (older GCC/Clang) or
// __ARM_NEON (ACLE); the same pair is tested where the NEON kernel header is
// selected, so both checks must agree.
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_sse.cuh"
#endif

@ -0,0 +1,161 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace aggregate {
/**
 * @brief NEON aggregate operator: lane-wise sum of two 4 x float vectors.
 *
 * The SSE-prefixed name and the `sse` flag mirror the x86 variant of this
 * header; presumably generic kernels read `sse` to pick the vector path
 * (confirm against the callers).
 */
class SSESum {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t sum = vaddq_f32(a, b);
    return sum;
  }
};
/**
 * @brief NEON aggregate operator: lane-wise maximum of two 4 x float vectors.
 */
class SSEMax {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t larger = vmaxq_f32(a, b);
    return larger;
  }
};
/**
 * @brief NEON aggregate operator: lane-wise minimum of two 4 x float vectors.
 */
class SSEMin {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t smaller = vminq_f32(a, b);
    return smaller;
  }
};
} // namespace aggregate
namespace base {
namespace unary {
/**
 * @brief NEON unary operator that returns its input vector unchanged.
 */
class SSEIdentity {
public:
  static const bool sse = true;

  // Pass-through: no intrinsic is applied to the input.
  INLINE float32x4_t vecOp(const float32x4_t a) const { return a; }
};
} // namespace unary
namespace binary {
/**
 * @brief NEON binary operator: lane-wise a + b on 4 x float vectors.
 */
class SSEAdd {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t sum = vaddq_f32(a, b);
    return sum;
  }
};
class SSEAdd2 {
public:
static const bool sse = true;
const real p1;
const real p2;
float32x4_t mp1;
float32x4_t mp2;
public:
SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
mp1 = vdupq_n_f32(p1);
mp2 = vdupq_n_f32(p2);
}
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp1, tmp2;
tmp1 = vmulq_f32(mp1, a);
tmp2 = vmulq_f32(mp2, b);
return vaddq_f32(tmp1, tmp2);
}
};
/**
 * @brief NEON binary operator: lane-wise a - b on 4 x float vectors.
 */
class SSESub {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t difference = vsubq_f32(a, b);
    return difference;
  }
};
/**
 * @brief NEON binary operator: lane-wise a * b on 4 x float vectors.
 */
class SSEMul {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t product = vmulq_f32(a, b);
    return product;
  }
};
/**
 * @brief NEON binary operator: lane-wise a / b on 4 x float vectors.
 *
 * vrecpeq_f32 alone only yields a coarse reciprocal estimate, which makes the
 * quotient noticeably less accurate than a true division. On AArch64 the
 * hardware divide (vdivq_f32) is used; on 32-bit ARM, where NEON has no
 * vector divide, the estimate is refined with two Newton-Raphson steps
 * (vrecpsq_f32 computes 2 - b * x) before multiplying.
 */
class SSEDiv {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
#if defined(__aarch64__)
    return vdivq_f32(a, b);
#else
    float32x4_t recip = vrecpeq_f32(b);
    // Each step roughly doubles the number of correct bits: x <- x * (2 - b*x).
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    return vmulq_f32(a, recip);
#endif
  }
};
/**
 * @brief NEON binary operator: lane-wise (a - b) * (a - b).
 */
class SSESquaredDiff {
public:
  static const bool sse = true;

  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    const float32x4_t diff = vsubq_f32(a, b);
    return vmulq_f32(diff, diff);
  }
};
/**
 * @brief NEON binary operator that selects the first operand and ignores the
 *        second.
 */
class SSEFirst {
public:
  static const bool sse = true;

  // `b` is accepted only to match the binary-operator signature.
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { return a; }
};
/**
 * @brief NEON binary operator that selects the second operand and ignores the
 *        first.
 */
class SSESecond {
public:
  static const bool sse = true;

  // `a` is accepted only to match the binary-operator signature.
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { return b; }
};
class SSEClassificationError {
public:
static const bool sse = true;
const real p;
float32x4_t mp;
uint32x4_t result;
public:
explicit SSEClassificationError(const real s) : p(s) {
mp = vdupq_n_f32(p);
result = vdupq_n_u32(1);
}
// TODO: to be check
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
uint32x4_t tmp1 = vcgtq_f32(a, mp);
uint32x4_t tmp2 = vcgtq_f32(b, mp);
uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
return vcvtq_f32_u32(vandq_u32(tmp3, result));
}
};
} // namespace binary
} // namespace base
#endif /* HL_MATRIX_BASE_NEON_CUH_ */

File diff suppressed because it is too large Load Diff

@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <algorithm>
#ifndef __AVX__
#ifdef __SSE__
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
@ -125,7 +127,8 @@ static void col_max_sse(float* result,
}
}
#else
#elif defined(__AVX__)
static void addto_avx(float* a, const float* b, size_t len) {
int offset = len % 32;
@ -357,15 +360,16 @@ static void decayL1_avx(
#endif
#ifndef __AVX__
#ifdef __SSE__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#else
#elif __AVX__
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#endif
namespace paddle {
namespace simd {
namespace internal {
#ifdef __SSE__
// Forwards to the SIMD flavour of `addto` chosen by SIMD_INVOKE, which pastes
// a suffix (_sse or _avx) onto the function name at preprocessing time.
void addToImpl(float* a, const float* b, size_t len) {
SIMD_INVOKE(addto, a, b, len);
}
@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
// Forwards to the SIMD flavour of `col_max` chosen by SIMD_INVOKE, which
// pastes a suffix (_sse or _avx) onto the function name at preprocessing time.
void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
SIMD_INVOKE(col_max, result, data, dim, numSamples);
}
#endif
#ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {

@ -128,17 +128,29 @@ void decayL1AvxImpl(
/**
 * float specialization of addTo.
 *
 * Dispatches at compile time: the SIMD implementation is used when the
 * compiler defines __SSE__, otherwise the call goes to naive::addTo.
 */
template <>
inline void addTo(float* a, const float* b, size_t len) {
#if !defined(__SSE__)
  naive::addTo(a, b, len);
#else
  internal::addToImpl(a, b, len);
#endif
}
/**
 * float specialization of batchAddTo.
 *
 * Dispatches at compile time: the SIMD implementation is used when the
 * compiler defines __SSE__, otherwise the call goes to naive::batchAddTo.
 */
template <>
inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
#if !defined(__SSE__)
  naive::batchAddTo(a, b, batch, len);
#else
  internal::batchAddToImpl(a, b, batch, len);
#endif
}
/**
 * float specialization of colMax.
 *
 * Dispatches at compile time: the SIMD implementation is used when the
 * compiler defines __SSE__, otherwise the call goes to naive::colMax.
 */
template <>
inline void colMax(float* result, const float* data, int dim, int numSamples) {
#if !defined(__SSE__)
  naive::colMax(result, data, dim, numSamples);
#else
  internal::colMaxImpl(result, data, dim, numSamples);
#endif
}
template <>

@ -18,7 +18,8 @@ limitations under the License. */
using namespace paddle; // NOLINT
TEST(SIMDFlags, gccTest) {
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
!defined(__arm__)
// clang-format off
CHECK(!__builtin_cpu_supports("sse") != HAS_SSE);
CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2);
@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX;
LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2;
LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512;
LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON;
}

Loading…
Cancel
Save