Merge pull request #2299 from Xreki/support_scalar_kernels

Centralize the use of simd intrinsic and implement scalar kernels.
8 years ago · e36e24d49c
parent 4537b7bc8c 7fb0684a91
commit e36e24d49c
12 changed files with 656 additions and 815 deletions
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@ -17,10 +17,9 @@ limitations under the License. */
 #include <stdio.h>
 #include "hl_base.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
-#include "hl_neon_matrix_kernel.cuh"
+#ifndef __CUDA_ARCH__
-#else
+#include "hl_cpu_matrix_kernel_detail.cuh"
 #include "hl_sse_matrix_kernel.cuh"
 #endif
 /**
@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_matrix_row_op(Agg agg, Op op, Saver sv,
                      int dimM, int dimN,
                      real *dst, int ld,
                      real *A, int lda) {
  for (int i = 0; i < dimM; i++) {
    real tmp = agg.init();
    for (int j = 0; j < dimN; j++) {
        tmp = agg(tmp, op(A[i * lda + j]));
    }
    dst[i*ld] = sv(dst[i*ld], tmp);
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_matrix_row_op(Agg agg, Op op, Saver sv,
                      int dimM, int dimN,
                      real *dst, int ld,
                      real *A, int lda,
                      real *B, int ldb) {
  for (int i = 0; i < dimM; i++) {
    real tmp = agg.init();
    for (int j = 0; j < dimN; j++) {
        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
    }
    dst[i*ld] = sv(dst[i*ld], tmp);
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
                          int dimM, int dimN,
--- a/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef HL_SSE_MATRIX_KERNEL_CUH_
+#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
-#define HL_SSE_MATRIX_KERNEL_CUH_
+#define HL_MATRIX_KERNEL_DETAIL_CUH_
 #include "hl_matrix_type.cuh"
 #define VECTOR_SIZE     16
 #ifndef PADDLE_TYPE_DOUBLE
 /* number of float in vector */
 #define     VECTOR_LEN      4
 #define     VECTOR_SET      _mm_set_ps1
 #else
 #if   defined(__APPLE__) || defined(__OSX__)
 #define     _mm_set_pd1     _mm_set1_pd
 #endif
 /* number of double in vector */
 #define     VECTOR_LEN      2
 #define     VECTOR_SET      _mm_set_pd1
 #endif
 inline bool hl_check_align(size_t size) {
  return !(size & (VECTOR_SIZE - 1));
 }
@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
-#ifndef PADDLE_TYPE_DOUBLE
+template <class Agg, class Op, class Saver>
-template <class Agg>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-inline real hl_agg_op(Agg agg, vecType mm) {
+                      int dimM, int dimN,
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
+                      real *dst, int ld,
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
+                      real *A, int lda) {
-  __m128 tmp1 = agg.vecOp(lo, hi);
+  for (int i = 0; i < dimM; i++) {
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+    real tmp = agg.init();
-  __m128 ret = agg.vecOp(tmp1, tmp2);
+    for (int j = 0; j < dimN; j++) {
        tmp = agg(tmp, op(A[i * lda + j]));
    }
    dst[i*ld] = sv(dst[i*ld], tmp);
  }
 }
-  return _mm_cvtss_f32(ret);
+template <class Agg, class Op, class Saver>
 void hl_matrix_row_op(Agg agg, Op op, Saver sv,
                      int dimM, int dimN,
                      real *dst, int ld,
                      real *A, int lda,
                      real *B, int ldb) {
  for (int i = 0; i < dimM; i++) {
    real tmp = agg.init();
    for (int j = 0; j < dimN; j++) {
        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
    }
    dst[i*ld] = sv(dst[i*ld], tmp);
  }
 }
-#else
+
-template <class Agg>
+template <class Agg, class Op, class Saver>
-inline real hl_agg_op(Agg agg, vecType mm) {
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
+                         int dimM, int dimN,
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
+                         real *dst,
-  __m128d ret = agg.vecOp(lo, hi);
+                         real *A, int lda) {
-
+  for (int j = 0; j < dimN; j++) {
-  return _mm_cvtsd_f64(ret);
+    real tmp = agg.init();
    for (int i = 0; i < dimM; i++) {
        tmp = agg(tmp, op(A[i * lda + j]));
    }
    dst[j] = sv(dst[j], tmp);
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_matrix_column_op(Agg agg, Op op, Saver sv,
                         int dimM, int dimN,
                         real *dst,
                         real *A, int lda,
                         real *B, int ldb) {
  for (int j = 0; j < dimN; j++) {
    real tmp = agg.init();
    for (int i = 0; i < dimM; i++) {
        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
    }
    dst[j] = sv(dst[j], tmp);
  }
 }
 #endif
 template <class Agg, class Op, class Saver>
 void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_matrix_column_op(Agg agg, Op op, Saver sv,
                         int dimM, int dimN,
                         real *dst,
                         real *A, int lda) {
  for (int j = 0; j < dimN; j++) {
    real tmp = agg.init();
    for (int i = 0; i < dimM; i++) {
        tmp = agg(tmp, op(A[i * lda + j]));
    }
    dst[j] = sv(dst[j], tmp);
  }
 }
 template <class Agg, class Op, class Saver>
 void hl_matrix_column_op(Agg agg, Op op, Saver sv,
                         int dimM, int dimN,
                         real *dst,
                         real *A, int lda,
                         real *B, int ldb) {
  for (int j = 0; j < dimN; j++) {
    real tmp = agg.init();
    for (int i = 0; i < dimM; i++) {
        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
    }
    dst[j] = sv(dst[j], tmp);
  }
 }
 /*
 * MaxRow greater than or equal dimN
 * dimN is multiples of VECTOR_LEN
@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
  }
 }
-#endif /* HL_SSE_MATRIX_KERNEL_CUH_ */
+#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_cpu_scalar.cuh
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
@ -0,0 +1,50 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_CPU_SCALAR_CUH_
 #define HL_CPU_SCALAR_CUH_
 #define VECTOR_SIMD false
 #define VECTOR_SET  hl_vec_set
 #ifndef PADDLE_TYPE_DOUBLE
 /* size of float */
 #define VECTOR_SIZE 4
 #else
 /* size of double */
 #define VECTOR_SIZE 8
 #endif
 typedef real vecType;
 /* Consider a real as a vector */
 #define VECTOR_LEN  1
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  return mm;
 }
 INLINE real hl_vec_set(const real r) {
  return r;
 }
 INLINE real hl_vec_classification_error(const real a,
                                        const real b,
                                        const real p,
                                        const real r) {
  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
 }
 #endif  // HL_CPU_SCALAR_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_neon.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
@ -0,0 +1,73 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_CPU_SIMD_NEON_CUH_
 #define HL_CPU_SIMD_NEON_CUH_
 #include <arm_neon.h>
 #define VECTOR_SIMD true
 #define VECTOR_SIZE 16
 #define VECTOR_SET  hl_vec_set
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float32x4_t vecType;
 /* number of float in vector */
 #define VECTOR_LEN  4
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  float32x4_t rev = vrev64q_f32(mm);
  float32x4_t tmp1 = agg.vecOp(rev, rev);
  float32x2_t lo = vget_high_f32(rev);
  float32x2_t hi = vget_low_f32(rev);
  float32x4_t tmp2 = vcombine_f32(hi, lo);
  float32x4_t ret = agg.vecOp(tmp1, tmp2);
  return vgetq_lane_f32(ret, 0);
 }
 inline float32x4_t hl_vec_set(const real f) {
  return vdupq_n_f32(f);
 }
 inline float32x4_t hl_vec_classification_error(const float32x4_t a,
                                               const float32x4_t b,
                                               const float32x4_t p,
                                               const float32x4_t r) {
  uint32x4_t tmp1 = vcgtq_f32(a, p);
  uint32x4_t tmp2 = vcgtq_f32(b, p);
  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
 }
 #else
 #ifdef __aarch64__
 typedef float64x2_t vecType;
 /* number of float in vector */
 #define VECTOR_LEN  2
 #define VECTOR_SET  vdupq_n_f64
 #error To be implemented
 #else
 #error NEON instructions does not support double precision
 #endif  // __aarch64__
 #endif
 #endif  // HL_CPU_SIMD_NEON_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_sse.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
@ -0,0 +1,94 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_CPU_SIMD_SSE_CUH_
 #define HL_CPU_SIMD_SSE_CUH_
 #include <mmintrin.h>
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #define VECTOR_SIMD true
 #define VECTOR_SIZE 16
 #define VECTOR_SET  hl_vec_set
 #ifndef PADDLE_TYPE_DOUBLE
 typedef __m128  vecType;
 /* number of float in vector */
 #define VECTOR_LEN  4
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  __m128 lo = _mm_unpacklo_ps(mm, mm);
  __m128 hi = _mm_unpackhi_ps(mm, mm);
  __m128 tmp1 = agg.vecOp(lo, hi);
  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
  __m128 ret = agg.vecOp(tmp1, tmp2);
  return _mm_cvtss_f32(ret);
 }
 inline __m128 hl_vec_set(const real f) {
  return _mm_set_ps1(f);
 }
 inline __m128 hl_vec_classification_error(const __m128 a,
                                          const __m128 b,
                                          const __m128 p,
                                          const __m128 r) {
  __m128 tmp1 = _mm_cmpgt_ps(a, p);
  __m128 tmp2 = _mm_cmpgt_ps(b, p);
  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
  return _mm_and_ps(tmp3, r);
 }
 #else
 typedef __m128d vecType;
 /* number of double in vector */
 #define VECTOR_LEN  2
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  __m128d lo = _mm_unpacklo_pd(mm, mm);
  __m128d hi = _mm_unpackhi_pd(mm, mm);
  __m128d ret = agg.vecOp(lo, hi);
  return _mm_cvtsd_f64(ret);
 }
 inline __m128d hl_vec_set(const real d) {
 #if defined(__APPLE__) || defined(__OSX__)
  return _mm_set1_pd(d);
 #else
  return _mm_set_pd1(d);
 #endif
 }
 inline __m128d hl_vec_classification_error(const __m128d a,
                                           const __m128d b,
                                           const __m128d p,
                                           const __m128d r) {
  __m128d tmp1 = _mm_cmpgt_pd(a, p);
  __m128d tmp2 = _mm_cmpgt_pd(b, p);
  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
  return _mm_and_pd(tmp3, r);
 }
 #endif
 #endif  // HL_CPU_SIMD_SSE_CUH_
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@ -18,26 +18,6 @@ limitations under the License. */
 #include "hl_matrix_type.cuh"
 #ifdef __CUDA_ARCH__
 /**
 * CUDA kernel inline function
 */
 #define INLINE   __device__ inline
 #else
 /**
 * CPP inline function
 */
 #define INLINE   inline
 #endif
 #ifndef PADDLE_TYPE_DOUBLE
 #define     DEVICE_FMAX     fmaxf
 #define     DEVICE_FMIN     fminf
 #else
 #define     DEVICE_FMAX     fmax
 #define     DEVICE_FMIN     fmin
 #endif
 class BaseOp {
 public:
  static const bool sse = false;
@ -66,10 +46,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
 #elif defined(__ARM__NEON__) || defined(__ARM_NEON)
 #include "hl_matrix_base_neon.cuh"
 #else
-#include "hl_matrix_base_sse.cuh"
+#include "hl_matrix_base_detail.cuh"
 #endif
 namespace aggregate {
@ -124,7 +102,7 @@ public:
  add2(const real s1, const real s2)
    : SSEAdd2(s1, s2), p1(s1), p2(s2) {}
  INLINE real operator()(const real a, const real b) const {
-     return p1 * a + p2 * b;
+    return p1 * a + p2 * b;
  }
 };
--- a/paddle/cuda/include/hl_matrix_base_detail.cuh
+++ b/paddle/cuda/include/hl_matrix_base_detail.cuh
@ -0,0 +1,153 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_MATRIX_BASE_DETAIL_CUH_
 #define HL_MATRIX_BASE_DETAIL_CUH_
 #include "hl_matrix_type.cuh"
 #include "hl_tensor_ops.h"
 namespace aggregate {
 class SSESum {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEMax {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::max<vecType>()(a, b);
  }
 };
 class SSEMin {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::min<vecType>()(a, b);
  }
 };
 }  // namespace aggregate
 namespace base {
 namespace unary {
 class SSEIdentity {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a) const {
    return a;
  }
 };
 }  // namespace unary
 namespace binary {
 class SSEAdd {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEAdd2 {
 public:
  static const bool sse = VECTOR_SIMD;
  const real p1;
  const real p2;
  vecType mp1;
  vecType mp2;
 public:
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
    mp1 = hl_vec_set(p1);
    mp2 = hl_vec_set(p2);
  }
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
  }
 };
 class SSESub {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::sub<vecType>()(a, b);
  }
 };
 class SSEMul {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::mul<vecType>()(a, b);
  }
 };
 class SSEDiv {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hppl::binary::div<vecType>()(a, b);
  }
 };
 class SSESquaredDiff {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    vecType tmp = hppl::binary::sub<vecType>()(a, b);
    return hppl::binary::mul<vecType>()(tmp, tmp);
  }
 };
 class SSEFirst {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return a;
  }
 };
 class SSESecond {
 public:
  static const bool sse = VECTOR_SIMD;
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return b;
  }
 };
 class SSEClassificationError {
 public:
  static const bool sse = VECTOR_SIMD;
  const real p;
  vecType mp;
  vecType result;
 public:
  explicit SSEClassificationError(const real s) : p(s) {
    mp = hl_vec_set(p);
    result = hl_vec_set(1.0f);
  }
  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return hl_vec_classification_error(a, b, mp, result);
  }
 };
 }  // namespace binary
 }  // namespace base
 #endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_matrix_base_neon.cuh
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@ -1,161 +0,0 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_MATRIX_BASE_NEON_CUH_
 #define HL_MATRIX_BASE_NEON_CUH_
 namespace aggregate {
 class SSESum {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vaddq_f32(a, b);
  }
 };
 class SSEMax {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vmaxq_f32(a, b);
  }
 };
 class SSEMin {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vminq_f32(a, b);
  }
 };
 }  // namespace aggregate
 namespace base {
 namespace unary {
 class SSEIdentity {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a) const {
    return a;
  }
 };
 }  // namespace unary
 namespace binary {
 class SSEAdd {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vaddq_f32(a, b);
  }
 };
 class SSEAdd2 {
 public:
  static const bool sse = true;
  const real p1;
  const real p2;
  float32x4_t mp1;
  float32x4_t mp2;
 public:
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
    mp1 = vdupq_n_f32(p1);
    mp2 = vdupq_n_f32(p2);
  }
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    float32x4_t tmp1, tmp2;
    tmp1 = vmulq_f32(mp1, a);
    tmp2 = vmulq_f32(mp2, b);
    return vaddq_f32(tmp1, tmp2);
  }
 };
 class SSESub {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vsubq_f32(a, b);
  }
 };
 class SSEMul {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return vmulq_f32(a, b);
  }
 };
 class SSEDiv {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    float32x4_t tmp;
    tmp = vrecpeq_f32(b);
    return vmulq_f32(a, tmp);
  }
 };
 class SSESquaredDiff {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    float32x4_t tmp;
    tmp = vsubq_f32(a, b);
    return vmulq_f32(tmp, tmp);
  }
 };
 class SSEFirst {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return a;
  }
 };
 class SSESecond {
 public:
  static const bool sse = true;
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    return b;
  }
 };
 class SSEClassificationError {
 public:
  static const bool sse = true;
  const real p;
  float32x4_t mp;
  uint32x4_t result;
 public:
  explicit SSEClassificationError(const real s) : p(s) {
    mp = vdupq_n_f32(p);
    result = vdupq_n_u32(1);
  }
  // TODO: to be check
  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
    uint32x4_t tmp1 = vcgtq_f32(a, mp);
    uint32x4_t tmp2 = vcgtq_f32(b, mp);
    uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
    return vcvtq_f32_u32(vandq_u32(tmp3, result));
  }
 };
 }  // namespace binary
 }  // namespace base
 #endif /* HL_MATRIX_BASE_NEON_CUH_ */
--- a/paddle/cuda/include/hl_matrix_base_sse.cuh
+++ b/paddle/cuda/include/hl_matrix_base_sse.cuh
@ -1,211 +0,0 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_MATRIX_BASE_SSE_CUH_
 #define HL_MATRIX_BASE_SSE_CUH_
 namespace aggregate {
 class SSESum {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_add_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_add_pd(a, b);
  }
 };
 class SSEMax {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_max_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_max_pd(a, b);
  }
 };
 class SSEMin {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_min_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_min_pd(a, b);
  }
 };
 }  // namespace aggregate
 namespace base {
 namespace unary {
 class SSEIdentity {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a) const {
    return a;
  }
  INLINE __m128d vecOp(const __m128d a) const {
    return a;
  }
 };
 }  // namespace unary
 namespace binary {
 class SSEAdd {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_add_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_add_pd(a, b);
  }
 };
 class SSEAdd2 {
 public:
  static const bool sse = true;
  const real p1;
  const real p2;
  union {__m128 f; __m128d d;} mp1;
  union {__m128 f; __m128d d;} mp2;
 public:
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
    if (sizeof(real) == sizeof(float)) {
      mp1.f = _mm_set1_ps(p1);
      mp2.f = _mm_set1_ps(p2);
    } else {
      mp1.d = _mm_set1_pd(p1);
      mp2.d = _mm_set1_pd(p2);
    }
  }
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    __m128 tmp1, tmp2;
    tmp1 = _mm_mul_ps(mp1.f, a);
    tmp2 = _mm_mul_ps(mp2.f, b);
    return _mm_add_ps(tmp1, tmp2);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    __m128d tmp1, tmp2;
    tmp1 = _mm_mul_pd(mp1.d, a);
    tmp2 = _mm_mul_pd(mp2.d, b);
    return _mm_add_pd(tmp1, tmp2);
  }
 };
 class SSESub {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_sub_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_sub_pd(a, b);
  }
 };
 class SSEMul {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_mul_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_mul_pd(a, b);
  }
 };
 class SSEDiv {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_div_ps(a, b);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_div_pd(a, b);
  }
 };
 class SSESquaredDiff {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return _mm_mul_ps(_mm_sub_ps(a, b), _mm_sub_ps(a, b));
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return _mm_mul_pd(_mm_sub_pd(a, b), _mm_sub_pd(a, b));
  }
 };
 class SSEFirst {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return a;
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return a;
  }
 };
 class SSESecond {
 public:
  static const bool sse = true;
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    return b;
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    return b;
  }
 };
 class SSEClassificationError {
 public:
  static const bool sse = true;
  const real p;
  union {__m128 f; __m128d d;} mp;
  union {__m128 f; __m128d d;} result;
 public:
  explicit SSEClassificationError(const real s) : p(s) {
    if (sizeof(real) == sizeof(float)) {
      mp.f = _mm_set1_ps(p);
      result.f = _mm_set1_ps(1.0f);
    } else {
      mp.d = _mm_set1_pd(p);
      result.d = _mm_set1_pd(1.0);
    }
  }
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    __m128 tmp1 = _mm_cmpgt_ps(a, mp.f);
    __m128 tmp2 = _mm_cmpgt_ps(b, mp.f);
    __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
    return _mm_and_ps(tmp3, result.f);
  }
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    __m128d tmp1 = _mm_cmpgt_pd(a, mp.d);
    __m128d tmp2 = _mm_cmpgt_pd(b, mp.d);
    __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
    return _mm_and_pd(tmp3, result.d);
  }
 };
 }  // namespace binary
 }  // namespace base
 #endif /* HL_MATRIX_BASE_SSE_CUH_ */
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@ -17,35 +17,35 @@ limitations under the License. */
 #include "hl_base.h"
-#if defined(__CUDA_ARCH__)
+#ifdef __CUDA_ARCH__
 /**
 * CUDA kernel inline function
 */
 #define INLINE   __device__ inline
 #else
 /**
 * CPP inline function
 */
 #define INLINE   inline
 #endif
 #ifdef __CUDA_ARCH__
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
-#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+#elif defined(__SSE3__)
-#include <arm_neon.h>
+#include "hl_cpu_simd_sse.cuh"
-#ifndef PADDLE_TYPE_DOUBLE
+#define PADDLE_USE_SSE3
-typedef float32x4_t  vecType;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
-#else
+// Currently nvcc does not support neon intrinsic.
-#error NEON instructions does not support double precision
+// TODO: Extract simd intrinsic implementation from .cu files.
-#endif
+#include "hl_cpu_simd_neon.cuh"
 #define PADDLE_USE_NEON
 #else
-#include <mmintrin.h>
+#include "hl_cpu_scalar.cuh"
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef __m128  vecType;
 #else
 typedef __m128d vecType;
 #endif
 #endif
 #ifdef __CUDA_ARCH__
 #define INLINE   __device__ inline
 #else
 #define INLINE   inline
 #endif
 #endif  // HL_MATRIX_TYPE_CUH_
--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@ -328,6 +328,208 @@ public:
  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
 #ifdef PADDLE_USE_SSE3
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_add_ps(a, b);
  }
 };
 template <>
 class add_scale<__m128> {
 private:
  const __m128 p1;
  const __m128 p2;
 public:
  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
  }
 };
 template <>
 class sub<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_sub_ps(a, b);
  }
 };
 template <>
 class mul<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_mul_ps(a, b);
  }
 };
 template <>
 class div<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_div_ps(a, b);
  }
 };
 template <>
 class min<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_min_ps(a, b);
  }
 };
 template <>
 class max<__m128> {
 public:
  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
    return _mm_max_ps(a, b);
  }
 };
 #else
 template <>
 class add<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_add_pd(a, b);
  }
 };
 template <>
 class add_scale<__m128d> {
 private:
  const __m128d p1;
  const __m128d p2;
 public:
  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
  }
 };
 template <>
 class sub<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_sub_pd(a, b);
  }
 };
 template <>
 class mul<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_mul_pd(a, b);
  }
 };
 template <>
 class div<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_div_pd(a, b);
  }
 };
 template <>
 class min<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_min_pd(a, b);
  }
 };
 template <>
 class max<__m128d> {
 public:
  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
    return _mm_max_pd(a, b);
  }
 };
 #endif  // PADDLE_TYPE_DOUBLE
 #endif  // PADDLE_USE_SSE3
 #ifdef PADDLE_USE_NEON
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vmulq_f32(a, b);
  }
 };
 template <>
 class add_scale<float32x4_t> {
 private:
  const float32x4_t p1;
  const float32x4_t p2;
 public:
  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
      : p1(s1), p2(s2) {}
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
  }
 };
 template <>
 class sub<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vsubq_f32(a, b);
  }
 };
 template <>
 class mul<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vmulq_f32(a, b);
  }
 };
 template <>
 class div<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    float32x4_t tmp = vrecpeq_f32(b);
    return vmulq_f32(a, tmp);
  }
 };
 template <>
 class min<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vminq_f32(a, b);
  }
 };
 template <>
 class max<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vmaxq_f32(a, b);
  }
 };
 #else
 #error To be implemented
 #endif  // PADDLE_TYPE_DOUBLE
 #endif  // PADDLE_USE_NEON
 }  // namespace binary
 }  // namespace hppl