From e4733224c9e6c6c2eede669e9cdbf17e7be86501 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Fri, 20 Oct 2017 14:08:51 -0700
Subject: [PATCH 01/16] initial commit for float16

---
 paddle/math/float16.h | 142 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 paddle/math/float16.h

diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000..84e533d1fc
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+#ifdef __arm__
+#define PADDLE_ARM_32
+#endif
+
+#ifdef __aarch64__
+#define PADDLE_ARM_64
+#endif
+
+#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32)
+#define PADDLE_NEON_32
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64)
+#define PADDLE_NEON_64
+#endif
+
+#ifdef __CUDA_ARCH__ // use __CUDACC__ instead
+#define PADDLE_HOSTDEVICE __host__ __device__
+#if CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include
+#endif // CUDA_VERSION >= 7050
+#else
+#define PADDLE_HOSTDEVICE
+#endif // __CUDA_ARCH__
+
+#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM)
+#include
+#else
+#ifdef __F16C__
+#undef __F16C__
+#endif
+#endif
+
+#define PADDLE_ALIGNED(x) __attribute__((aligned(x)))
+
+// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h
+template
+To convert(From f) {
+  return static_cast(f);
+}
+
+namespace paddle {
+
+class float16;
+
+// convert from float to half precision in round-to-nearest-even mode
+float16 float2half_rn(float f);
+float half2float(float16 h);
+
+class float16 {
+public:
+  uint16_t val_;
+
+  PADDLE_HOSTDEVICE inline explicit float16() : x(0) {}
+
+  PADDLE_HOSTDEVICE inline explicit float16(float val) {
+    float16 res = float2half_rn(val);
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(int val) {
+    float16 res = cpu_float2half_rn(static_cast(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(double val) {
+    float16 res = cpu_float2half_rn(static_cast(val));
+    x = res.x;
+  }
+
+  // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
+  // and aligned at least on a 2-byte boundary, which leads to efficient
+  // memory access of float16 struct.
+} PADDLE_ALIGNED(2); + +namespace fp16_impl { + +// Conversion routine adapted from +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +Union Bits { + float f; + int32_t si; + uint32_t ui; +}; + +static const int shift = 13; +static const int shiftSign = 16; + +static const int32_t infN = 0x7F800000; +static const int32_t maxN = 0x477FE000; // max flt16 as flt32 +static const int32_t minN = 0x38800000; // min flt16 normal as flt32 +static const int32_t sigN = 0x80000000; // sign bit + +static constexpr int32_t infC = infN >> shift; +static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 +static constexpr int32_t maxC = maxN >> shift; +static constexpr int32_t minC = minN >> shift; +static constexpr int32_t sigC = sigN >> shiftSign; + +static const int32_t mulN = 0x52000000; //(1 << 23) / minN +static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +static const int32_t norC = 0x00400; // min flt32 normal downshifted + +static constexpr int32_t maxD = infC - maxC - 1; +static constexpr int32_t minD = minC - subC - 1; + +} // namespace half_impl + +} // namespace paddle From a208dd64ae1d7a5662a1bf4728162d8123aa89bf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 30 Oct 2017 17:39:18 -0700 Subject: [PATCH 02/16] add float16 data type --- paddle/math/float16.h | 365 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 320 insertions(+), 45 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84e533d1fc..84fe613d51 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,21 @@ limitations under the License. */ #include #include -#include +#include // seems need to delete it + +#ifdef USE_EIGEN // delete this #if macro +#include "Eigen/src/Core/arch/CUDA/Half.h" +#endif + +#ifdef __CUDACC__ +#define PADDLE_HOSTDEVICE __host__ __device__ +#if CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif // CUDA_VERSION >= 7050 +#else +#define PADDLE_HOSTDEVICE +#endif // __CUDA_ARCH__ #ifdef __arm__ #define PADDLE_ARM_32 @@ -44,15 +58,9 @@ limitations under the License. */ #define PADDLE_NEON_64 #endif -#ifdef __CUDA_ARCH__ // use __CUDACC__ instead -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#include +#endif #if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) #include @@ -62,7 +70,7 @@ limitations under the License. 
*/ #endif #endif -#define PADDLE_ALIGNED(x) __attribute__((aligned(x))) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) // https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h template @@ -72,70 +80,337 @@ To convert(From f) { namespace paddle { -class float16; +struct float16; +namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode -float16 float2half_rn(float f); -float half2float(float16 h); +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); +PADDLE_HOSTDEVICE inline float half_to_float(float16 h); +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); +} // namespace fp16_impl -class float16 { -public: - uint16_t val_; +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half and Eigen::half data types. +struct PADDLE_ALIGN(2) float16 { + uint16_t x; - PADDLE_HOSTDEVICE inline explicit float16() : x(0) {} + // explicit for different types, implicit for half and Eigen::half + + PADDLE_HOSTDEVICE inline float16() {} + + PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16(const half h) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&h)->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 +/* +#ifdef PADDLE_CUDA_FP16 + #if CUDA_VERSION < 9000 + PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} + #else + PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} + PADDLE_HOSTDEVICE inline float16(const half& h) + : x(*reinterpret_cast(&h)) {} + #endif // CUDA_VERSION < 9000 +#endif // PADDLE_CUDA_FP16 +*/ + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 in arm_fp16.h + // which is included in arm_neon.h + PADDLE_HOSTDEVICE inline float16(const float16_t h) { + x = *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = float2half_rn(val); + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + } + + template + PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } + PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + x = rhs.x; + return *this; + } + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&rhs)->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + x = rhs.x; + return *this; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + +/* PADDLE_HOSTDEVICE inline explicit float16(int val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } +*/ + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline operator half() { +#if CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated - // and aligned at least on a 2-byte boundary, which leads to efficient - // memory access of float16 struct. -} PADDLE_ALIGNED(2); +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline operator Eigen::half() { + Eigen::half h; + h.x = x; + return h; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline operator float16_t() { + float16 h = *this; + return *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit operator bool() { + return (x & 0x7fff) != 0; + } + + PADDLE_HOSTDEVICE inline explicit operator int8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator float() { + return fp16_impl::half_to_float(*this); + } + + PADDLE_HOSTDEVICE inline explicit operator double() { + return static_cat(fp16_impl::half_to_float(*this)); + } +}; + +// arithmetic operators +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +__device__ inline float16 operator+(const float16& a, const float16& b) { + return 
float16(__hadd(a, b)); +} + +__device__ inline float16 operator-(const float16& a, const float16& b) { + return __hsub(a, b); +} + +__device__ inline float16 operator*(const float16& a, const float16& b) { + return __hmul(a, b); +} + +#elif // on arm cpu + +#else + +#endif namespace fp16_impl { -// Conversion routine adapted from -// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Union Bits { float f; int32_t si; uint32_t ui; }; -static const int shift = 13; -static const int shiftSign = 16; +const int shift = 13; +const int shiftSign = 16; + +const int32_t infN = 0x7F800000; +const int32_t maxN = 0x477FE000; // max flt16 as flt32 +const int32_t minN = 0x38800000; // min flt16 normal as flt32 +const int32_t sigN = 0x80000000; // sign bit + +constexpr int32_t infC = infN >> shift; +constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 +constexpr int32_t maxC = maxN >> shift; +constexpr int32_t minC = minN >> shift; +constexpr int32_t sigC = sigN >> shiftSign; + +const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +const int32_t norC = 0x00400; // min flt32 normal downshifted + +constexpr int32_t maxD = infC - maxC - 1; +constexpr int32_t minD = minC - subC - 1; + +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(f); + return *reinterpret_cast(&(tmp)); + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + +#elif defined(PADDLE_ARM_64) // test on RPI + float16 res; + asm volatile( + "ld1 {v0.s}[0], [%[float_ptr]]\n" + "FCVT h0, s0\n" + "st1 {v0.h}[0], [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; -static const int32_t infN = 0x7F800000; -static const int32_t maxN = 0x477FE000; // max flt16 as flt32 -static const int32_t minN = 0x38800000; // min flt16 normal as flt32 -static const int32_t sigN = 0x80000000; // sign bit +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = f; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + float16 res; + res.x = v.ui | sign; + return res; -static constexpr int32_t infC = infN >> shift; -static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 -static constexpr int32_t maxC = maxN >> shift; -static constexpr int32_t minC = minN >> shift; -static constexpr int32_t sigC = sigN >> shiftSign; +#endif +} -static const int32_t mulN = 0x52000000; //(1 << 23) / minN -static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -static const int32_t norC = 0x00400; // min flt32 normal downshifted +PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + 
half tmp = *reinterpret_cast(&h); + return __half2float(h); + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + +#elif defined(PADDLE_ARM_64) // test on RPI + float res; + asm volatile( + "ld1 {v0.h}[0], [%[half_ptr]]\n" + "FCVT s0, h0\n" + "st1 {v0.s}[0], [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; -static constexpr int32_t maxD = infC - maxC - 1; -static constexpr int32_t minD = minC - subC - 1; +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif +} + +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { + float16 res; + res.x = x; + return res; +} } // namespace half_impl From 9d8b30596491930c6137e56d7883370bff24d2c8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 7 Nov 2017 13:19:59 -0800 Subject: [PATCH 03/16] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84fe613d51..5fe2854969 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include -#include // seems need to delete it +#include #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -32,7 +32,7 @@ limitations under the License. */ #endif // CUDA_VERSION >= 7050 #else #define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#endif // __CUDACC__ #ifdef __arm__ #define PADDLE_ARM_32 From e877cdb8f930cbcd4112a9224232efd898a780b5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 13 Nov 2017 23:06:07 -0800 Subject: [PATCH 04/16] add float16 arithmetic on arm cpu --- paddle/math/float16.h | 479 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 389 insertions(+), 90 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 5fe2854969..ae7d9754aa 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// need to define PADDLE_ARM_FP16 + #pragma once #include @@ -24,6 +26,18 @@ limitations under the License. */ #include "Eigen/src/Core/arch/CUDA/Half.h" #endif +#ifdef __GNUC__ +#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) +#else +#define PADDLE_GNUC_VER 0 +#endif // __GNUC__ + +#ifdef __clang__ +#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) +#else +#define PADDLE_CLANG_VER 0 +#endif // __clang__ + #ifdef __CUDACC__ #define PADDLE_HOSTDEVICE __host__ __device__ #if CUDA_VERSION >= 7050 @@ -48,6 +62,7 @@ limitations under the License. */ #if defined(__ARM_NEON) || defined(__ARM_NEON__) #define PADDLE_NEON +#include #endif #if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) @@ -58,26 +73,16 @@ limitations under the License. 
*/ #define PADDLE_NEON_64 #endif -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) -#include -#endif - -#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) -#include -#else +#ifdef PADDLE_ARM #ifdef __F16C__ #undef __F16C__ -#endif -#endif +#endif // __F16C__ +#else +#include +#endif // PADDLE_ARM #define PADDLE_ALIGN(x) __attribute__((aligned(x))) -// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h -template -To convert(From f) { - return static_cast(f); -} - namespace paddle { struct float16; @@ -86,13 +91,12 @@ namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); } // namespace fp16_impl // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible -// with CUDA half and Eigen::half data types. +// with CUDA half, ARM float16_t, and Eigen::half data types. struct PADDLE_ALIGN(2) float16 { uint16_t x; @@ -103,7 +107,7 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half h) { + PADDLE_HOSTDEVICE inline float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -111,40 +115,72 @@ struct PADDLE_ALIGN(2) float16 { #endif // CUDA_VERSION >= 9000 } #endif // PADDLE_CUDA_FP16 -/* -#ifdef PADDLE_CUDA_FP16 - #if CUDA_VERSION < 9000 - PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} - #else - PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} - PADDLE_HOSTDEVICE inline float16(const half& h) - : x(*reinterpret_cast(&h)) {} - #endif // CUDA_VERSION < 9000 -#endif // PADDLE_CUDA_FP16 -*/ #ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#ifdef PADDLE_NEON // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h - // which is included in arm_neon.h - PADDLE_HOSTDEVICE inline float16(const float16_t h) { - x = *reinterpret_cast(&h); + // float16_t is an alias for __fp16 in arm_fp16.h, + // which is included in arm_neon.h. + // According to gcc, __fp16 can only be used as an argument to fp16 + // intrinsic defined in arm_neon.h or as a storage type. It cannot + // be used as a formal function argument. + // TODO (kexinzhao): test it on RPI + PADDLE_HOSTDEVICE inline float16(const float16_t* h) { + x = *reinterpret_cast(h); } #endif PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} + PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + PADDLE_HOSTDEVICE inline explicit float16(float val) { float16 res = fp16_impl::float_to_half_rn(val); x = res.x; } - template - PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + PADDLE_HOSTDEVICE inline explicit float16(double val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } @@ -155,7 +191,7 @@ struct PADDLE_ALIGN(2) float16 { } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { + PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -172,27 +208,80 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { - x = *reinterpret_cast(&rhs); +#ifdef PADDLE_NEON + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { + x = *reinterpret_cast(rhs); return *this; } #endif -/* - PADDLE_HOSTDEVICE inline explicit float16(int val) { + PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 
0x3c00 : 0; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; } - PADDLE_HOSTDEVICE inline explicit float16(double val) { + PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(float val) { + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(double val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; } -*/ #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() { + PADDLE_HOSTDEVICE inline operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -206,82 +295,270 @@ struct PADDLE_ALIGN(2) float16 { #endif // PADDLE_CUDA_FP16 #ifdef USE_EIGEN - PADDLE_HOSTDEVICE inline operator Eigen::half() { + PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline operator float16_t() { +#ifdef PADDLE_NEON + // check whether it works or not + PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() { + PADDLE_HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - PADDLE_HOSTDEVICE inline explicit operator int8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE 
inline explicit operator int32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator float() { + PADDLE_HOSTDEVICE inline explicit operator float() const { return fp16_impl::half_to_float(*this); } - PADDLE_HOSTDEVICE inline explicit operator double() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator double() const { + return static_cast(fp16_impl::half_to_float(*this)); } }; // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(a, b)); + return float16(__hadd(half(a), half(b))); } __device__ inline float16 operator-(const float16& a, const float16& b) { - return __hsub(a, b); + return float16(__hsub(half(a), half(b))); } __device__ inline float16 operator*(const float16& a, const float16& b) { - return __hmul(a, b); + return float16(__hmul(half(a), half(b))); } -#elif // on arm cpu +__device__ inline float16 operator/(const float16& a, const float16& b) { + // TODO(kexinzhao): check the cuda version that starts to support __hdiv + // instinsic + float num = __half2float(half(a)); + float denom = __half2float(half(b)); + return float16(num / denom); +} -#else +__device__ inline float16 operator-(const float16& a) { + return float16(__hneg(half(a))); +} + +__device__ inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__device__ inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +__device__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__device__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__device__ inline bool operator==(const float16& a, const float16& b) { + return __heq(half(a), half(b)); +} + +__device__ inline bool operator!=(const float16& a, const float16& b) { + return __hne(half(a), half(b)); +} + +__device__ inline bool operator<(const float16& a, const float16& b) { + return __hlt(half(a), half(b)); +} + +__device__ inline bool operator<=(const float16& a, const float16& b) { + return __hle(half(a), half(b)); +} + +__device__ inline bool operator>(const float16& a, const float16& b) { + return __hgt(half(a), half(b)); +} + +__device__ inline bool operator>=(const float16& a, const float16& b) { + return __hge(half(a), half(b)); +} + +// On ARMv8.2-A CPU +#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ + defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +__host__ inline float16 operator+(const float16& a, const float16& b) { + return float16(vaddh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a, const float16& b) { + 
return float16(vsubh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator*(const float16& a, const float16& b) { + return float16(vmulh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator/(const float16& a, const float16& b) { + return float16(vdivh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a) { + return float16(vnegh_f16(float16_t(a))); +} + +__host__ inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__host__ inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +__host__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__host__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__host__ inline bool operator==(const float16& a, const float16& b) { + return static_cast(vceqh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +// compare only available in NEON_64 +__host__ inline bool operator<(const float16& a, const float16& b) { + return static_cast(vclth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator<=(const float16& a, const float16& b) { + return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>(const float16& a, const float16& b) { + return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>=(const float16& a, const float16& b) { + return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); +} + +#else // software emulation on other cpu +PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(float(a) + float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(float(a) - float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(float(a) * float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(float(a) / float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { + a = float16(float(a) + float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { + a = float16(float(a) - float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { + a = float16(float(a) * float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { + a = float16(float(a) / float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { + return float(a) == float(b); +} + +PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { + return float(a) != float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { + return float(a) < float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { + return float(a) <= float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { + return float(a) > float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { + return float(a) >= float(b); +} #endif @@ -320,16 +597,11 
@@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&(tmp)); -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) // test on RPI float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" - "FCVT h0, s0\n" + "fcvt h0, s0\n" "st1 {v0.h}[0], [%[half_ptr]]\n" : // outputs : // inputs @@ -339,6 +611,25 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) // test on RPI + float16 res; + asm volatile( + "vld1.32 {d0[0]}, [%[float_ptr]]\n" + "vcvt.f16.f32 d0, q0\n" + "vst1.16 {d0[0]}, [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "d0"); + return res; + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + #else // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -367,10 +658,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { half tmp = *reinterpret_cast(&h); return __half2float(h); -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" @@ -384,6 +672,23 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) + float res; + asm volatile( + "vld1.16 {d0[0]}, [%[half_ptr]]\n" + "vcvt.f32.f16 q0, d0\n" + "vst1.32 {d0[0]}, [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + #else // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -406,12 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { - float16 res; - res.x = x; - return res; -} - } // namespace half_impl } // namespace paddle From af37838edf4a3ad3c1f098d4026218c130258ac2 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 15 Nov 2017 22:48:01 -0800 Subject: [PATCH 05/16] add test for float16 --- paddle/math/float16.h | 16 ++++++++-------- paddle/math/tests/CMakeLists.txt | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index ae7d9754aa..e9d4e6737d 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include #include -#include +#define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - // explicit for different types, implicit for half and Eigen::half - PADDLE_HOSTDEVICE inline float16() {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. @@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; @@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) { __device__ inline float16 operator/(const float16& a, const float16& b) { // TODO(kexinzhao): check the cuda version that starts to support __hdiv - // instinsic float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); @@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1; PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = __float2half(f); - return *reinterpret_cast(&(tmp)); + return *reinterpret_cast(&tmp); #elif defined(PADDLE_NEON_64) // test on RPI float16 res; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index d8b7f9e3fc..ab4ac38b3c 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -21,7 +21,7 @@ if(WITH_GPU) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -33,3 +33,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +add_simple_unittest(test_float16) From 4f1aa5bc0ee3c00fa792cfe188fabaab290938b1 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 09:17:09 -0800 Subject: [PATCH 06/16] add test cases --- paddle/math/float16.h | 12 ++--- paddle/math/tests/test_float16.cpp | 78 ++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/float16.h b/paddle/math/float16.h index e9d4e6737d..9c06b423ef 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro -#include "Eigen/src/Core/arch/CUDA/Half.h" +#include "unsupported/Eigen/CXX11/Tensor" #endif #ifdef __GNUC__ @@ -126,7 +126,7 @@ struct PADDLE_ALIGN(2) float16 { // According to gcc, __fp16 can only be used as an argument to fp16 // intrinsic defined in arm_neon.h or as a storage type. It cannot // be used as a formal function argument. - // TODO (kexinzhao): test it on RPI + // TODO(kexinzhao): test it on RPI PADDLE_HOSTDEVICE inline float16(const float16_t* h) { x = *reinterpret_cast(h); } @@ -564,7 +564,7 @@ PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { namespace fp16_impl { -Union Bits { +union Bits { float f; int32_t si; uint32_t ui; @@ -584,7 +584,7 @@ constexpr int32_t maxC = maxN >> shift; constexpr int32_t minC = minN >> shift; constexpr int32_t sigC = sigN >> shiftSign; -const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulN = 0x52000000; // (1 << 23) / minN const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) const int32_t subC = 0x003FF; // max flt32 subnormal downshifted const int32_t norC = 0x00400; // min flt32 normal downshifted @@ -693,7 +693,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Bits v; - v.ui = x; + v.ui = h.x; int32_t sign = v.si & sigC; v.si ^= sign; sign <<= shiftSign; @@ -711,6 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -} // namespace half_impl +} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..79f63d3a80 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +#ifdef PADDLE_CUDA_FP16 +TEST(float16, gpu) { + // Conversion to and from cuda half + float16 v1 = half(float16(1.0f)); + EXPECT_EQ(v1.x, 0x3c00); + + // Conversion to and from Eigen::half + float16 v2 = Eigen::half(float16(0.5f)); + EXPECT_EQ(v2.x, 0x3800); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +#endif + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle From 979d2e0b092a1378290ddae421f8793d00fd0938 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 10:05:30 -0800 Subject: [PATCH 07/16] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 9c06b423ef..3275546e69 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -426,8 +426,8 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ - defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } From 22dfa5fa8aaec63753c73848813e280560a8152f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 14:39:49 -0800 Subject: [PATCH 08/16] fix GPU compiling --- paddle/math/float16.h | 12 ++++++------ paddle/math/tests/CMakeLists.txt | 5 ++++- .../math/tests/{test_float16.cpp => test_float16.cu} | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) rename paddle/math/tests/{test_float16.cpp => test_float16.cu} (98%) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3275546e69..6799a83bd3 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -118,8 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) 
: x(h.x) {} #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. @@ -207,8 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,8 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index ab4ac38b3c..dc06f99090 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -22,15 +22,18 @@ if(WITH_GPU) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) + CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) + link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) + compile_cu_as_cpp(test_float16.cu) + add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) -add_simple_unittest(test_float16) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cu similarity index 98% rename from paddle/math/tests/test_float16.cpp rename to paddle/math/tests/test_float16.cu index 79f63d3a80..40bc54f5b4 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ namespace paddle { #ifdef PADDLE_CUDA_FP16 -TEST(float16, gpu) { +TEST(float16, conversion_gpu) { // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From 080ff0c83200a229fb032cd03d4d900b634b1b02 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 16:28:33 -0800 Subject: [PATCH 09/16] two tests for cpu and gpu separately --- paddle/math/tests/CMakeLists.txt | 6 ++-- paddle/math/tests/test_float16.cpp | 47 ++++++++++++++++++++++++++++++ paddle/math/tests/test_float16.cu | 32 +------------------- 3 files changed, 50 insertions(+), 35 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index dc06f99090..c131544515 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,21 +18,19 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) + nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) - CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) - link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) - compile_cu_as_cpp(test_float16.cu) - add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) +cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..8d4279b413 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 40bc54f5b4..6c0a1c351c 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -39,40 +39,10 @@ TEST(float16, conversion_gpu) { // Conversion from bool } +#endif TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } -#endif - -TEST(float16, conversion_cpu) { - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); - - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); - - // Conversion from double - - // Conversion from int - - // Conversion from bool -} - -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } - -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } } // namespace paddle From 734cac1a53b904c7d3f76fe66cee1b2d19632dcf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Fri, 17 Nov 2017 00:04:58 -0800 Subject: [PATCH 10/16] fix CUDA_VERSION issue --- paddle/math/float16.h | 29 ++++++++++++++++++++++++++++- paddle/math/tests/test_float16.cpp | 2 ++ paddle/math/tests/test_float16.cu | 2 ++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 6799a83bd3..1922192f7b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,6 +20,10 @@ limitations under the License. */ #include #include +#include + +#include "paddle/utils/Logging.h" + #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro @@ -48,6 +52,27 @@ limitations under the License. 
*/ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ +#define STR(x) #x +#define XSTR(x) STR(x) + +#ifndef __CUDACC__ +#pragma message "__CUDACC__ not defined" +#else +#pragma message "__CUDACC__ defined" +#endif + +#ifndef CUDA_VERSION +#pragma message "CUDA_VERSION not defined" +#else +#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) +#endif + +#ifdef __CUDA_ARCH__ +#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) +#else +#pragma message "CUDA ARCH NOT DEFINED!" +#endif + #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -359,6 +384,7 @@ struct PADDLE_ALIGN(2) float16 { // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { + printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -495,6 +521,7 @@ __host__ inline bool operator>=(const float16& a, const float16& b) { #else // software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -656,7 +683,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = *reinterpret_cast(&h); - return __half2float(h); + return __half2float(tmp); #elif defined(PADDLE_NEON_64) float res; diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8d4279b413..1a20d0e925 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -15,6 +15,8 @@ limitations under the License. */ namespace paddle { TEST(float16, conversion_cpu) { + LOG(INFO) << "cpu test started!"; + // Conversion to and from Eigen::half EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 6c0a1c351c..9ca77cf86c 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -16,6 +16,8 @@ namespace paddle { #ifdef PADDLE_CUDA_FP16 TEST(float16, conversion_gpu) { + LOG(INFO) << "GPU tests started"; + // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From 0f4bf1c939cea4bd3c7516eb5a9787b05563cea0 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Sun, 19 Nov 2017 03:00:38 -0800 Subject: [PATCH 11/16] Add GPU device code for testing --- paddle/math/float16.h | 71 ++-------- paddle/math/tests/test_float16.cpp | 102 ++++++++++++-- paddle/math/tests/test_float16.cu | 217 +++++++++++++++++++++++++---- 3 files changed, 296 insertions(+), 94 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 1922192f7b..a1c341113f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// need to define PADDLE_ARM_FP16 - #pragma once #include @@ -21,14 +19,7 @@ limitations under the License. 
*/ #include #include - -#include "paddle/utils/Logging.h" - -#define USE_EIGEN - -#ifdef USE_EIGEN // delete this #if macro #include "unsupported/Eigen/CXX11/Tensor" -#endif #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) @@ -52,27 +43,6 @@ limitations under the License. */ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ -#define STR(x) #x -#define XSTR(x) STR(x) - -#ifndef __CUDACC__ -#pragma message "__CUDACC__ not defined" -#else -#pragma message "__CUDACC__ defined" -#endif - -#ifndef CUDA_VERSION -#pragma message "CUDA_VERSION not defined" -#else -#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) -#endif - -#ifdef __CUDA_ARCH__ -#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) -#else -#pragma message "CUDA ARCH NOT DEFINED!" -#endif - #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -113,7 +83,7 @@ namespace paddle { struct float16; namespace fp16_impl { -// convert from float to half precision in round-to-nearest-even mode +// Convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); } // namespace fp16_impl @@ -125,7 +95,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - PADDLE_HOSTDEVICE inline float16() {} + PADDLE_HOSTDEVICE inline float16() : x(0) {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -139,21 +109,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - // According to gcc, __fp16 can only be used as an argument to fp16 - // intrinsic defined in arm_neon.h or as a storage type. It cannot - // be used as a formal function argument. 
- // TODO(kexinzhao): test it on RPI - PADDLE_HOSTDEVICE inline float16(const float16_t* h) { - x = *reinterpret_cast(h); + PADDLE_HOSTDEVICE inline float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif @@ -225,17 +189,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { - x = *reinterpret_cast(rhs); + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif @@ -319,17 +281,14 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); @@ -381,10 +340,9 @@ struct PADDLE_ALIGN(2) float16 { } }; -// arithmetic operators +// Arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -452,7 +410,7 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ +#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); @@ -502,7 +460,7 @@ __host__ inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -// compare only available in NEON_64 +#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { return static_cast(vclth_f16(float16_t(a), float16_t(b))); } @@ -518,10 +476,10 @@ __host__ inline bool operator>(const float16& a, const float16& b) { __host__ inline bool operator>=(const float16& a, const float16& b) { return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); } +#endif // PADDLE_NEON_64 -#else // software emulation on other cpu +#else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { - LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -624,7 +582,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&tmp); -#elif defined(PADDLE_NEON_64) // test on RPI +#elif defined(PADDLE_NEON_64) float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" @@ -638,7 +596,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; -#elif defined(PADDLE_NEON_32) // test on RPI +#elif defined(PADDLE_NEON_32) float16 res; asm volatile( "vld1.32 {d0[0]}, [%[float_ptr]]\n" @@ -689,7 +647,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" - "FCVT s0, h0\n" + "fcvt s0, h0\n" "st1 {v0.s}[0], [%[float_ptr]]\n" : // outputs : // inputs @@ -739,5 +697,4 @@ PADDLE_HOSTDEVICE 
inline float half_to_float(float16 h) { } } // namespace fp16_impl - } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 1a20d0e925..8c74bcc039 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -9,22 +9,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" +#include + namespace paddle { TEST(float16, conversion_cpu) { - LOG(INFO) << "cpu test started!"; - - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + // Explicit conversion from Eigen::half + EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); // Conversion from float EXPECT_EQ(float16(1.0f).x, 0x3c00); @@ -36,14 +35,91 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(65536.0f).x, 0x7c00); // Conversion from double + EXPECT_EQ(float16(1.0).x, 0x3c00); + EXPECT_EQ(float16(0.5).x, 0x3800); + EXPECT_EQ(float16(0.33333).x, 0x3555); + EXPECT_EQ(float16(0.0).x, 0x0000); + EXPECT_EQ(float16(-0.0).x, 0x8000); + EXPECT_EQ(float16(65504.0).x, 0x7bff); + EXPECT_EQ(float16(65536.0).x, 0x7c00); // Conversion from int + EXPECT_EQ(float16(-1).x, 0xbc00); + EXPECT_EQ(float16(0).x, 0x0000); + EXPECT_EQ(float16(1).x, 0x3c00); + EXPECT_EQ(float16(2).x, 0x4000); + EXPECT_EQ(float16(3).x, 0x4200); // Conversion from bool + EXPECT_EQ(float16(true).x, 0x3c00); + EXPECT_EQ(float16(false).x, 0x0000); + + // Implicit conversion to and from Eigen::half + Eigen::half tmp = float16(1.0f); + float16 v_conv = tmp; + EXPECT_EQ(tmp.x, 0x3c00); + EXPECT_EQ(v_conv.x, 0x3c00); + + // Default constructor + float16 v_def; + EXPECT_EQ(v_def.x, 0x0000); + + // Assignment operator + float16 v_assign; + v_assign = v_def; + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = Eigen::half(1.0f); + EXPECT_EQ(v_assign.x, 0x3c00); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3800); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3555); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbc00); + v_assign = true; + EXPECT_EQ(v_assign.x, 0x3c00); + + // Conversion operator + EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); + EXPECT_EQ(float(float16(0.5f)), 0.5f); + EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(int(float16(-1)), -1); + EXPECT_EQ(bool(float16(true)), true); } -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, arithmetic_cpu) { + EXPECT_EQ(float(float16(1) + float16(1)), 2); + EXPECT_EQ(float(float16(5) + float16(-5)), 0); + EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); + EXPECT_EQ(float(float16(3) - float16(5)), -2); + EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); + 
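  // Note on tolerances: binary16 keeps only 10 explicit mantissa bits, so the
  // relative rounding error is about 2^-11 (roughly three decimal digits),
  // which is why these checks use EXPECT_NEAR with 1e-3/1e-2 bounds instead
  // of exact equality.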
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); + EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(float(-float16(512.0f)), -512.0f); + EXPECT_EQ(float(-float16(-512.0f)), 512.0f); +} -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparison_cpu) { + EXPECT_TRUE(float16(1.0f) == float16(1.0f)); + EXPECT_FALSE(float16(-1.0f) == float16(-0.5f)); + EXPECT_TRUE(float16(1.0f) != float16(0.5f)); + EXPECT_FALSE(float16(-1.0f) != float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) < float16(2.0f)); + EXPECT_FALSE(float16(-1.0f) < float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) <= float16(1.0f)); + EXPECT_TRUE(float16(2.0f) > float16(1.0f)); + EXPECT_FALSE(float16(-2.0f) > float16(-2.0f)); + EXPECT_TRUE(float16(2.0f) >= float16(2.0f)); + + EXPECT_TRUE(float16(0.0f) == float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) <= float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) >= float16(-0.0f)); + EXPECT_FALSE(float16(0.0f) < float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) < float16(0.0f)); + EXPECT_FALSE(float16(0.0f) > float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) > float16(0.0f)); +} } // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 9ca77cf86c..941f266603 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -9,42 +9,211 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" -namespace paddle { +#include + +#include "paddle/utils/Logging.h" + +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type( \ + const float16* in1, const float16* in2, float16* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(float16* in1, const float16* in2) { \ + in1[0] sign in2[0]; \ + } + +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define ARITHMETIC_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2, *out; \ + float16 *d_in1, *d_in2, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, size); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(out[0]), v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#define COMPOUND_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + in1 = (float16*)malloc(size); \ + in2 = 
(float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2); \ + cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(in1[0]), v_out); \ + free(in1); \ + free(in2); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } + +#define COMPARISON_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, bool v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + bool *out, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, 1); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (bool*)malloc(1); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(out[0], v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } #ifdef PADDLE_CUDA_FP16 -TEST(float16, conversion_gpu) { - LOG(INFO) << "GPU tests started"; +namespace paddle { - // Conversion to and from cuda half - float16 v1 = half(float16(1.0f)); - EXPECT_EQ(v1.x, 0x3c00); +ARITHMETIC_KERNEL(Add, +) +ARITHMETIC_KERNEL(Sub, -) +ARITHMETIC_KERNEL(Mul, *) +ARITHMETIC_KERNEL(Div, /) - // Conversion to and from Eigen::half - float16 v2 = Eigen::half(float16(0.5f)); - EXPECT_EQ(v2.x, 0x3800); +ARITHMETIC_KERNEL_LAUNCH(Add) +ARITHMETIC_KERNEL_LAUNCH(Sub) +ARITHMETIC_KERNEL_LAUNCH(Mul) +ARITHMETIC_KERNEL_LAUNCH(Div) - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); +// Negative sign kernel +__global__ void Neg(float16* in) { in[0] = -in[0]; } - // Conversion from double +void TestNeg(float v_in, float v_out) { + LOG(INFO) << "Test Neg on GPU!"; + float16 *in, *d_in; + int size = sizeof(float16); + cudaMalloc((void**)&d_in, size); + in = (float16*)malloc(size); + in[0] = float16(v_in); + cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); + Neg<<<1, 1>>>(d_in); + cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); + EXPECT_EQ(float(in[0]), v_out); + free(in); + cudaFree(d_in); +} - // Conversion from int +COMPOUND_KERNEL(AddAssign, +=) +COMPOUND_KERNEL(SubAssign, -=) +COMPOUND_KERNEL(MulAssign, *=) +COMPOUND_KERNEL(DivAssign, /=) - // Conversion from bool +COMPOUND_KERNEL_LAUNCH(AddAssign) +COMPOUND_KERNEL_LAUNCH(SubAssign) +COMPOUND_KERNEL_LAUNCH(MulAssign) +COMPOUND_KERNEL_LAUNCH(DivAssign) + +COMPARISON_KERNEL(Equal, ==) +COMPARISON_KERNEL(NotEqual, !=) +COMPARISON_KERNEL(Less, <) +COMPARISON_KERNEL(LessEqual, <=) +COMPARISON_KERNEL(Greater, >) +COMPARISON_KERNEL(GreaterEqual, >=) + +COMPARISON_KERNEL_LAUNCH(Equal) +COMPARISON_KERNEL_LAUNCH(NotEqual) +COMPARISON_KERNEL_LAUNCH(Less) +COMPARISON_KERNEL_LAUNCH(LessEqual) +COMPARISON_KERNEL_LAUNCH(Greater) +COMPARISON_KERNEL_LAUNCH(GreaterEqual) + +TEST(float16, arithmetic_on_gpu) { + TestAdd(1, 2, 3); + TestSub(2, 1, 1); + TestMul(2, 3, 6); + TestDiv(6, 2, 3); + TestNeg(1, -1); } 
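For readers expanding the macros in their head, the Add instantiations above generate ordinary CUDA code along the lines of the following sketch (an illustration only, not text from the patch):

    // From ARITHMETIC_KERNEL(Add, +): a one-element kernel that applies the
    // operator to device-side float16 values.
    __global__ void Add(const float16* in1, const float16* in2, float16* out) {
      out[0] = in1[0] + in2[0];
    }

    // From ARITHMETIC_KERNEL_LAUNCH(Add): a host helper that copies two
    // float16 inputs to the GPU, launches Add<<<1, 1>>>, copies the result
    // back, and expects it to convert to the given float.
    void TestAdd(float v_in1, float v_in2, float v_out);

TestAdd(1, 2, 3) therefore exercises the device-side float16 operator+ (which the header maps to __hadd on compute capability 5.3 and above) and checks that 1 + 2 comes back as 3.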
-#endif -TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, compound_on_gpu) { + TestAddAssign(1, 2, 3); + TestSubAssign(2, 1, 1); + TestMulAssign(2, 3, 6); + TestDivAssign(6, 2, 3); +} -TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparision_on_gpu) { + TestEqual(1, 1, true); + TestEqual(1, 2, false); + TestNotEqual(2, 3, true); + TestNotEqual(2, 2, false); + TestLess(3, 4, true); + TestLess(3, 3, false); + TestLessEqual(3, 3, true); + TestLessEqual(3, 2, false); + TestGreater(4, 3, true); + TestGreater(4, 4, false); + TestGreaterEqual(4, 4, true); + TestGreaterEqual(4, 5, false); +} + +TEST(float16, conversion_on_gpu) { + // Explicit conversion to and from cuda half + EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + + // Implicit conversion to and from cuda half + half tmp = float16(1.0f); + float16 val = tmp; + EXPECT_EQ(val.x, 0x3c00); + + // Assignment operator + float16 v_assign; + v_assign = tmp; + EXPECT_EQ(v_assign.x, 0x3c00); +} } // namespace paddle +#endif From d646e4768fc0049e172f59f8786d9aeeec50491e Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 00:33:27 -0800 Subject: [PATCH 12/16] fix cmake --- paddle/math/tests/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index c131544515..215bac1271 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,11 +18,11 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) - nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) + nv_test(test_float16_gpu SRCS test_float16.cu) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -30,8 +30,8 @@ else() add_unittest(test_lazyAssign test_lazyAssign.cu) endif(WITH_GPU) -cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +cc_test(test_float16 SRCS test_float16.cpp) From 19e5c24f00fac22da84387510e94596fb577637b Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 17:23:04 -0800 Subject: [PATCH 13/16] fix bug --- paddle/math/float16.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index a1c341113f..3b22174148 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -15,8 +15,6 @@ limitations under the License. */ #pragma once #include -#include -#include #include #include "unsupported/Eigen/CXX11/Tensor" @@ -117,7 +115,8 @@ struct PADDLE_ALIGN(2) float16 { // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. 
PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - x = *reinterpret_cast(&h); + float16_t tmp = h; + x = *reinterpret_cast(&tmp); } #endif @@ -197,7 +196,8 @@ struct PADDLE_ALIGN(2) float16 { #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - x = *reinterpret_cast(&rhs); + float16_t tmp = rhs; + x = *reinterpret_cast(&tmp); return *this; } #endif @@ -460,23 +460,37 @@ __host__ inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); +#else + return float(a) < float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator<=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +#else + return float(a) <= float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +#else + return float(a) > float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -} +#else + return float(a) >= float(b); #endif // PADDLE_NEON_64 +} #else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { From a5feb771592d1bd7340ff7132518d6c52829b8e7 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 27 Nov 2017 17:12:21 -0800 Subject: [PATCH 14/16] address pr comment --- paddle/math/float16.h | 839 +++++++++++++++++------------ paddle/math/tests/test_float16.cpp | 2 + 2 files changed, 482 insertions(+), 359 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3b22174148..65c0489e1f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -16,9 +16,14 @@ limitations under the License. */ #include +#ifdef PADDLE_WITH_CUDA #include +#endif // PADDLE_WITH_CUDA + #include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/platform/hostdevice.h" + #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #else @@ -31,25 +36,12 @@ limitations under the License. */ #define PADDLE_CLANG_VER 0 #endif // __clang__ -#ifdef __CUDACC__ -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 #define PADDLE_CUDA_FP16 #include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDACC__ - -#ifdef __arm__ -#define PADDLE_ARM_32 #endif -#ifdef __aarch64__ -#define PADDLE_ARM_64 -#endif - -#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64) +#if defined(__arm__) || defined(__aarch64__) #define PADDLE_ARM #endif @@ -58,19 +50,12 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) -#define PADDLE_NEON_32 -#endif - -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64) -#define PADDLE_NEON_64 +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) +#define PADDLE_WITH_NATIVE_FP16 #endif -#ifdef PADDLE_ARM -#ifdef __F16C__ -#undef __F16C__ -#endif // __F16C__ -#else +#ifndef PADDLE_ARM #include #endif // PADDLE_ARM @@ -78,27 +63,20 @@ limitations under the License. 
*/ namespace paddle { -struct float16; - -namespace fp16_impl { -// Convert from float to half precision in round-to-nearest-even mode -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); -PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -} // namespace fp16_impl - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible // with CUDA half, ARM float16_t, and Eigen::half data types. struct PADDLE_ALIGN(2) float16 { +public: uint16_t x; - PADDLE_HOSTDEVICE inline float16() : x(0) {} + HOSTDEVICE inline float16() : x(0) {} - PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half& h) { + HOSTDEVICE inline explicit float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -107,78 +85,64 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} + HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) +#ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - float16_t tmp = h; - x = *reinterpret_cast(&tmp); + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - - PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); - PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(PADDLE_NEON) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); - PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); - PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; - PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; +#endif } - PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; - } + HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} - PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + HOSTDEVICE inline float16& operator=(const float16& rhs) { x = rhs.x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { + HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -188,87 +152,75 @@ struct PADDLE_ALIGN(2) float16 { } #endif - PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - float16_t tmp = rhs; - x = *reinterpret_cast(&tmp); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif - PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + HOSTDEVICE inline float16& operator=(bool b) { x = b ? 0x3c00 : 0; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); 
- x = res.x; + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() const { + HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -281,186 +233,504 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator Eigen::half() const { + HOSTDEVICE inline explicit operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline operator float16_t() const { - float16 h = *this; - return *reinterpret_cast(&h); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() const { - return (x & 0x7fff) != 0; - } + HOSTDEVICE inline explicit operator float() const { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_NEON) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); - PADDLE_HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(float(*this)); } - 
PADDLE_HOSTDEVICE inline explicit operator float() const { - return fp16_impl::half_to_float(*this); + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator double() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator double() const { + return static_cast(float(*this)); } + +private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; }; -// Arithmetic operators -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 -__device__ inline float16 operator+(const float16& a, const float16& b) { +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. 
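To make the comment above concrete: once these overloads are in scope (for example, inside the paddle namespace or pulled in with a using-declaration), device code written directly against the half type, such as the hypothetical kernel below (a sketch for illustration, not part of the patch), builds the same way under CUDA 7.5/8.0 as it does with the operators that CUDA 9.0 ships natively:

    // Element-wise y = a * x + y on half data, written with plain operators
    // instead of explicit __hmul/__hadd intrinsic calls.
    __global__ void axpy_fp16(int n, half a, const half* x, half* y) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        y[i] = a * x[i] + y[i];
      }
    }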
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { + return __hadd(a, b); +} + +DEVICE inline half operator-(const half& a, const half& b) { + return __hsub(a, b); +} + +DEVICE inline half operator*(const half& a, const half& b) { + return __hmul(a, b); +} + +DEVICE inline half operator/(const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} + +DEVICE inline half operator-(const half& a) { return __hneg(a); } + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { + return __heq(a, b); +} + +DEVICE inline bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} + +DEVICE inline bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} + +DEVICE inline bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} + +DEVICE inline bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} + +DEVICE inline bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +/* +DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(half(a), half(b))); } -__device__ inline float16 operator-(const float16& a, const float16& b) { +DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(half(a), half(b))); } -__device__ inline float16 operator*(const float16& a, const float16& b) { +DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(half(a), half(b))); } -__device__ inline float16 operator/(const float16& a, const float16& b) { - // TODO(kexinzhao): check the cuda version that starts to support __hdiv +DEVICE inline float16 operator/(const float16& a, const float16& b) { float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); } -__device__ inline float16 operator-(const float16& a) { +DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(half(a))); } -__device__ inline float16& operator+=(float16& a, const float16& b) { +DEVICE inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__device__ inline float16& operator-=(float16& a, const float16& b) { +DEVICE inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__device__ inline float16& operator*=(float16& a, const float16& b) { +DEVICE inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__device__ inline float16& operator/=(float16& a, const float16& b) { +DEVICE inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__device__ inline bool operator==(const float16& a, const float16& b) { +DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(half(a), half(b)); } -__device__ inline bool operator!=(const float16& a, const float16& b) { +DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(half(a), half(b)); } -__device__ inline bool operator<(const float16& a, const float16& b) { +DEVICE inline 
bool operator<(const float16& a, const float16& b) { return __hlt(half(a), half(b)); } -__device__ inline bool operator<=(const float16& a, const float16& b) { +DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(half(a), half(b)); } -__device__ inline bool operator>(const float16& a, const float16& b) { +DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(half(a), half(b)); } -__device__ inline bool operator>=(const float16& a, const float16& b) { +DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(half(a), half(b)); } +*/ + +// Arithmetic operators on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} -// On ARMv8.2-A CPU -#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) -__host__ inline float16 operator+(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const 
float16& b) { + return !(a == b); +} + +HOST inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +/* +HOST inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a, const float16& b) { return float16(vsubh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator*(const float16& a, const float16& b) { +HOST inline float16 operator*(const float16& a, const float16& b) { return float16(vmulh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator/(const float16& a, const float16& b) { +HOST inline float16 operator/(const float16& a, const float16& b) { return float16(vdivh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a) { +HOST inline float16 operator-(const float16& a) { return float16(vnegh_f16(float16_t(a))); } -__host__ inline float16& operator+=(float16& a, const float16& b) { +HOST inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__host__ inline float16& operator-=(float16& a, const float16& b) { +HOST inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__host__ inline float16& operator*=(float16& a, const float16& b) { +HOST inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__host__ inline float16& operator/=(float16& a, const float16& b) { +HOST inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__host__ inline bool operator==(const float16& a, const float16& b) { +HOST inline bool operator==(const float16& a, const float16& b) { return static_cast(vceqh_f16(float16_t(a), float16_t(b))); } -__host__ inline bool operator!=(const float16& a, const float16& b) { +HOST inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -__host__ inline bool operator<(const float16& a, const float16& b) { +HOST inline 
bool operator<(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); #else @@ -468,7 +738,7 @@ __host__ inline bool operator<(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator<=(const float16& a, const float16& b) { +HOST inline bool operator<=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); #else @@ -476,7 +746,7 @@ __host__ inline bool operator<=(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>(const float16& a, const float16& b) { +HOST inline bool operator>(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); #else @@ -484,231 +754,82 @@ __host__ inline bool operator>(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>=(const float16& a, const float16& b) { +HOST inline bool operator>=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); #else return float(a) >= float(b); #endif // PADDLE_NEON_64 } +*/ -#else // Software emulation on other cpu -PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { +// Arithmetic operators, software emulated on other CPU +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(float(a) + float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(float(a) - float(b)); } -PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(float(a) * float(b)); } -PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(float(a) / float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { +HOSTDEVICE inline float16 operator-(const float16& a) { float16 res; res.x = a.x ^ 0x8000; return res; } -PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { a = float16(float(a) + float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { a = float16(float(a) - float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { a = float16(float(a) * float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { a = float16(float(a) / float(b)); return a; } -PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { return float(a) == float(b); } -PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { return float(a) != float(b); } -PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<(const 
float16& a, const float16& b) { return float(a) < float(b); } -PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { return float(a) <= float(b); } -PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { return float(a) > float(b); } -PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } #endif - -namespace fp16_impl { - -union Bits { - float f; - int32_t si; - uint32_t ui; -}; - -const int shift = 13; -const int shiftSign = 16; - -const int32_t infN = 0x7F800000; -const int32_t maxN = 0x477FE000; // max flt16 as flt32 -const int32_t minN = 0x38800000; // min flt16 normal as flt32 -const int32_t sigN = 0x80000000; // sign bit - -constexpr int32_t infC = infN >> shift; -constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 -constexpr int32_t maxC = maxN >> shift; -constexpr int32_t minC = minN >> shift; -constexpr int32_t sigC = sigN >> shiftSign; - -const int32_t mulN = 0x52000000; // (1 << 23) / minN -const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -const int32_t norC = 0x00400; // min flt32 normal downshifted - -constexpr int32_t maxD = infC - maxC - 1; -constexpr int32_t minD = minC - subC - 1; - -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = __float2half(f); - return *reinterpret_cast(&tmp); - -#elif defined(PADDLE_NEON_64) - float16 res; - asm volatile( - "ld1 {v0.s}[0], [%[float_ptr]]\n" - "fcvt h0, s0\n" - "st1 {v0.h}[0], [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(PADDLE_NEON_32) - float16 res; - asm volatile( - "vld1.32 {d0[0]}, [%[float_ptr]]\n" - "vcvt.f16.f32 d0, q0\n" - "vst1.16 {d0[0]}, [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "d0"); - return res; - -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = f; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - float16 res; - res.x = v.ui | sign; - return res; - -#endif -} - -PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = *reinterpret_cast(&h); - return __half2float(tmp); - -#elif defined(PADDLE_NEON_64) - float res; - asm volatile( - "ld1 {v0.h}[0], [%[half_ptr]]\n" - "fcvt s0, h0\n" - "st1 {v0.s}[0], [%[float_ptr]]\n" - : // outputs - : // inputs - [half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - 
return res; - -#elif defined(PADDLE_NEON_32) - float res; - asm volatile( - "vld1.16 {d0[0]}, [%[half_ptr]]\n" - "vcvt.f32.f16 q0, d0\n" - "vst1.32 {d0[0]}, [%[float_ptr]]\n" - : // outputs - : // inputs - [half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v; - v.ui = h.x; - int32_t sign = v.si & sigC; - v.si ^= sign; - sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); - Bits s; - s.si = mulC; - s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - -#endif -} - -} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8c74bcc039..f5541d8f0f 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -55,10 +55,12 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(false).x, 0x0000); // Implicit conversion to and from Eigen::half + /* Eigen::half tmp = float16(1.0f); float16 v_conv = tmp; EXPECT_EQ(tmp.x, 0x3c00); EXPECT_EQ(v_conv.x, 0x3c00); + */ // Default constructor float16 v_def; From 41bd1f9115c4cb8a9a9afcc656b6d0f00d9b1cb5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 28 Nov 2017 17:09:12 -0800 Subject: [PATCH 15/16] fix gpu test, clean code and add cmake --- CMakeLists.txt | 1 + cmake/configure.cmake | 5 + paddle/math/float16.h | 217 ++++++++--------------------- paddle/math/tests/test_float16.cpp | 8 -- paddle/math/tests/test_float16.cu | 90 ++++++------ 5 files changed, 109 insertions(+), 212 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3582a1bc..a2bb5d73bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 24ddb24399..2c202707ff 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,11 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 65c0489e1f..778b48bce8 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include #ifdef PADDLE_WITH_CUDA #include @@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 { public: uint16_t x; + // Constructors HOSTDEVICE inline float16() : x(0) {} HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -89,8 +90,7 @@ public: #ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h, - // which is included in arm_neon.h. 
+ // float16_t is an alias for __fp16 HOSTDEVICE inline explicit float16(const float16_t& h) { x = *reinterpret_cast(&h); } @@ -141,6 +141,7 @@ public: return *this; } +// Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 @@ -219,6 +220,7 @@ public: return *this; } +// Conversion opertors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 @@ -353,27 +355,54 @@ private: // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 + DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); +#else + float res = float(float16(a)) + float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hsub(a, b); +#else + float res = float(float16(a)) - float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hmul(a, b); +#else + float res = float(float16(a)) * float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); +#else + float res = float(float16(a)) / float(float16(b)); + return half(float16(res)); +#endif } -DEVICE inline half operator-(const half& a) { return __hneg(a); } +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -float(float16(a)); + return half(float16(res)); +#endif +} DEVICE inline half& operator+=(half& a, const half& b) { a = a + b; @@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) { } DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __heq(a, b); +#else + return float(float16(a)) == float(float16(b)); +#endif } DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hne(a, b); +#else + return float(float16(a)) != float(float16(b)); +#endif } DEVICE inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hlt(a, b); +#else + return float(float16(a)) < float(float16(b)); +#endif } DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hle(a, b); +#else + return float(float16(a)) <= float(float16(b)); +#endif } DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hgt(a, b); +#else + return float(float16(a)) > float(float16(b)); +#endif } DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hge(a, b); +#else + return float(float16(a)) >= float(float16(b)); +#endif } -/* -DEVICE inline float16 operator+(const float16& a, const float16& b) { - return 
float16(__hadd(half(a), half(b))); -} - -DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(half(a), half(b))); -} - -DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(half(a), half(b))); -} - -DEVICE inline float16 operator/(const float16& a, const float16& b) { - float num = __half2float(half(a)); - float denom = __half2float(half(b)); - return float16(num / denom); -} - -DEVICE inline float16 operator-(const float16& a) { - return float16(__hneg(half(a))); -} - -DEVICE inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -DEVICE inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -DEVICE inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -DEVICE inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(half(a), half(b)); -} - -DEVICE inline bool operator!=(const float16& a, const float16& b) { - return __hne(half(a), half(b)); -} - -DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(half(a), half(b)); -} - -DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(half(a), half(b)); -} - -DEVICE inline bool operator>(const float16& a, const float16& b) { - return __hgt(half(a), half(b)); -} - -DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(half(a), half(b)); -} -*/ +#endif // PADDLE_CUDA_FP16 // Arithmetic operators on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) +#if defined(PADDLE_WITH_NATIVE_FP16) HOST inline float16 operator+(const float16& a, const float16& b) { float16 res; asm volatile( @@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) { return (res & 0xffff) != 0; } -/* -HOST inline float16 operator+(const float16& a, const float16& b) { - return float16(vaddh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a, const float16& b) { - return float16(vsubh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator*(const float16& a, const float16& b) { - return float16(vmulh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator/(const float16& a, const float16& b) { - return float16(vdivh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a) { - return float16(vnegh_f16(float16_t(a))); -} - -HOST inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -HOST inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -HOST inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -HOST inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -HOST inline bool operator==(const float16& a, const float16& b) { - return static_cast(vceqh_f16(float16_t(a), float16_t(b))); -} - -HOST inline bool operator!=(const float16& a, const float16& b) { - return !(a == b); -} - -HOST inline bool operator<(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vclth_f16(float16_t(a), float16_t(b))); -#else - return float(a) < float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator<=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcleh_f16(float16_t(a), float16_t(b))); 
-#else - return float(a) <= float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgth_f16(float16_t(a), float16_t(b))); -#else - return float(a) > float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -#else - return float(a) >= float(b); -#endif // PADDLE_NEON_64 -} -*/ - // Arithmetic operators, software emulated on other CPU #else HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index f5541d8f0f..74cc55aa37 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(true).x, 0x3c00); EXPECT_EQ(float16(false).x, 0x0000); - // Implicit conversion to and from Eigen::half - /* - Eigen::half tmp = float16(1.0f); - float16 v_conv = tmp; - EXPECT_EQ(tmp.x, 0x3c00); - EXPECT_EQ(v_conv.x, 0x3c00); - */ - // Default constructor float16 v_def; EXPECT_EQ(v_def.x, 0x0000); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 941f266603..4b520feaaf 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -15,41 +15,38 @@ limitations under the License. */ #include "paddle/utils/Logging.h" -#define ARITHMETIC_KERNEL(op_type, sign) \ - __global__ void op_type( \ - const float16* in1, const float16* in2, float16* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, half* out) { \ + out[0] = in1[0] sign in2[0]; \ } -#define COMPOUND_KERNEL(op_type, sign) \ - __global__ void op_type(float16* in1, const float16* in2) { \ - in1[0] sign in2[0]; \ - } +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; } -#define COMPARISON_KERNEL(op_type, sign) \ - __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ } #define ARITHMETIC_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2, *out; \ - float16 *d_in1, *d_in2, *d_out; \ - int size = sizeof(float16); \ + half *in1, *in2, *out; \ + half *d_in1, *d_in2, *d_out; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - out = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(out[0]), v_out); \ + EXPECT_EQ(float(float16(out[0])), v_out); \ free(in1); \ free(in2); \ free(out); \ @@ -61,20 +58,20 @@ limitations under the License. 
*/ #define COMPOUND_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ - int size = sizeof(float16); \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(in1[0]), v_out); \ + EXPECT_EQ(float(float16(in1[0])), v_out); \ free(in1); \ free(in2); \ cudaFree(d_in1); \ @@ -84,18 +81,18 @@ limitations under the License. */ #define COMPARISON_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, bool v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ bool *out, *d_out; \ - int size = sizeof(float16); \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, 1); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ out = (bool*)malloc(1); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ @@ -112,6 +109,7 @@ limitations under the License. 
*/ #ifdef PADDLE_CUDA_FP16 namespace paddle { +#if CUDA_VERSION < 9000 ARITHMETIC_KERNEL(Add, +) ARITHMETIC_KERNEL(Sub, -) ARITHMETIC_KERNEL(Mul, *) @@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul) ARITHMETIC_KERNEL_LAUNCH(Div) // Negative sign kernel -__global__ void Neg(float16* in) { in[0] = -in[0]; } +__global__ void Neg(half* in) { in[0] = -in[0]; } void TestNeg(float v_in, float v_out) { LOG(INFO) << "Test Neg on GPU!"; - float16 *in, *d_in; - int size = sizeof(float16); + half *in, *d_in; + int size = sizeof(half); cudaMalloc((void**)&d_in, size); - in = (float16*)malloc(size); - in[0] = float16(v_in); + in = (half*)malloc(size); + in[0] = half(float16(v_in)); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); Neg<<<1, 1>>>(d_in); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); - EXPECT_EQ(float(in[0]), v_out); + EXPECT_EQ(float(float16(in[0])), v_out); free(in); cudaFree(d_in); } @@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) { TestGreaterEqual(4, 4, true); TestGreaterEqual(4, 5, false); } +#endif // CUDA_VERSION TEST(float16, conversion_on_gpu) { // Explicit conversion to and from cuda half @@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) { EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); - // Implicit conversion to and from cuda half - half tmp = float16(1.0f); - float16 val = tmp; - EXPECT_EQ(val.x, 0x3c00); - // Assignment operator float16 v_assign; - v_assign = tmp; + v_assign = half(float16(1.0f)); EXPECT_EQ(v_assign.x, 0x3c00); } } // namespace paddle -#endif +#endif // PADDLE_CUDA_FP16 From 36df67b17c0057725661f11065c87509a3cc898f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 29 Nov 2017 16:30:55 -0800 Subject: [PATCH 16/16] small fix --- paddle/math/float16.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 778b48bce8..f805cad08b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -735,6 +735,5 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } - #endif } // namespace paddle
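Note: the CPU fallback kept by the patches above (the operators "software emulated on other CPU") means paddle::float16 is usable on a plain x86 host with no CUDA toolkit and no ARM fp16 support. The following is a minimal, illustrative sketch rather than part of the patch series: it assumes the paddle source tree is on the include path and a host-only (non-nvcc) compile; main() and the printf formatting are illustrative additions, while the expected encodings 0x3c00 and 0x4000 follow from the IEEE fp16 format checked in test_float16.cpp.

#include <cstdio>
#include "paddle/math/float16.h"

int main() {
  paddle::float16 a(1.0f);    // 1.0 encodes as 0x3c00
  paddle::float16 b(2.0f);    // 2.0 encodes as 0x4000
  paddle::float16 c = a + b;  // routed through the float-emulated operator+ on generic CPUs
  std::printf("a.x=0x%04x b.x=0x%04x a+b=%.1f\n",
              static_cast<unsigned>(a.x),
              static_cast<unsigned>(b.x),
              static_cast<float>(c));
  std::printf("a < b: %d\n", static_cast<int>(a < b));  // comparisons also fall back to float
  return 0;
}

On a CUDA build the same expressions map to the half intrinsics shown above (__hadd, __hlt, and friends) when __CUDA_ARCH__ >= 530, and to the float round-trip fallbacks added for CUDA versions below 9.0 otherwise; on an ARMv8.2-A build with WITH_ARM_FP16 they map to the inline-assembly operators guarded by PADDLE_WITH_NATIVE_FP16.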