From e4733224c9e6c6c2eede669e9cdbf17e7be86501 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Fri, 20 Oct 2017 14:08:51 -0700
Subject: [PATCH 01/16] initial commit for float16

---
 paddle/math/float16.h | 142 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 paddle/math/float16.h

diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000..84e533d1fc
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+#ifdef __arm__
+#define PADDLE_ARM_32
+#endif
+
+#ifdef __aarch64__
+#define PADDLE_ARM_64
+#endif
+
+#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32)
+#define PADDLE_NEON_32
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64)
+#define PADDLE_NEON_64
+#endif
+
+#ifdef __CUDA_ARCH__ // use __CUDACC__ instead
+#define PADDLE_HOSTDEVICE __host__ __device__
+#if CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include
+#endif // CUDA_VERSION >= 7050
+#else
+#define PADDLE_HOSTDEVICE
+#endif // __CUDA_ARCH__
+
+#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM)
+#include
+#else
+#ifdef __F16C__
+#undef __F16C__
+#endif
+#endif
+
+#define PADDLE_ALIGNED(x) __attribute__((aligned(x)))
+
+// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h
+template
+To convert(From f) {
+  return static_cast(f);
+}
+
+namespace paddle {
+
+class float16;
+
+// convert from float to half precision in round-to-nearest-even mode
+float16 float2half_rn(float f);
+float half2float(float16 h);
+
+class float16 {
+public:
+  uint16_t val_;
+
+  PADDLE_HOSTDEVICE inline explicit float16() : x(0) {}
+
+  PADDLE_HOSTDEVICE inline explicit float16(float val) {
+    float16 res = float2half_rn(val);
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(int val) {
+    float16 res = cpu_float2half_rn(static_cast(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(double val) {
+    float16 res = cpu_float2half_rn(static_cast(val));
+    x = res.x;
+  }
+
+  // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
+  // and aligned at least on a 2-byte boundary, which leads to efficient
+  // memory access of float16 struct.
+} PADDLE_ALIGNED(2); + +namespace fp16_impl { + +// Conversion routine adapted from +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +Union Bits { + float f; + int32_t si; + uint32_t ui; +}; + +static const int shift = 13; +static const int shiftSign = 16; + +static const int32_t infN = 0x7F800000; +static const int32_t maxN = 0x477FE000; // max flt16 as flt32 +static const int32_t minN = 0x38800000; // min flt16 normal as flt32 +static const int32_t sigN = 0x80000000; // sign bit + +static constexpr int32_t infC = infN >> shift; +static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 +static constexpr int32_t maxC = maxN >> shift; +static constexpr int32_t minC = minN >> shift; +static constexpr int32_t sigC = sigN >> shiftSign; + +static const int32_t mulN = 0x52000000; //(1 << 23) / minN +static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +static const int32_t norC = 0x00400; // min flt32 normal downshifted + +static constexpr int32_t maxD = infC - maxC - 1; +static constexpr int32_t minD = minC - subC - 1; + +} // namespace half_impl + +} // namespace paddle From a208dd64ae1d7a5662a1bf4728162d8123aa89bf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 30 Oct 2017 17:39:18 -0700 Subject: [PATCH 02/16] add float16 data type --- paddle/math/float16.h | 365 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 320 insertions(+), 45 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84e533d1fc..84fe613d51 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,21 @@ limitations under the License. */ #include #include -#include +#include // seems need to delete it + +#ifdef USE_EIGEN // delete this #if macro +#include "Eigen/src/Core/arch/CUDA/Half.h" +#endif + +#ifdef __CUDACC__ +#define PADDLE_HOSTDEVICE __host__ __device__ +#if CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif // CUDA_VERSION >= 7050 +#else +#define PADDLE_HOSTDEVICE +#endif // __CUDA_ARCH__ #ifdef __arm__ #define PADDLE_ARM_32 @@ -44,15 +58,9 @@ limitations under the License. */ #define PADDLE_NEON_64 #endif -#ifdef __CUDA_ARCH__ // use __CUDACC__ instead -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#include +#endif #if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) #include @@ -62,7 +70,7 @@ limitations under the License. 
*/ #endif #endif -#define PADDLE_ALIGNED(x) __attribute__((aligned(x))) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) // https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h template @@ -72,70 +80,337 @@ To convert(From f) { namespace paddle { -class float16; +struct float16; +namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode -float16 float2half_rn(float f); -float half2float(float16 h); +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); +PADDLE_HOSTDEVICE inline float half_to_float(float16 h); +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); +} // namespace fp16_impl -class float16 { -public: - uint16_t val_; +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half and Eigen::half data types. +struct PADDLE_ALIGN(2) float16 { + uint16_t x; - PADDLE_HOSTDEVICE inline explicit float16() : x(0) {} + // explicit for different types, implicit for half and Eigen::half + + PADDLE_HOSTDEVICE inline float16() {} + + PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16(const half h) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&h)->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 +/* +#ifdef PADDLE_CUDA_FP16 + #if CUDA_VERSION < 9000 + PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} + #else + PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} + PADDLE_HOSTDEVICE inline float16(const half& h) + : x(*reinterpret_cast(&h)) {} + #endif // CUDA_VERSION < 9000 +#endif // PADDLE_CUDA_FP16 +*/ + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 in arm_fp16.h + // which is included in arm_neon.h + PADDLE_HOSTDEVICE inline float16(const float16_t h) { + x = *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = float2half_rn(val); + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + } + + template + PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } + PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + x = rhs.x; + return *this; + } + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&rhs)->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + x = rhs.x; + return *this; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + +/* PADDLE_HOSTDEVICE inline explicit float16(int val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } +*/ + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline operator half() { +#if CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated - // and aligned at least on a 2-byte boundary, which leads to efficient - // memory access of float16 struct. -} PADDLE_ALIGNED(2); +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline operator Eigen::half() { + Eigen::half h; + h.x = x; + return h; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline operator float16_t() { + float16 h = *this; + return *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit operator bool() { + return (x & 0x7fff) != 0; + } + + PADDLE_HOSTDEVICE inline explicit operator int8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator float() { + return fp16_impl::half_to_float(*this); + } + + PADDLE_HOSTDEVICE inline explicit operator double() { + return static_cat(fp16_impl::half_to_float(*this)); + } +}; + +// arithmetic operators +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +__device__ inline float16 operator+(const float16& a, const float16& b) { + return 
float16(__hadd(a, b)); +} + +__device__ inline float16 operator-(const float16& a, const float16& b) { + return __hsub(a, b); +} + +__device__ inline float16 operator*(const float16& a, const float16& b) { + return __hmul(a, b); +} + +#elif // on arm cpu + +#else + +#endif namespace fp16_impl { -// Conversion routine adapted from -// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Union Bits { float f; int32_t si; uint32_t ui; }; -static const int shift = 13; -static const int shiftSign = 16; +const int shift = 13; +const int shiftSign = 16; + +const int32_t infN = 0x7F800000; +const int32_t maxN = 0x477FE000; // max flt16 as flt32 +const int32_t minN = 0x38800000; // min flt16 normal as flt32 +const int32_t sigN = 0x80000000; // sign bit + +constexpr int32_t infC = infN >> shift; +constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 +constexpr int32_t maxC = maxN >> shift; +constexpr int32_t minC = minN >> shift; +constexpr int32_t sigC = sigN >> shiftSign; + +const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +const int32_t norC = 0x00400; // min flt32 normal downshifted + +constexpr int32_t maxD = infC - maxC - 1; +constexpr int32_t minD = minC - subC - 1; + +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(f); + return *reinterpret_cast(&(tmp)); + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + +#elif defined(PADDLE_ARM_64) // test on RPI + float16 res; + asm volatile( + "ld1 {v0.s}[0], [%[float_ptr]]\n" + "FCVT h0, s0\n" + "st1 {v0.h}[0], [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; -static const int32_t infN = 0x7F800000; -static const int32_t maxN = 0x477FE000; // max flt16 as flt32 -static const int32_t minN = 0x38800000; // min flt16 normal as flt32 -static const int32_t sigN = 0x80000000; // sign bit +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = f; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + float16 res; + res.x = v.ui | sign; + return res; -static constexpr int32_t infC = infN >> shift; -static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 -static constexpr int32_t maxC = maxN >> shift; -static constexpr int32_t minC = minN >> shift; -static constexpr int32_t sigC = sigN >> shiftSign; +#endif +} -static const int32_t mulN = 0x52000000; //(1 << 23) / minN -static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -static const int32_t norC = 0x00400; // min flt32 normal downshifted +PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + 
half tmp = *reinterpret_cast(&h); + return __half2float(h); + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + +#elif defined(PADDLE_ARM_64) // test on RPI + float res; + asm volatile( + "ld1 {v0.h}[0], [%[half_ptr]]\n" + "FCVT s0, h0\n" + "st1 {v0.s}[0], [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; -static constexpr int32_t maxD = infC - maxC - 1; -static constexpr int32_t minD = minC - subC - 1; +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif +} + +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { + float16 res; + res.x = x; + return res; +} } // namespace half_impl From 9d8b30596491930c6137e56d7883370bff24d2c8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 7 Nov 2017 13:19:59 -0800 Subject: [PATCH 03/16] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84fe613d51..5fe2854969 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include -#include // seems need to delete it +#include #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -32,7 +32,7 @@ limitations under the License. */ #endif // CUDA_VERSION >= 7050 #else #define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#endif // __CUDACC__ #ifdef __arm__ #define PADDLE_ARM_32 From e877cdb8f930cbcd4112a9224232efd898a780b5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 13 Nov 2017 23:06:07 -0800 Subject: [PATCH 04/16] add float16 arithmetic on arm cpu --- paddle/math/float16.h | 479 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 389 insertions(+), 90 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 5fe2854969..ae7d9754aa 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// need to define PADDLE_ARM_FP16 + #pragma once #include @@ -24,6 +26,18 @@ limitations under the License. */ #include "Eigen/src/Core/arch/CUDA/Half.h" #endif +#ifdef __GNUC__ +#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) +#else +#define PADDLE_GNUC_VER 0 +#endif // __GNUC__ + +#ifdef __clang__ +#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) +#else +#define PADDLE_CLANG_VER 0 +#endif // __clang__ + #ifdef __CUDACC__ #define PADDLE_HOSTDEVICE __host__ __device__ #if CUDA_VERSION >= 7050 @@ -48,6 +62,7 @@ limitations under the License. */ #if defined(__ARM_NEON) || defined(__ARM_NEON__) #define PADDLE_NEON +#include #endif #if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) @@ -58,26 +73,16 @@ limitations under the License. 
*/ #define PADDLE_NEON_64 #endif -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) -#include -#endif - -#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) -#include -#else +#ifdef PADDLE_ARM #ifdef __F16C__ #undef __F16C__ -#endif -#endif +#endif // __F16C__ +#else +#include +#endif // PADDLE_ARM #define PADDLE_ALIGN(x) __attribute__((aligned(x))) -// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h -template -To convert(From f) { - return static_cast(f); -} - namespace paddle { struct float16; @@ -86,13 +91,12 @@ namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); } // namespace fp16_impl // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible -// with CUDA half and Eigen::half data types. +// with CUDA half, ARM float16_t, and Eigen::half data types. struct PADDLE_ALIGN(2) float16 { uint16_t x; @@ -103,7 +107,7 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half h) { + PADDLE_HOSTDEVICE inline float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -111,40 +115,72 @@ struct PADDLE_ALIGN(2) float16 { #endif // CUDA_VERSION >= 9000 } #endif // PADDLE_CUDA_FP16 -/* -#ifdef PADDLE_CUDA_FP16 - #if CUDA_VERSION < 9000 - PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} - #else - PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} - PADDLE_HOSTDEVICE inline float16(const half& h) - : x(*reinterpret_cast(&h)) {} - #endif // CUDA_VERSION < 9000 -#endif // PADDLE_CUDA_FP16 -*/ #ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#ifdef PADDLE_NEON // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h - // which is included in arm_neon.h - PADDLE_HOSTDEVICE inline float16(const float16_t h) { - x = *reinterpret_cast(&h); + // float16_t is an alias for __fp16 in arm_fp16.h, + // which is included in arm_neon.h. + // According to gcc, __fp16 can only be used as an argument to fp16 + // intrinsic defined in arm_neon.h or as a storage type. It cannot + // be used as a formal function argument. + // TODO (kexinzhao): test it on RPI + PADDLE_HOSTDEVICE inline float16(const float16_t* h) { + x = *reinterpret_cast(h); } #endif PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} + PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + PADDLE_HOSTDEVICE inline explicit float16(float val) { float16 res = fp16_impl::float_to_half_rn(val); x = res.x; } - template - PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + PADDLE_HOSTDEVICE inline explicit float16(double val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } @@ -155,7 +191,7 @@ struct PADDLE_ALIGN(2) float16 { } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { + PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -172,27 +208,80 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { - x = *reinterpret_cast(&rhs); +#ifdef PADDLE_NEON + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { + x = *reinterpret_cast(rhs); return *this; } #endif -/* - PADDLE_HOSTDEVICE inline explicit float16(int val) { + PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 
0x3c00 : 0; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; } - PADDLE_HOSTDEVICE inline explicit float16(double val) { + PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(float val) { + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(double val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; } -*/ #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() { + PADDLE_HOSTDEVICE inline operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -206,82 +295,270 @@ struct PADDLE_ALIGN(2) float16 { #endif // PADDLE_CUDA_FP16 #ifdef USE_EIGEN - PADDLE_HOSTDEVICE inline operator Eigen::half() { + PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline operator float16_t() { +#ifdef PADDLE_NEON + // check whether it works or not + PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() { + PADDLE_HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - PADDLE_HOSTDEVICE inline explicit operator int8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE 
inline explicit operator int32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator float() { + PADDLE_HOSTDEVICE inline explicit operator float() const { return fp16_impl::half_to_float(*this); } - PADDLE_HOSTDEVICE inline explicit operator double() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator double() const { + return static_cast(fp16_impl::half_to_float(*this)); } }; // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(a, b)); + return float16(__hadd(half(a), half(b))); } __device__ inline float16 operator-(const float16& a, const float16& b) { - return __hsub(a, b); + return float16(__hsub(half(a), half(b))); } __device__ inline float16 operator*(const float16& a, const float16& b) { - return __hmul(a, b); + return float16(__hmul(half(a), half(b))); } -#elif // on arm cpu +__device__ inline float16 operator/(const float16& a, const float16& b) { + // TODO(kexinzhao): check the cuda version that starts to support __hdiv + // instinsic + float num = __half2float(half(a)); + float denom = __half2float(half(b)); + return float16(num / denom); +} -#else +__device__ inline float16 operator-(const float16& a) { + return float16(__hneg(half(a))); +} + +__device__ inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__device__ inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +__device__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__device__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__device__ inline bool operator==(const float16& a, const float16& b) { + return __heq(half(a), half(b)); +} + +__device__ inline bool operator!=(const float16& a, const float16& b) { + return __hne(half(a), half(b)); +} + +__device__ inline bool operator<(const float16& a, const float16& b) { + return __hlt(half(a), half(b)); +} + +__device__ inline bool operator<=(const float16& a, const float16& b) { + return __hle(half(a), half(b)); +} + +__device__ inline bool operator>(const float16& a, const float16& b) { + return __hgt(half(a), half(b)); +} + +__device__ inline bool operator>=(const float16& a, const float16& b) { + return __hge(half(a), half(b)); +} + +// On ARMv8.2-A CPU +#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ + defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +__host__ inline float16 operator+(const float16& a, const float16& b) { + return float16(vaddh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a, const float16& b) { + 
return float16(vsubh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator*(const float16& a, const float16& b) { + return float16(vmulh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator/(const float16& a, const float16& b) { + return float16(vdivh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a) { + return float16(vnegh_f16(float16_t(a))); +} + +__host__ inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__host__ inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +__host__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__host__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__host__ inline bool operator==(const float16& a, const float16& b) { + return static_cast(vceqh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +// compare only available in NEON_64 +__host__ inline bool operator<(const float16& a, const float16& b) { + return static_cast(vclth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator<=(const float16& a, const float16& b) { + return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>(const float16& a, const float16& b) { + return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>=(const float16& a, const float16& b) { + return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); +} + +#else // software emulation on other cpu +PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(float(a) + float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(float(a) - float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(float(a) * float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(float(a) / float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { + a = float16(float(a) + float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { + a = float16(float(a) - float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { + a = float16(float(a) * float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { + a = float16(float(a) / float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { + return float(a) == float(b); +} + +PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { + return float(a) != float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { + return float(a) < float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { + return float(a) <= float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { + return float(a) > float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { + return float(a) >= float(b); +} #endif @@ -320,16 +597,11 
@@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&(tmp)); -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) // test on RPI float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" - "FCVT h0, s0\n" + "fcvt h0, s0\n" "st1 {v0.h}[0], [%[half_ptr]]\n" : // outputs : // inputs @@ -339,6 +611,25 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) // test on RPI + float16 res; + asm volatile( + "vld1.32 {d0[0]}, [%[float_ptr]]\n" + "vcvt.f16.f32 d0, q0\n" + "vst1.16 {d0[0]}, [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "d0"); + return res; + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + #else // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -367,10 +658,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { half tmp = *reinterpret_cast(&h); return __half2float(h); -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" @@ -384,6 +672,23 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) + float res; + asm volatile( + "vld1.16 {d0[0]}, [%[half_ptr]]\n" + "vcvt.f32.f16 q0, d0\n" + "vst1.32 {d0[0]}, [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + #else // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -406,12 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { - float16 res; - res.x = x; - return res; -} - } // namespace half_impl } // namespace paddle From af37838edf4a3ad3c1f098d4026218c130258ac2 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 15 Nov 2017 22:48:01 -0800 Subject: [PATCH 05/16] add test for float16 --- paddle/math/float16.h | 16 ++++++++-------- paddle/math/tests/CMakeLists.txt | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index ae7d9754aa..e9d4e6737d 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include #include -#include +#define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - // explicit for different types, implicit for half and Eigen::half - PADDLE_HOSTDEVICE inline float16() {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. @@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; @@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) { __device__ inline float16 operator/(const float16& a, const float16& b) { // TODO(kexinzhao): check the cuda version that starts to support __hdiv - // instinsic float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); @@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1; PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = __float2half(f); - return *reinterpret_cast(&(tmp)); + return *reinterpret_cast(&tmp); #elif defined(PADDLE_NEON_64) // test on RPI float16 res; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index d8b7f9e3fc..ab4ac38b3c 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -21,7 +21,7 @@ if(WITH_GPU) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -33,3 +33,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +add_simple_unittest(test_float16) From 4f1aa5bc0ee3c00fa792cfe188fabaab290938b1 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 09:17:09 -0800 Subject: [PATCH 06/16] add test cases --- paddle/math/float16.h | 12 ++--- paddle/math/tests/test_float16.cpp | 78 ++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/float16.h b/paddle/math/float16.h index e9d4e6737d..9c06b423ef 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro -#include "Eigen/src/Core/arch/CUDA/Half.h" +#include "unsupported/Eigen/CXX11/Tensor" #endif #ifdef __GNUC__ @@ -126,7 +126,7 @@ struct PADDLE_ALIGN(2) float16 { // According to gcc, __fp16 can only be used as an argument to fp16 // intrinsic defined in arm_neon.h or as a storage type. It cannot // be used as a formal function argument. - // TODO (kexinzhao): test it on RPI + // TODO(kexinzhao): test it on RPI PADDLE_HOSTDEVICE inline float16(const float16_t* h) { x = *reinterpret_cast(h); } @@ -564,7 +564,7 @@ PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { namespace fp16_impl { -Union Bits { +union Bits { float f; int32_t si; uint32_t ui; @@ -584,7 +584,7 @@ constexpr int32_t maxC = maxN >> shift; constexpr int32_t minC = minN >> shift; constexpr int32_t sigC = sigN >> shiftSign; -const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulN = 0x52000000; // (1 << 23) / minN const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) const int32_t subC = 0x003FF; // max flt32 subnormal downshifted const int32_t norC = 0x00400; // min flt32 normal downshifted @@ -693,7 +693,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Bits v; - v.ui = x; + v.ui = h.x; int32_t sign = v.si & sigC; v.si ^= sign; sign <<= shiftSign; @@ -711,6 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -} // namespace half_impl +} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..79f63d3a80 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +#ifdef PADDLE_CUDA_FP16 +TEST(float16, gpu) { + // Conversion to and from cuda half + float16 v1 = half(float16(1.0f)); + EXPECT_EQ(v1.x, 0x3c00); + + // Conversion to and from Eigen::half + float16 v2 = Eigen::half(float16(0.5f)); + EXPECT_EQ(v2.x, 0x3800); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +#endif + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle From 979d2e0b092a1378290ddae421f8793d00fd0938 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 10:05:30 -0800 Subject: [PATCH 07/16] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 9c06b423ef..3275546e69 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -426,8 +426,8 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ - defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } From 22dfa5fa8aaec63753c73848813e280560a8152f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 14:39:49 -0800 Subject: [PATCH 08/16] fix GPU compiling --- paddle/math/float16.h | 12 ++++++------ paddle/math/tests/CMakeLists.txt | 5 ++++- .../math/tests/{test_float16.cpp => test_float16.cu} | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) rename paddle/math/tests/{test_float16.cpp => test_float16.cu} (98%) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3275546e69..6799a83bd3 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -118,8 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) 
: x(h.x) {} #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. @@ -207,8 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,8 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index ab4ac38b3c..dc06f99090 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -22,15 +22,18 @@ if(WITH_GPU) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) + CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) + link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) + compile_cu_as_cpp(test_float16.cu) + add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) -add_simple_unittest(test_float16) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cu similarity index 98% rename from paddle/math/tests/test_float16.cpp rename to paddle/math/tests/test_float16.cu index 79f63d3a80..40bc54f5b4 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ namespace paddle { #ifdef PADDLE_CUDA_FP16 -TEST(float16, gpu) { +TEST(float16, conversion_gpu) { // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From 080ff0c83200a229fb032cd03d4d900b634b1b02 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 16:28:33 -0800 Subject: [PATCH 09/16] two tests for cpu and gpu separately --- paddle/math/tests/CMakeLists.txt | 6 ++-- paddle/math/tests/test_float16.cpp | 47 ++++++++++++++++++++++++++++++ paddle/math/tests/test_float16.cu | 32 +------------------- 3 files changed, 50 insertions(+), 35 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index dc06f99090..c131544515 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,21 +18,19 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) + nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) - CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) - link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) - compile_cu_as_cpp(test_float16.cu) - add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) +cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..8d4279b413 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 40bc54f5b4..6c0a1c351c 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -39,40 +39,10 @@ TEST(float16, conversion_gpu) { // Conversion from bool } +#endif TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } -#endif - -TEST(float16, conversion_cpu) { - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); - - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); - - // Conversion from double - - // Conversion from int - - // Conversion from bool -} - -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } - -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } } // namespace paddle From 734cac1a53b904c7d3f76fe66cee1b2d19632dcf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Fri, 17 Nov 2017 00:04:58 -0800 Subject: [PATCH 10/16] fix CUDA_VERSION issue --- paddle/math/float16.h | 29 ++++++++++++++++++++++++++++- paddle/math/tests/test_float16.cpp | 2 ++ paddle/math/tests/test_float16.cu | 2 ++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 6799a83bd3..1922192f7b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,6 +20,10 @@ limitations under the License. */ #include #include +#include + +#include "paddle/utils/Logging.h" + #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro @@ -48,6 +52,27 @@ limitations under the License. 
*/ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ +#define STR(x) #x +#define XSTR(x) STR(x) + +#ifndef __CUDACC__ +#pragma message "__CUDACC__ not defined" +#else +#pragma message "__CUDACC__ defined" +#endif + +#ifndef CUDA_VERSION +#pragma message "CUDA_VERSION not defined" +#else +#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) +#endif + +#ifdef __CUDA_ARCH__ +#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) +#else +#pragma message "CUDA ARCH NOT DEFINED!" +#endif + #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -359,6 +384,7 @@ struct PADDLE_ALIGN(2) float16 { // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { + printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -495,6 +521,7 @@ __host__ inline bool operator>=(const float16& a, const float16& b) { #else // software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -656,7 +683,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = *reinterpret_cast(&h); - return __half2float(h); + return __half2float(tmp); #elif defined(PADDLE_NEON_64) float res; diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8d4279b413..1a20d0e925 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -15,6 +15,8 @@ limitations under the License. */ namespace paddle { TEST(float16, conversion_cpu) { + LOG(INFO) << "cpu test started!"; + // Conversion to and from Eigen::half EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 6c0a1c351c..9ca77cf86c 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -16,6 +16,8 @@ namespace paddle { #ifdef PADDLE_CUDA_FP16 TEST(float16, conversion_gpu) { + LOG(INFO) << "GPU tests started"; + // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From 0f4bf1c939cea4bd3c7516eb5a9787b05563cea0 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Sun, 19 Nov 2017 03:00:38 -0800 Subject: [PATCH 11/16] Add GPU device code for testing --- paddle/math/float16.h | 71 ++-------- paddle/math/tests/test_float16.cpp | 102 ++++++++++++-- paddle/math/tests/test_float16.cu | 217 +++++++++++++++++++++++++---- 3 files changed, 296 insertions(+), 94 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 1922192f7b..a1c341113f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// need to define PADDLE_ARM_FP16 - #pragma once #include @@ -21,14 +19,7 @@ limitations under the License. 
*/ #include #include - -#include "paddle/utils/Logging.h" - -#define USE_EIGEN - -#ifdef USE_EIGEN // delete this #if macro #include "unsupported/Eigen/CXX11/Tensor" -#endif #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) @@ -52,27 +43,6 @@ limitations under the License. */ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ -#define STR(x) #x -#define XSTR(x) STR(x) - -#ifndef __CUDACC__ -#pragma message "__CUDACC__ not defined" -#else -#pragma message "__CUDACC__ defined" -#endif - -#ifndef CUDA_VERSION -#pragma message "CUDA_VERSION not defined" -#else -#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) -#endif - -#ifdef __CUDA_ARCH__ -#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) -#else -#pragma message "CUDA ARCH NOT DEFINED!" -#endif - #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -113,7 +83,7 @@ namespace paddle { struct float16; namespace fp16_impl { -// convert from float to half precision in round-to-nearest-even mode +// Convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); } // namespace fp16_impl @@ -125,7 +95,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - PADDLE_HOSTDEVICE inline float16() {} + PADDLE_HOSTDEVICE inline float16() : x(0) {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -139,21 +109,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - // According to gcc, __fp16 can only be used as an argument to fp16 - // intrinsic defined in arm_neon.h or as a storage type. It cannot - // be used as a formal function argument. 
- // TODO(kexinzhao): test it on RPI - PADDLE_HOSTDEVICE inline float16(const float16_t* h) { - x = *reinterpret_cast(h); + PADDLE_HOSTDEVICE inline float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif @@ -225,17 +189,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { - x = *reinterpret_cast(rhs); + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif @@ -319,17 +281,14 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); @@ -381,10 +340,9 @@ struct PADDLE_ALIGN(2) float16 { } }; -// arithmetic operators +// Arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -452,7 +410,7 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ +#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); @@ -502,7 +460,7 @@ __host__ inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -// compare only available in NEON_64 +#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { return static_cast(vclth_f16(float16_t(a), float16_t(b))); } @@ -518,10 +476,10 @@ __host__ inline bool operator>(const float16& a, const float16& b) { __host__ inline bool operator>=(const float16& a, const float16& b) { return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); } +#endif // PADDLE_NEON_64 -#else // software emulation on other cpu +#else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { - LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -624,7 +582,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&tmp); -#elif defined(PADDLE_NEON_64) // test on RPI +#elif defined(PADDLE_NEON_64) float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" @@ -638,7 +596,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; -#elif defined(PADDLE_NEON_32) // test on RPI +#elif defined(PADDLE_NEON_32) float16 res; asm volatile( "vld1.32 {d0[0]}, [%[float_ptr]]\n" @@ -689,7 +647,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" - "FCVT s0, h0\n" + "fcvt s0, h0\n" "st1 {v0.s}[0], [%[float_ptr]]\n" : // outputs : // inputs @@ -739,5 +697,4 @@ PADDLE_HOSTDEVICE 
inline float half_to_float(float16 h) { } } // namespace fp16_impl - } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 1a20d0e925..8c74bcc039 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -9,22 +9,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" +#include + namespace paddle { TEST(float16, conversion_cpu) { - LOG(INFO) << "cpu test started!"; - - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + // Explicit conversion from Eigen::half + EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); // Conversion from float EXPECT_EQ(float16(1.0f).x, 0x3c00); @@ -36,14 +35,91 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(65536.0f).x, 0x7c00); // Conversion from double + EXPECT_EQ(float16(1.0).x, 0x3c00); + EXPECT_EQ(float16(0.5).x, 0x3800); + EXPECT_EQ(float16(0.33333).x, 0x3555); + EXPECT_EQ(float16(0.0).x, 0x0000); + EXPECT_EQ(float16(-0.0).x, 0x8000); + EXPECT_EQ(float16(65504.0).x, 0x7bff); + EXPECT_EQ(float16(65536.0).x, 0x7c00); // Conversion from int + EXPECT_EQ(float16(-1).x, 0xbc00); + EXPECT_EQ(float16(0).x, 0x0000); + EXPECT_EQ(float16(1).x, 0x3c00); + EXPECT_EQ(float16(2).x, 0x4000); + EXPECT_EQ(float16(3).x, 0x4200); // Conversion from bool + EXPECT_EQ(float16(true).x, 0x3c00); + EXPECT_EQ(float16(false).x, 0x0000); + + // Implicit conversion to and from Eigen::half + Eigen::half tmp = float16(1.0f); + float16 v_conv = tmp; + EXPECT_EQ(tmp.x, 0x3c00); + EXPECT_EQ(v_conv.x, 0x3c00); + + // Default constructor + float16 v_def; + EXPECT_EQ(v_def.x, 0x0000); + + // Assignment operator + float16 v_assign; + v_assign = v_def; + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = Eigen::half(1.0f); + EXPECT_EQ(v_assign.x, 0x3c00); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3800); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3555); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbc00); + v_assign = true; + EXPECT_EQ(v_assign.x, 0x3c00); + + // Conversion operator + EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); + EXPECT_EQ(float(float16(0.5f)), 0.5f); + EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(int(float16(-1)), -1); + EXPECT_EQ(bool(float16(true)), true); } -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, arithmetic_cpu) { + EXPECT_EQ(float(float16(1) + float16(1)), 2); + EXPECT_EQ(float(float16(5) + float16(-5)), 0); + EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); + EXPECT_EQ(float(float16(3) - float16(5)), -2); + EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); + 
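  // Note on tolerances: binary16 keeps only 10 explicit mantissa bits, so the
  // relative rounding error is about 2^-11 (roughly three decimal digits),
  // which is why these checks use EXPECT_NEAR with 1e-3/1e-2 bounds instead
  // of exact equality.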
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); + EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(float(-float16(512.0f)), -512.0f); + EXPECT_EQ(float(-float16(-512.0f)), 512.0f); +} -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparison_cpu) { + EXPECT_TRUE(float16(1.0f) == float16(1.0f)); + EXPECT_FALSE(float16(-1.0f) == float16(-0.5f)); + EXPECT_TRUE(float16(1.0f) != float16(0.5f)); + EXPECT_FALSE(float16(-1.0f) != float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) < float16(2.0f)); + EXPECT_FALSE(float16(-1.0f) < float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) <= float16(1.0f)); + EXPECT_TRUE(float16(2.0f) > float16(1.0f)); + EXPECT_FALSE(float16(-2.0f) > float16(-2.0f)); + EXPECT_TRUE(float16(2.0f) >= float16(2.0f)); + + EXPECT_TRUE(float16(0.0f) == float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) <= float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) >= float16(-0.0f)); + EXPECT_FALSE(float16(0.0f) < float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) < float16(0.0f)); + EXPECT_FALSE(float16(0.0f) > float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) > float16(0.0f)); +} } // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 9ca77cf86c..941f266603 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -9,42 +9,211 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" -namespace paddle { +#include + +#include "paddle/utils/Logging.h" + +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type( \ + const float16* in1, const float16* in2, float16* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(float16* in1, const float16* in2) { \ + in1[0] sign in2[0]; \ + } + +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define ARITHMETIC_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2, *out; \ + float16 *d_in1, *d_in2, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, size); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(out[0]), v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#define COMPOUND_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + in1 = (float16*)malloc(size); \ + in2 = 
(float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2); \ + cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(in1[0]), v_out); \ + free(in1); \ + free(in2); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } + +#define COMPARISON_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, bool v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + bool *out, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, 1); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (bool*)malloc(1); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(out[0], v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } #ifdef PADDLE_CUDA_FP16 -TEST(float16, conversion_gpu) { - LOG(INFO) << "GPU tests started"; +namespace paddle { - // Conversion to and from cuda half - float16 v1 = half(float16(1.0f)); - EXPECT_EQ(v1.x, 0x3c00); +ARITHMETIC_KERNEL(Add, +) +ARITHMETIC_KERNEL(Sub, -) +ARITHMETIC_KERNEL(Mul, *) +ARITHMETIC_KERNEL(Div, /) - // Conversion to and from Eigen::half - float16 v2 = Eigen::half(float16(0.5f)); - EXPECT_EQ(v2.x, 0x3800); +ARITHMETIC_KERNEL_LAUNCH(Add) +ARITHMETIC_KERNEL_LAUNCH(Sub) +ARITHMETIC_KERNEL_LAUNCH(Mul) +ARITHMETIC_KERNEL_LAUNCH(Div) - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); +// Negative sign kernel +__global__ void Neg(float16* in) { in[0] = -in[0]; } - // Conversion from double +void TestNeg(float v_in, float v_out) { + LOG(INFO) << "Test Neg on GPU!"; + float16 *in, *d_in; + int size = sizeof(float16); + cudaMalloc((void**)&d_in, size); + in = (float16*)malloc(size); + in[0] = float16(v_in); + cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); + Neg<<<1, 1>>>(d_in); + cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); + EXPECT_EQ(float(in[0]), v_out); + free(in); + cudaFree(d_in); +} - // Conversion from int +COMPOUND_KERNEL(AddAssign, +=) +COMPOUND_KERNEL(SubAssign, -=) +COMPOUND_KERNEL(MulAssign, *=) +COMPOUND_KERNEL(DivAssign, /=) - // Conversion from bool +COMPOUND_KERNEL_LAUNCH(AddAssign) +COMPOUND_KERNEL_LAUNCH(SubAssign) +COMPOUND_KERNEL_LAUNCH(MulAssign) +COMPOUND_KERNEL_LAUNCH(DivAssign) + +COMPARISON_KERNEL(Equal, ==) +COMPARISON_KERNEL(NotEqual, !=) +COMPARISON_KERNEL(Less, <) +COMPARISON_KERNEL(LessEqual, <=) +COMPARISON_KERNEL(Greater, >) +COMPARISON_KERNEL(GreaterEqual, >=) + +COMPARISON_KERNEL_LAUNCH(Equal) +COMPARISON_KERNEL_LAUNCH(NotEqual) +COMPARISON_KERNEL_LAUNCH(Less) +COMPARISON_KERNEL_LAUNCH(LessEqual) +COMPARISON_KERNEL_LAUNCH(Greater) +COMPARISON_KERNEL_LAUNCH(GreaterEqual) + +TEST(float16, arithmetic_on_gpu) { + TestAdd(1, 2, 3); + TestSub(2, 1, 1); + TestMul(2, 3, 6); + TestDiv(6, 2, 3); + TestNeg(1, -1); } 
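For readers expanding the macros in their head, the Add instantiations above generate ordinary CUDA code along the lines of the following sketch (an illustration only, not text from the patch):

    // From ARITHMETIC_KERNEL(Add, +): a one-element kernel that applies the
    // operator to device-side float16 values.
    __global__ void Add(const float16* in1, const float16* in2, float16* out) {
      out[0] = in1[0] + in2[0];
    }

    // From ARITHMETIC_KERNEL_LAUNCH(Add): a host helper that copies two
    // float16 inputs to the GPU, launches Add<<<1, 1>>>, copies the result
    // back, and expects it to convert to the given float.
    void TestAdd(float v_in1, float v_in2, float v_out);

TestAdd(1, 2, 3) therefore exercises the device-side float16 operator+ (which the header maps to __hadd on compute capability 5.3 and above) and checks that 1 + 2 comes back as 3.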
-#endif -TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, compound_on_gpu) { + TestAddAssign(1, 2, 3); + TestSubAssign(2, 1, 1); + TestMulAssign(2, 3, 6); + TestDivAssign(6, 2, 3); +} -TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparision_on_gpu) { + TestEqual(1, 1, true); + TestEqual(1, 2, false); + TestNotEqual(2, 3, true); + TestNotEqual(2, 2, false); + TestLess(3, 4, true); + TestLess(3, 3, false); + TestLessEqual(3, 3, true); + TestLessEqual(3, 2, false); + TestGreater(4, 3, true); + TestGreater(4, 4, false); + TestGreaterEqual(4, 4, true); + TestGreaterEqual(4, 5, false); +} + +TEST(float16, conversion_on_gpu) { + // Explicit conversion to and from cuda half + EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + + // Implicit conversion to and from cuda half + half tmp = float16(1.0f); + float16 val = tmp; + EXPECT_EQ(val.x, 0x3c00); + + // Assignment operator + float16 v_assign; + v_assign = tmp; + EXPECT_EQ(v_assign.x, 0x3c00); +} } // namespace paddle +#endif From d646e4768fc0049e172f59f8786d9aeeec50491e Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 00:33:27 -0800 Subject: [PATCH 12/16] fix cmake --- paddle/math/tests/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index c131544515..215bac1271 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,11 +18,11 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) - nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) + nv_test(test_float16_gpu SRCS test_float16.cu) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -30,8 +30,8 @@ else() add_unittest(test_lazyAssign test_lazyAssign.cu) endif(WITH_GPU) -cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +cc_test(test_float16 SRCS test_float16.cpp) From 19e5c24f00fac22da84387510e94596fb577637b Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 17:23:04 -0800 Subject: [PATCH 13/16] fix bug --- paddle/math/float16.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index a1c341113f..3b22174148 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -15,8 +15,6 @@ limitations under the License. */ #pragma once #include -#include -#include #include #include "unsupported/Eigen/CXX11/Tensor" @@ -117,7 +115,8 @@ struct PADDLE_ALIGN(2) float16 { // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. 
PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - x = *reinterpret_cast(&h); + float16_t tmp = h; + x = *reinterpret_cast(&tmp); } #endif @@ -197,7 +196,8 @@ struct PADDLE_ALIGN(2) float16 { #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - x = *reinterpret_cast(&rhs); + float16_t tmp = rhs; + x = *reinterpret_cast(&tmp); return *this; } #endif @@ -460,23 +460,37 @@ __host__ inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); +#else + return float(a) < float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator<=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +#else + return float(a) <= float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +#else + return float(a) > float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -} +#else + return float(a) >= float(b); #endif // PADDLE_NEON_64 +} #else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { From a5feb771592d1bd7340ff7132518d6c52829b8e7 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 27 Nov 2017 17:12:21 -0800 Subject: [PATCH 14/16] address pr comment --- paddle/math/float16.h | 839 +++++++++++++++++------------ paddle/math/tests/test_float16.cpp | 2 + 2 files changed, 482 insertions(+), 359 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3b22174148..65c0489e1f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -16,9 +16,14 @@ limitations under the License. */ #include +#ifdef PADDLE_WITH_CUDA #include +#endif // PADDLE_WITH_CUDA + #include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/platform/hostdevice.h" + #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #else @@ -31,25 +36,12 @@ limitations under the License. */ #define PADDLE_CLANG_VER 0 #endif // __clang__ -#ifdef __CUDACC__ -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 #define PADDLE_CUDA_FP16 #include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDACC__ - -#ifdef __arm__ -#define PADDLE_ARM_32 #endif -#ifdef __aarch64__ -#define PADDLE_ARM_64 -#endif - -#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64) +#if defined(__arm__) || defined(__aarch64__) #define PADDLE_ARM #endif @@ -58,19 +50,12 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) -#define PADDLE_NEON_32 -#endif - -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64) -#define PADDLE_NEON_64 +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) +#define PADDLE_WITH_NATIVE_FP16 #endif -#ifdef PADDLE_ARM -#ifdef __F16C__ -#undef __F16C__ -#endif // __F16C__ -#else +#ifndef PADDLE_ARM #include #endif // PADDLE_ARM @@ -78,27 +63,20 @@ limitations under the License. 
*/ namespace paddle { -struct float16; - -namespace fp16_impl { -// Convert from float to half precision in round-to-nearest-even mode -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); -PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -} // namespace fp16_impl - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible // with CUDA half, ARM float16_t, and Eigen::half data types. struct PADDLE_ALIGN(2) float16 { +public: uint16_t x; - PADDLE_HOSTDEVICE inline float16() : x(0) {} + HOSTDEVICE inline float16() : x(0) {} - PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half& h) { + HOSTDEVICE inline explicit float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -107,78 +85,64 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} + HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) +#ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - float16_t tmp = h; - x = *reinterpret_cast(&tmp); + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - - PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); - PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(PADDLE_NEON) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); - PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); - PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; - PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; +#endif } - PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; - } + HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} - PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + HOSTDEVICE inline float16& operator=(const float16& rhs) { x = rhs.x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { + HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -188,87 +152,75 @@ struct PADDLE_ALIGN(2) float16 { } #endif - PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - float16_t tmp = rhs; - x = *reinterpret_cast(&tmp); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif - PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + HOSTDEVICE inline float16& operator=(bool b) { x = b ? 0x3c00 : 0; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); 
- x = res.x; + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() const { + HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -281,186 +233,504 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator Eigen::half() const { + HOSTDEVICE inline explicit operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline operator float16_t() const { - float16 h = *this; - return *reinterpret_cast(&h); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() const { - return (x & 0x7fff) != 0; - } + HOSTDEVICE inline explicit operator float() const { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_NEON) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); - PADDLE_HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(float(*this)); } - 
PADDLE_HOSTDEVICE inline explicit operator float() const { - return fp16_impl::half_to_float(*this); + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator double() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator double() const { + return static_cast(float(*this)); } + +private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; }; -// Arithmetic operators -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 -__device__ inline float16 operator+(const float16& a, const float16& b) { +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. 
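To make the comment above concrete: once these overloads are in scope (for example, inside the paddle namespace or pulled in with a using-declaration), device code written directly against the half type, such as the hypothetical kernel below (a sketch for illustration, not part of the patch), builds the same way under CUDA 7.5/8.0 as it does with the operators that CUDA 9.0 ships natively:

    // Element-wise y = a * x + y on half data, written with plain operators
    // instead of explicit __hmul/__hadd intrinsic calls.
    __global__ void axpy_fp16(int n, half a, const half* x, half* y) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        y[i] = a * x[i] + y[i];
      }
    }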
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { + return __hadd(a, b); +} + +DEVICE inline half operator-(const half& a, const half& b) { + return __hsub(a, b); +} + +DEVICE inline half operator*(const half& a, const half& b) { + return __hmul(a, b); +} + +DEVICE inline half operator/(const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} + +DEVICE inline half operator-(const half& a) { return __hneg(a); } + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { + return __heq(a, b); +} + +DEVICE inline bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} + +DEVICE inline bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} + +DEVICE inline bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} + +DEVICE inline bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} + +DEVICE inline bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +/* +DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(half(a), half(b))); } -__device__ inline float16 operator-(const float16& a, const float16& b) { +DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(half(a), half(b))); } -__device__ inline float16 operator*(const float16& a, const float16& b) { +DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(half(a), half(b))); } -__device__ inline float16 operator/(const float16& a, const float16& b) { - // TODO(kexinzhao): check the cuda version that starts to support __hdiv +DEVICE inline float16 operator/(const float16& a, const float16& b) { float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); } -__device__ inline float16 operator-(const float16& a) { +DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(half(a))); } -__device__ inline float16& operator+=(float16& a, const float16& b) { +DEVICE inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__device__ inline float16& operator-=(float16& a, const float16& b) { +DEVICE inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__device__ inline float16& operator*=(float16& a, const float16& b) { +DEVICE inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__device__ inline float16& operator/=(float16& a, const float16& b) { +DEVICE inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__device__ inline bool operator==(const float16& a, const float16& b) { +DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(half(a), half(b)); } -__device__ inline bool operator!=(const float16& a, const float16& b) { +DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(half(a), half(b)); } -__device__ inline bool operator<(const float16& a, const float16& b) { +DEVICE inline 
bool operator<(const float16& a, const float16& b) { return __hlt(half(a), half(b)); } -__device__ inline bool operator<=(const float16& a, const float16& b) { +DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(half(a), half(b)); } -__device__ inline bool operator>(const float16& a, const float16& b) { +DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(half(a), half(b)); } -__device__ inline bool operator>=(const float16& a, const float16& b) { +DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(half(a), half(b)); } +*/ + +// Arithmetic operators on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} -// On ARMv8.2-A CPU -#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) -__host__ inline float16 operator+(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const 
float16& b) { + return !(a == b); +} + +HOST inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +/* +HOST inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a, const float16& b) { return float16(vsubh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator*(const float16& a, const float16& b) { +HOST inline float16 operator*(const float16& a, const float16& b) { return float16(vmulh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator/(const float16& a, const float16& b) { +HOST inline float16 operator/(const float16& a, const float16& b) { return float16(vdivh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a) { +HOST inline float16 operator-(const float16& a) { return float16(vnegh_f16(float16_t(a))); } -__host__ inline float16& operator+=(float16& a, const float16& b) { +HOST inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__host__ inline float16& operator-=(float16& a, const float16& b) { +HOST inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__host__ inline float16& operator*=(float16& a, const float16& b) { +HOST inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__host__ inline float16& operator/=(float16& a, const float16& b) { +HOST inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__host__ inline bool operator==(const float16& a, const float16& b) { +HOST inline bool operator==(const float16& a, const float16& b) { return static_cast(vceqh_f16(float16_t(a), float16_t(b))); } -__host__ inline bool operator!=(const float16& a, const float16& b) { +HOST inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -__host__ inline bool operator<(const float16& a, const float16& b) { +HOST inline 
bool operator<(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); #else @@ -468,7 +738,7 @@ __host__ inline bool operator<(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator<=(const float16& a, const float16& b) { +HOST inline bool operator<=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); #else @@ -476,7 +746,7 @@ __host__ inline bool operator<=(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>(const float16& a, const float16& b) { +HOST inline bool operator>(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); #else @@ -484,231 +754,82 @@ __host__ inline bool operator>(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>=(const float16& a, const float16& b) { +HOST inline bool operator>=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); #else return float(a) >= float(b); #endif // PADDLE_NEON_64 } +*/ -#else // Software emulation on other cpu -PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { +// Arithmetic operators, software emulated on other CPU +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(float(a) + float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(float(a) - float(b)); } -PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(float(a) * float(b)); } -PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(float(a) / float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { +HOSTDEVICE inline float16 operator-(const float16& a) { float16 res; res.x = a.x ^ 0x8000; return res; } -PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { a = float16(float(a) + float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { a = float16(float(a) - float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { a = float16(float(a) * float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { a = float16(float(a) / float(b)); return a; } -PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { return float(a) == float(b); } -PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { return float(a) != float(b); } -PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<(const 
float16& a, const float16& b) { return float(a) < float(b); } -PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { return float(a) <= float(b); } -PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { return float(a) > float(b); } -PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } #endif - -namespace fp16_impl { - -union Bits { - float f; - int32_t si; - uint32_t ui; -}; - -const int shift = 13; -const int shiftSign = 16; - -const int32_t infN = 0x7F800000; -const int32_t maxN = 0x477FE000; // max flt16 as flt32 -const int32_t minN = 0x38800000; // min flt16 normal as flt32 -const int32_t sigN = 0x80000000; // sign bit - -constexpr int32_t infC = infN >> shift; -constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 -constexpr int32_t maxC = maxN >> shift; -constexpr int32_t minC = minN >> shift; -constexpr int32_t sigC = sigN >> shiftSign; - -const int32_t mulN = 0x52000000; // (1 << 23) / minN -const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -const int32_t norC = 0x00400; // min flt32 normal downshifted - -constexpr int32_t maxD = infC - maxC - 1; -constexpr int32_t minD = minC - subC - 1; - -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = __float2half(f); - return *reinterpret_cast(&tmp); - -#elif defined(PADDLE_NEON_64) - float16 res; - asm volatile( - "ld1 {v0.s}[0], [%[float_ptr]]\n" - "fcvt h0, s0\n" - "st1 {v0.h}[0], [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(PADDLE_NEON_32) - float16 res; - asm volatile( - "vld1.32 {d0[0]}, [%[float_ptr]]\n" - "vcvt.f16.f32 d0, q0\n" - "vst1.16 {d0[0]}, [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "d0"); - return res; - -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = f; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - float16 res; - res.x = v.ui | sign; - return res; - -#endif -} - -PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = *reinterpret_cast(&h); - return __half2float(tmp); - -#elif defined(PADDLE_NEON_64) - float res; - asm volatile( - "ld1 {v0.h}[0], [%[half_ptr]]\n" - "fcvt s0, h0\n" - "st1 {v0.s}[0], [%[float_ptr]]\n" - : // outputs - : // inputs - [half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - 
return res; - -#elif defined(PADDLE_NEON_32) - float res; - asm volatile( - "vld1.16 {d0[0]}, [%[half_ptr]]\n" - "vcvt.f32.f16 q0, d0\n" - "vst1.32 {d0[0]}, [%[float_ptr]]\n" - : // outputs - : // inputs - [half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v; - v.ui = h.x; - int32_t sign = v.si & sigC; - v.si ^= sign; - sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); - Bits s; - s.si = mulC; - s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - -#endif -} - -} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8c74bcc039..f5541d8f0f 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -55,10 +55,12 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(false).x, 0x0000); // Implicit conversion to and from Eigen::half + /* Eigen::half tmp = float16(1.0f); float16 v_conv = tmp; EXPECT_EQ(tmp.x, 0x3c00); EXPECT_EQ(v_conv.x, 0x3c00); + */ // Default constructor float16 v_def; From 41bd1f9115c4cb8a9a9afcc656b6d0f00d9b1cb5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 28 Nov 2017 17:09:12 -0800 Subject: [PATCH 15/16] fix gpu test, clean code and add cmake --- CMakeLists.txt | 1 + cmake/configure.cmake | 5 + paddle/math/float16.h | 217 ++++++++--------------------- paddle/math/tests/test_float16.cpp | 8 -- paddle/math/tests/test_float16.cu | 90 ++++++------ 5 files changed, 109 insertions(+), 212 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3582a1bc..a2bb5d73bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 24ddb24399..2c202707ff 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,11 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 65c0489e1f..778b48bce8 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include #ifdef PADDLE_WITH_CUDA #include @@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 { public: uint16_t x; + // Constructors HOSTDEVICE inline float16() : x(0) {} HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -89,8 +90,7 @@ public: #ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h, - // which is included in arm_neon.h. 
+ // float16_t is an alias for __fp16 HOSTDEVICE inline explicit float16(const float16_t& h) { x = *reinterpret_cast(&h); } @@ -141,6 +141,7 @@ public: return *this; } +// Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 @@ -219,6 +220,7 @@ public: return *this; } +// Conversion opertors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 @@ -353,27 +355,54 @@ private: // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 + DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); +#else + float res = float(float16(a)) + float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hsub(a, b); +#else + float res = float(float16(a)) - float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hmul(a, b); +#else + float res = float(float16(a)) * float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); +#else + float res = float(float16(a)) / float(float16(b)); + return half(float16(res)); +#endif } -DEVICE inline half operator-(const half& a) { return __hneg(a); } +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -float(float16(a)); + return half(float16(res)); +#endif +} DEVICE inline half& operator+=(half& a, const half& b) { a = a + b; @@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) { } DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __heq(a, b); +#else + return float(float16(a)) == float(float16(b)); +#endif } DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hne(a, b); +#else + return float(float16(a)) != float(float16(b)); +#endif } DEVICE inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hlt(a, b); +#else + return float(float16(a)) < float(float16(b)); +#endif } DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hle(a, b); +#else + return float(float16(a)) <= float(float16(b)); +#endif } DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hgt(a, b); +#else + return float(float16(a)) > float(float16(b)); +#endif } DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hge(a, b); +#else + return float(float16(a)) >= float(float16(b)); +#endif } -/* -DEVICE inline float16 operator+(const float16& a, const float16& b) { - return 
float16(__hadd(half(a), half(b))); -} - -DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(half(a), half(b))); -} - -DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(half(a), half(b))); -} - -DEVICE inline float16 operator/(const float16& a, const float16& b) { - float num = __half2float(half(a)); - float denom = __half2float(half(b)); - return float16(num / denom); -} - -DEVICE inline float16 operator-(const float16& a) { - return float16(__hneg(half(a))); -} - -DEVICE inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -DEVICE inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -DEVICE inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -DEVICE inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(half(a), half(b)); -} - -DEVICE inline bool operator!=(const float16& a, const float16& b) { - return __hne(half(a), half(b)); -} - -DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(half(a), half(b)); -} - -DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(half(a), half(b)); -} - -DEVICE inline bool operator>(const float16& a, const float16& b) { - return __hgt(half(a), half(b)); -} - -DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(half(a), half(b)); -} -*/ +#endif // PADDLE_CUDA_FP16 // Arithmetic operators on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) +#if defined(PADDLE_WITH_NATIVE_FP16) HOST inline float16 operator+(const float16& a, const float16& b) { float16 res; asm volatile( @@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) { return (res & 0xffff) != 0; } -/* -HOST inline float16 operator+(const float16& a, const float16& b) { - return float16(vaddh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a, const float16& b) { - return float16(vsubh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator*(const float16& a, const float16& b) { - return float16(vmulh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator/(const float16& a, const float16& b) { - return float16(vdivh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a) { - return float16(vnegh_f16(float16_t(a))); -} - -HOST inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -HOST inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -HOST inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -HOST inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -HOST inline bool operator==(const float16& a, const float16& b) { - return static_cast(vceqh_f16(float16_t(a), float16_t(b))); -} - -HOST inline bool operator!=(const float16& a, const float16& b) { - return !(a == b); -} - -HOST inline bool operator<(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vclth_f16(float16_t(a), float16_t(b))); -#else - return float(a) < float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator<=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcleh_f16(float16_t(a), float16_t(b))); 
-#else - return float(a) <= float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgth_f16(float16_t(a), float16_t(b))); -#else - return float(a) > float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -#else - return float(a) >= float(b); -#endif // PADDLE_NEON_64 -} -*/ - // Arithmetic operators, software emulated on other CPU #else HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index f5541d8f0f..74cc55aa37 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(true).x, 0x3c00); EXPECT_EQ(float16(false).x, 0x0000); - // Implicit conversion to and from Eigen::half - /* - Eigen::half tmp = float16(1.0f); - float16 v_conv = tmp; - EXPECT_EQ(tmp.x, 0x3c00); - EXPECT_EQ(v_conv.x, 0x3c00); - */ - // Default constructor float16 v_def; EXPECT_EQ(v_def.x, 0x0000); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 941f266603..4b520feaaf 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -15,41 +15,38 @@ limitations under the License. */ #include "paddle/utils/Logging.h" -#define ARITHMETIC_KERNEL(op_type, sign) \ - __global__ void op_type( \ - const float16* in1, const float16* in2, float16* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, half* out) { \ + out[0] = in1[0] sign in2[0]; \ } -#define COMPOUND_KERNEL(op_type, sign) \ - __global__ void op_type(float16* in1, const float16* in2) { \ - in1[0] sign in2[0]; \ - } +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; } -#define COMPARISON_KERNEL(op_type, sign) \ - __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ } #define ARITHMETIC_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2, *out; \ - float16 *d_in1, *d_in2, *d_out; \ - int size = sizeof(float16); \ + half *in1, *in2, *out; \ + half *d_in1, *d_in2, *d_out; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - out = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(out[0]), v_out); \ + EXPECT_EQ(float(float16(out[0])), v_out); \ free(in1); \ free(in2); \ free(out); \ @@ -61,20 +58,20 @@ limitations under the License. 
*/ #define COMPOUND_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ - int size = sizeof(float16); \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(in1[0]), v_out); \ + EXPECT_EQ(float(float16(in1[0])), v_out); \ free(in1); \ free(in2); \ cudaFree(d_in1); \ @@ -84,18 +81,18 @@ limitations under the License. */ #define COMPARISON_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, bool v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ bool *out, *d_out; \ - int size = sizeof(float16); \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, 1); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ out = (bool*)malloc(1); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ @@ -112,6 +109,7 @@ limitations under the License. 
*/ #ifdef PADDLE_CUDA_FP16 namespace paddle { +#if CUDA_VERSION < 9000 ARITHMETIC_KERNEL(Add, +) ARITHMETIC_KERNEL(Sub, -) ARITHMETIC_KERNEL(Mul, *) @@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul) ARITHMETIC_KERNEL_LAUNCH(Div) // Negative sign kernel -__global__ void Neg(float16* in) { in[0] = -in[0]; } +__global__ void Neg(half* in) { in[0] = -in[0]; } void TestNeg(float v_in, float v_out) { LOG(INFO) << "Test Neg on GPU!"; - float16 *in, *d_in; - int size = sizeof(float16); + half *in, *d_in; + int size = sizeof(half); cudaMalloc((void**)&d_in, size); - in = (float16*)malloc(size); - in[0] = float16(v_in); + in = (half*)malloc(size); + in[0] = half(float16(v_in)); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); Neg<<<1, 1>>>(d_in); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); - EXPECT_EQ(float(in[0]), v_out); + EXPECT_EQ(float(float16(in[0])), v_out); free(in); cudaFree(d_in); } @@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) { TestGreaterEqual(4, 4, true); TestGreaterEqual(4, 5, false); } +#endif // CUDA_VERSION TEST(float16, conversion_on_gpu) { // Explicit conversion to and from cuda half @@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) { EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); - // Implicit conversion to and from cuda half - half tmp = float16(1.0f); - float16 val = tmp; - EXPECT_EQ(val.x, 0x3c00); - // Assignment operator float16 v_assign; - v_assign = tmp; + v_assign = half(float16(1.0f)); EXPECT_EQ(v_assign.x, 0x3c00); } } // namespace paddle -#endif +#endif // PADDLE_CUDA_FP16 From 36df67b17c0057725661f11065c87509a3cc898f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 29 Nov 2017 16:30:55 -0800 Subject: [PATCH 16/16] small fix --- paddle/math/float16.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 778b48bce8..f805cad08b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -735,6 +735,5 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } - #endif } // namespace paddle
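Note: the CPU fallback kept by the patches above (the operators "software emulated on other CPU") means paddle::float16 is usable on a plain x86 host with no CUDA toolkit and no ARM fp16 support. The following is a minimal, illustrative sketch rather than part of the patch series: it assumes the paddle source tree is on the include path and a host-only (non-nvcc) compile; main() and the printf formatting are illustrative additions, while the expected encodings 0x3c00 and 0x4000 follow from the IEEE fp16 format checked in test_float16.cpp.

#include <cstdio>
#include "paddle/math/float16.h"

int main() {
  paddle::float16 a(1.0f);    // 1.0 encodes as 0x3c00
  paddle::float16 b(2.0f);    // 2.0 encodes as 0x4000
  paddle::float16 c = a + b;  // routed through the float-emulated operator+ on generic CPUs
  std::printf("a.x=0x%04x b.x=0x%04x a+b=%.1f\n",
              static_cast<unsigned>(a.x),
              static_cast<unsigned>(b.x),
              static_cast<float>(c));
  std::printf("a < b: %d\n", static_cast<int>(a < b));  // comparisons also fall back to float
  return 0;
}

On a CUDA build the same expressions map to the half intrinsics shown above (__hadd, __hlt, and friends) when __CUDA_ARCH__ >= 530, and to the float round-trip fallbacks added for CUDA versions below 9.0 otherwise; on an ARMv8.2-A build with WITH_ARM_FP16 they map to the inline-assembly operators guarded by PADDLE_WITH_NATIVE_FP16.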