|
|
|
@ -36,11 +36,6 @@ __device__ inline double Sqrt(double x) { return sqrt(x); }
|
|
|
|
|
)";
|
|
|
|
|
|
|
|
|
|
static constexpr char predefined_cuda_functions_fp16[] = R"(
|
|
|
|
|
__device__ inline float Max(float x, float y) { return fmaxf(x, y); }
|
|
|
|
|
__device__ inline float Exp(float x) { return expf(x); }
|
|
|
|
|
__device__ inline float Log(float x) { return logf(x); }
|
|
|
|
|
__device__ inline float Sqrt(float x) { return sqrtf(x); }
|
|
|
|
|
|
|
|
|
|
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
|
|
|
|
|
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
|
|
|
|
|
|
|
|
|
@ -65,6 +60,219 @@ __device__ float __half2float(const __half h) {
|
|
|
|
|
return val;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define __CUDA_FP16_DECL__ __host__ __device__
|
|
|
|
|
/******************************************************************************
|
|
|
|
|
* __half comparison *
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
#define __COMPARISON_OP_HALF_MACRO(name) do {\
|
|
|
|
|
unsigned short val; \
|
|
|
|
|
asm( "{ .reg .pred __$temp3;\n" \
|
|
|
|
|
" setp."#name".f16 __$temp3, %1, %2;\n" \
|
|
|
|
|
" selp.u16 %0, 1, 0, __$temp3;}" \
|
|
|
|
|
: "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
|
|
|
|
|
return val ? true : false; \
|
|
|
|
|
} while(0);
|
|
|
|
|
__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(eq);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(ne);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(le);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(ge);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(lt);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(gt);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(equ);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(neu);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(leu);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(geu);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(ltu);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__COMPARISON_OP_HALF_MACRO(gtu);
|
|
|
|
|
}
|
|
|
|
|
#undef __COMPARISON_OP_HALF_MACRO
|
|
|
|
|
|
|
|
|
|
/******************************************************************************
|
|
|
|
|
* __half arithmetic *
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
#define __BINARY_OP_HALF_MACRO(name) do {\
|
|
|
|
|
__half val; \
|
|
|
|
|
asm( "{"#name".f16 %0,%1,%2;\n}" \
|
|
|
|
|
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
|
|
|
|
|
return val; \
|
|
|
|
|
} while(0);
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(add);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(sub);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(mul);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(add.sat);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(sub.sat);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
|
|
|
|
|
{
|
|
|
|
|
__BINARY_OP_HALF_MACRO(mul.sat);
|
|
|
|
|
}
|
|
|
|
|
#undef __BINARY_OP_HALF_MACRO
|
|
|
|
|
#define __TERNARY_OP_HALF_MACRO(name) do {\
|
|
|
|
|
__half val; \
|
|
|
|
|
asm( "{"#name".f16 %0,%1,%2,%3;\n}" \
|
|
|
|
|
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
|
|
|
|
|
return val; \
|
|
|
|
|
} while(0);
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
|
|
|
|
|
{
|
|
|
|
|
__TERNARY_OP_HALF_MACRO(fma.rn);
|
|
|
|
|
}
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
|
|
|
|
|
{
|
|
|
|
|
__TERNARY_OP_HALF_MACRO(fma.rn.sat);
|
|
|
|
|
}
|
|
|
|
|
#undef __TERNARY_OP_HALF2_MACRO
|
|
|
|
|
__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) {
|
|
|
|
|
__half v, abs, den;
|
|
|
|
|
__HALF_TO_US(den) = 0x008F;
|
|
|
|
|
float fa, fb, fv, rcp;
|
|
|
|
|
|
|
|
|
|
fa = __half2float(a);
|
|
|
|
|
fb = __half2float(b);
|
|
|
|
|
|
|
|
|
|
asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
|
|
|
|
|
|
|
|
|
|
fv = rcp * fa;
|
|
|
|
|
|
|
|
|
|
v = __float2half(fv);
|
|
|
|
|
__HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF);
|
|
|
|
|
if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) {
|
|
|
|
|
float err = __fmaf_rn(-fb, fv, fa);
|
|
|
|
|
fv = __fmaf_rn(rcp, err, fv);
|
|
|
|
|
v = __float2half(fv);
|
|
|
|
|
}
|
|
|
|
|
return v;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Some basic arithmetic operations expected of a builtin */
|
|
|
|
|
__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
|
|
|
|
|
|
|
|
|
|
/* Some basic comparison operations to make it look like a builtin */
|
|
|
|
|
__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
|
|
|
|
|
__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
|
|
|
|
|
|
|
|
|
|
#define __SPEC_CASE(i,r, spc, ulp) \
|
|
|
|
|
"{.reg.b16 spc, ulp, p;\n"\
|
|
|
|
|
" mov.b16 spc,"#spc";\n"\
|
|
|
|
|
" mov.b16 ulp,"#ulp";\n"\
|
|
|
|
|
" set.eq.f16.f16 p,"#i", spc;\n"\
|
|
|
|
|
" fma.rn.f16 "#r",p,ulp,"#r";\n}\n"
|
|
|
|
|
|
|
|
|
|
__CUDA_FP16_DECL__ __half hexp(const __half a) {
|
|
|
|
|
__half val;
|
|
|
|
|
asm("{.reg.b32 f, C; \n"
|
|
|
|
|
" .reg.b16 h,r; \n"
|
|
|
|
|
" mov.b16 h,%1; \n"
|
|
|
|
|
" cvt.f32.f16 f,h; \n"
|
|
|
|
|
" mov.b32 C, 0x3fb8aa3b; \n"
|
|
|
|
|
" mul.f32 f,f,C; \n"
|
|
|
|
|
" ex2.approx.f32 f,f; \n"
|
|
|
|
|
" cvt.rn.f16.f32 r,f; \n"
|
|
|
|
|
__SPEC_CASE(h, r, 0X1F79, 0x9400)
|
|
|
|
|
__SPEC_CASE(h, r, 0X25CF, 0x9400)
|
|
|
|
|
__SPEC_CASE(h, r, 0XC13B, 0x0400)
|
|
|
|
|
__SPEC_CASE(h, r, 0XC1EF, 0x0200)
|
|
|
|
|
" mov.b16 %0,r; \n"
|
|
|
|
|
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
|
|
|
|
return val;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__CUDA_FP16_DECL__ __half hlog(const __half a) {
|
|
|
|
|
__half val;
|
|
|
|
|
asm("{.reg.b32 f, C; \n"
|
|
|
|
|
" .reg.b16 r,h; \n"
|
|
|
|
|
" mov.b16 h,%1; \n"
|
|
|
|
|
" cvt.f32.f16 f,h; \n"
|
|
|
|
|
" lg2.approx.f32 f,f; \n"
|
|
|
|
|
" mov.b32 C, 0x3f317218; \n"
|
|
|
|
|
" mul.f32 f,f,C; \n"
|
|
|
|
|
" cvt.rn.f16.f32 r,f; \n"
|
|
|
|
|
__SPEC_CASE(h, r, 0X160D, 0x9C00)
|
|
|
|
|
__SPEC_CASE(h, r, 0X3BFE, 0x8010)
|
|
|
|
|
__SPEC_CASE(h, r, 0X3C0B, 0x8080)
|
|
|
|
|
__SPEC_CASE(h, r, 0X6051, 0x1C00)
|
|
|
|
|
" mov.b16 %0,r; \n"
|
|
|
|
|
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
|
|
|
|
return val;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define __APPROX_FCAST(fun) do {\
|
|
|
|
|
__half val;\
|
|
|
|
|
asm("{.reg.b32 f; \n"\
|
|
|
|
|
" .reg.b16 r; \n"\
|
|
|
|
|
" mov.b16 r,%1; \n"\
|
|
|
|
|
" cvt.f32.f16 f,r; \n"\
|
|
|
|
|
" "#fun".approx.f32 f,f; \n"\
|
|
|
|
|
" cvt.rn.f16.f32 r,f; \n"\
|
|
|
|
|
" mov.b16 %0,r; \n"\
|
|
|
|
|
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
|
|
|
|
|
return val;\
|
|
|
|
|
} while(0);
|
|
|
|
|
__CUDA_FP16_DECL__ __half hsqrt(const __half a) {
|
|
|
|
|
__APPROX_FCAST(sqrt);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__device__ inline __half Exp(const __half x) { return hexp(x); }
|
|
|
|
|
__device__ inline __half Log(const __half x) { return hlog(x); }
|
|
|
|
|
__device__ inline __half Sqrt(const __half x) { return hsqrt(x); }
|
|
|
|
|
|
|
|
|
|
#undef __HALF_TO_US
|
|
|
|
|
#undef __HALF_TO_CUS
|
|
|
|
|
|
|
|
|
@ -81,7 +289,6 @@ extern "C" __global__ void $func_name($parameters) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
)";
|
|
|
|
|
|
|
|
|
|
} // namespace fusion_group
|
|
|
|
|
} // namespace ir
|
|
|
|
|
} // namespace framework
|
|
|
|
|