-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
/
Copy pathfloat16.jl
105 lines (96 loc) · 3.12 KB
/
float16.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Widen a Float16 to the Float32 that represents the same value.
#
# The half-precision bit pattern is split into its sign (1 bit), biased
# exponent (5 bits, bias 15) and significand (10 bits); each field is then
# re-encoded in the single-precision layout (8 exponent bits with bias 127,
# 23 significand bits).  Every finite Float16, both zeros and both infinities
# convert exactly.  A NaN stays a NaN: its sign and payload are carried over
# and the quiet bit is set in the result.
function convert(::Type{Float32}, val::Float16)
    ival::Uint32 = reinterpret(Uint16, val)
    sign::Uint32 = (ival & 0x8000) >> 15
    exp::Uint32 = (ival & 0x7c00) >> 10
    sig::Uint32 = ival & 0x3ff
    local ret::Uint32
    if exp == 0
        if sig == 0
            # Signed zero: only the sign bit survives.
            sign = sign << 31
            ret = sign | exp | sig
        else
            # Subnormal input.  Find the highest set significand bit; n_bit is
            # the left shift that moves it into the implicit leading-one
            # position, which lowers the unbiased exponent from -14 by n_bit.
            n_bit = 1
            bit = 0x0200
            while (bit & sig) == 0
                n_bit = n_bit + 1
                bit = bit >> 1
            end
            sign = sign << 31
            exp = (-14 - n_bit + 127) << 23
            # Drop the bit that became the implicit one, then align the rest
            # with the top of the 23-bit significand field.
            sig = ((sig & (~bit)) << n_bit) << (23 - 10)
            ret = sign | exp | sig
        end
    elseif exp == 0x1f
        if sig == 0
            # Infinity keeps its sign.
            ret = sign == 0 ? 0x7f800000 : 0xff800000
        else
            # NaN: keep the sign and the payload, and force the quiet bit so
            # the result is always a quiet NaN.
            ret = 0x7fc00000 | (sign << 31) | (sig << (23 - 10))
        end
    else
        # Normal number: re-bias the exponent (15 -> 127) and widen the
        # significand from 10 to 23 bits.
        sign = sign << 31
        exp = (exp - 15 + 127) << 23
        sig = sig << (23 - 10)
        ret = sign | exp | sig
    end
    return reinterpret(Float32, ret)
end
# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
# Lookup tables for the Float32 -> Float16 conversion.  Both are indexed by
# the top nine bits of a Float32 (sign bit followed by the 8-bit biased
# exponent) plus one: entries 1..256 serve positive inputs, 257..512 negative.
#   basetable[k]  - half-precision sign and exponent bits for that class
#   shifttable[k] - right shift applied to the 23-bit Float32 significand
const basetable = Array(Uint16, 512)
const shifttable = Array(Uint8, 512)

for i = 0:255
    # Declared local so these names can never touch a module-level binding.
    local magnitude, nshift
    e = i - 127                           # unbiased Float32 exponent
    if e < -24
        # Very small numbers map to zero
        magnitude = 0x0000
        nshift = 24
    elseif e < -14
        # Small numbers map to denorms
        magnitude = (0x0400>>(-e-14))
        nshift = -e-1
    elseif e <= 15
        # Normal numbers just lose precision
        magnitude = ((e+15)<<10)
        nshift = 13
    elseif e < 128
        # Large numbers map to Infinity
        magnitude = 0x7C00
        nshift = 24
    else
        # Infinity and NaN's stay Infinity and NaN's
        magnitude = 0x7C00
        nshift = 13
    end
    # The negative-sign entry is the positive one with the sign bit set.
    basetable[(i|0x000)+1] = magnitude
    basetable[(i|0x100)+1] = magnitude | 0x8000
    shifttable[(i|0x000)+1] = nshift
    shifttable[(i|0x100)+1] = nshift
end
# Narrow a Float32 to Float16 using the table-driven method cited above.
#
# The sign and exponent bits of `val` select an entry in `basetable` (the
# half-precision sign/exponent bits) and `shifttable` (how far the 23-bit
# significand is shifted right before being added in).  Low significand bits
# are discarded by the shift, so finite values are truncated rather than
# rounded.  Per the tables, magnitudes too large for Float16 become infinity
# and magnitudes too small become zero.
#
# NaN inputs are handled separately: going through the tables, a NaN whose
# set payload bits all lie in the low 13 bits would have them shifted away
# and come out as infinity.  The NaN path keeps the sign and the top ten
# payload bits and always sets the quiet bit, so the result is always a NaN.
function convert(::Type{Float16}, val::Float32)
    f = reinterpret(Uint32, val)
    if (f & 0x7fffffff) > 0x7f800000
        # NaN: exponent all ones and a non-zero significand.
        nanbits = ((f >> 16) & 0x8000) | 0x7e00 | ((f >> 13) & 0x03ff)
        return reinterpret(Float16, uint16(nanbits))
    end
    # Table index: sign bit plus 8 exponent bits, shifted to 1-based indexing.
    i = ((f >> 23) & 0x1ff) + 1
    h = basetable[i] + ((f & 0x007fffff) >> shifttable[i])
    return reinterpret(Float16, uint16(h))
end
# Return `true` when `x` is a NaN, quiet or signaling.
# A half-precision NaN has all five exponent bits set and a non-zero
# significand, so with the sign bit masked off its bit pattern is strictly
# greater than that of infinity (0x7c00).
isnan(x::Float16) = (reinterpret(Uint16, x) & 0x7fff) > 0x7c00
# Return `true` only for positive or negative infinity.
# Infinity is the single pattern with all exponent bits set and a zero
# significand, so every bit except the sign must match 0x7c00 exactly;
# any non-zero significand bit makes the value a NaN instead.
isinf(x::Float16) = (reinterpret(Uint16, x) & 0x7fff) == 0x7c00
# IEEE equality for Float16, computed directly on the bit patterns:
# a NaN is unequal to everything (itself included), +0 and -0 are equal,
# and any other pair is equal exactly when the bit patterns match.
function ==(x::Float16, y::Float16)
    ix = reinterpret(Uint16, x)
    iy = reinterpret(Uint16, y)
    # If either operand is a NaN (quiet or signaling), the OR of the two
    # patterns has all exponent bits set and a non-zero significand, i.e. a
    # magnitude above 0x7c00.  Two non-NaN operands can only reach that range
    # when their patterns differ, in which case `false` is correct as well.
    if ((ix | iy) & 0x7fff) > 0x7c00
        return false
    end
    # Both magnitudes are zero: +0 and -0 compare equal.
    if ((ix | iy) & 0x7fff) == 0x0000
        return true
    end
    return ix == iy
end
# Strict less-than for Float16: both operands are widened to Float32 and the
# Float32 comparison decides the result.
function <(x::Float16, y::Float16)
    lhs = float32(x)
    rhs = float32(y)
    return lhs < rhs
end
# Ordering used for sorting Float16 values: both operands are widened to
# Float32 and the result of `isless` on those Float32 values is returned.
function isless(x::Float16, y::Float16)
    lhs = float32(x)
    rhs = float32(y)
    return isless(lhs, rhs)
end
# Hash a Float16 through its 16-bit pattern.  Any value for which `isnan`
# is true is first replaced by NaN16, so all of those values share one hash.
function hash(x::Float16)
    canonical = isnan(x) ? NaN16 : x
    return hash(reinterpret(Uint16, canonical))
end