From 24dde5dbb42fca28a387f72b7d3f8ef027713df8 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 14 Oct 2020 18:15:03 -0400 Subject: [PATCH] faster hashing by avoiding UB In LLVM (inherited from C), fptosi has undefined behavior if the result does not fit the integer size after rounding down. But by using the same strategy as generic hashing of Real values, we actually can end up with a situation that is faster for the CPU to deal with and avoids the UB. Refs #6624 (36969687fb9bbd936b3adc0baa6cf0a408d2ccb7) Fixes #37800 --- base/abstractset.jl | 4 ++-- base/float.jl | 30 +++++++++++++++++++++--------- base/hashing2.jl | 8 ++++---- test/hashing.jl | 37 +++++++++++++++++++------------------ test/show.jl | 2 +- 5 files changed, 47 insertions(+), 34 deletions(-) diff --git a/base/abstractset.jl b/base/abstractset.jl index 05b5300952822..179b9f7be5d4b 100644 --- a/base/abstractset.jl +++ b/base/abstractset.jl @@ -64,10 +64,10 @@ julia> union!(a, 1:2:8); julia> a Set{Int64} with 5 elements: - 7 + 5 4 + 7 3 - 5 1 ``` """ diff --git a/base/float.jl b/base/float.jl index ec28e4f741f14..cba1d6ba9b522 100644 --- a/base/float.jl +++ b/base/float.jl @@ -460,17 +460,29 @@ Test whether a number is infinite. """ isinf(x::Real) = !isnan(x) & !isfinite(x) -## hashing small, built-in numeric types ## - -hx(a::UInt64, b::Float64, h::UInt) = hash_uint64((3a + reinterpret(UInt64,b)) - h) -const hx_NaN = hx(UInt64(0), NaN, UInt(0 )) - -hash(x::UInt64, h::UInt) = hx(x, Float64(x), h) -hash(x::Int64, h::UInt) = hx(reinterpret(UInt64, abs(x)), Float64(x), h) -hash(x::Float64, h::UInt) = isnan(x) ? 
(hx_NaN ⊻ h) : hx(fptoui(UInt64, abs(x)), x, h) +const hx_NaN = hash_uint64(reinterpret(UInt64, NaN)) +let Tf = Float64, Tu = UInt64, Ti = Int64 + @eval function hash(x::$Tf, h::UInt) + # see comments on trunc and hash(Real, UInt) + if $(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti))) + xi = fptosi($Ti, x) + if isequal(xi, x) + return hash(xi, h) + end + elseif $(Tf(typemin(Tu))) <= x < $(Tf(typemax(Tu))) + xu = fptoui($Tu, x) + if isequal(xu, x) + return hash(xu, h) + end + elseif isnan(x) + return hx_NaN ⊻ h # NaN does not have a stable bit pattern + end + return hash_uint64(bitcast(UInt64, x)) - 3h + end +end -hash(x::Union{Bool,Int8,UInt8,Int16,UInt16,Int32,UInt32}, h::UInt) = hash(Int64(x), h) hash(x::Float32, h::UInt) = hash(Float64(x), h) +hash(x::Float16, h::UInt) = hash(Float64(x), h) """ precision(num::AbstractFloat) diff --git a/base/hashing2.jl b/base/hashing2.jl index f7ea3838aa096..e0fef571e5c90 100644 --- a/base/hashing2.jl +++ b/base/hashing2.jl @@ -2,6 +2,10 @@ ## efficient value-based hashing of integers ## +hash(x::Int64, h::UInt) = hash_uint64(bitcast(UInt64, x)) - 3h +hash(x::UInt64, h::UInt) = hash_uint64(x) - 3h +hash(x::Union{Bool,Int8,UInt8,Int16,UInt16,Int32,UInt32}, h::UInt) = hash(Int64(x), h) + function hash_integer(n::Integer, h::UInt) h ⊻= hash_uint((n % UInt) ⊻ h) n = abs(n) @@ -226,7 +230,3 @@ function hash(x::Rational{<:BitInteger64}, h::UInt) h = hash_integer(num, h) return h end - -## hashing Float16s ## - -hash(x::Float16, h::UInt) = hash(Float64(x), h) diff --git a/test/hashing.jl b/test/hashing.jl index c2b3ed27f6a51..569e0aeb7ade0 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -32,28 +32,29 @@ function coerce(T::Type, x) end end -for T = types[2:end], - x = vals, +for T = types[2:end], x = vals a = coerce(T, x) - @test hash(a,zero(UInt)) == invoke(hash, Tuple{Real, UInt}, a, zero(UInt)) - @test hash(a,one(UInt)) == invoke(hash, Tuple{Real, UInt}, a, one(UInt)) + @test hash(a, zero(UInt)) == invoke(hash, Tuple{Real, UInt}, 
a, zero(UInt)) + @test hash(a, one(UInt)) == invoke(hash, Tuple{Real, UInt}, a, one(UInt)) end -for T = types, - S = types, - x = vals, - a = coerce(T, x), - b = coerce(S, x) - #println("$(typeof(a)) $a") - #println("$(typeof(b)) $b") - @test isequal(a,b) == (hash(a)==hash(b)) - # for y=vals - # println("T=$T; S=$S; x=$x; y=$y") - # c = convert(T,x//y) - # d = convert(S,x//y) - # @test !isequal(a,b) || hash(a)==hash(b) - # end +let collides = 0 + for T = types, S = types, x = vals + a = coerce(T, x) + b = coerce(S, x) + eq = hash(a) == hash(b) + #println("$(typeof(a)) $a") + #println("$(typeof(b)) $b") + if isequal(a, b) + @test eq + else + collides += eq + end + end + # each pair of types has one collision for these values + @test collides <= (length(types) - 1)^2 end +@test hash(0.0) != hash(-0.0) # issue #8619 @test hash(nextfloat(2.0^63)) == hash(UInt64(nextfloat(2.0^63))) diff --git a/test/show.jl b/test/show.jl index 046baf89a5d17..c215289767b09 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1608,7 +1608,7 @@ end # issue #27680 @test showstr(Set([(1.0,1.0), (2.0,2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? - "Set([(3.0, 3.0), (2.0, 2.0), (1.0, 1.0)])" : + "Set([(1.0, 1.0), (3.0, 3.0), (2.0, 2.0)])" : "Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])") # issue #27747