Skip to content

Commit

Permalink
Make arrays and ranges hash and compare equal (#16401)
Browse files Browse the repository at this point in the history
* Make vectors and ranges hash and compare equal

When hashing AbstractVectors, first check whether their first elements are equal to a range, and hash them as a such if that's the case. This allows for O(1) hashing of (some) ranges consistent with AbstractArrays, which means they can now compare equal. Types which have a regular range step have to use the new TypeRangeStep trait to enable O(1) hashing rather than the O(N) AbstractArray fallback.

Apply the new trait to date ranges which have a regular step. Add tests for the new behaviors.
  • Loading branch information
nalimilan authored and StefanKarpinski committed Dec 21, 2017
1 parent bd04c13 commit 8872f90
Show file tree
Hide file tree
Showing 28 changed files with 329 additions and 82 deletions.
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,13 @@ This section lists changes that do not have deprecation warnings.
* `eachindex(A, B...)` now requires that all inputs have the same number of elements.
When the chosen indexing is Cartesian, they must have the same axes.

* `AbstractRange` objects are now considered as equal to other `AbstractArray` objects
by `==` and `isequal` if all of their elements are equal ([#16401]).
This has required changing the hashing algorithm: ranges now use an O(N) fallback
instead of a O(1) specialized method unless they define the `Base.TypeRangeStep`
trait; see its documentation for details. Types which support subtraction (operator
`-`) must now implement `widen` for hashing to work inside heterogeneous arrays.

Library improvements
--------------------

Expand Down
96 changes: 83 additions & 13 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1547,9 +1547,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
if axes(A) != axes(B)
return false
end
if isa(A,AbstractRange) != isa(B,AbstractRange)
return false
end
for (a, b) in zip(A, B)
if !isequal(a, b)
return false
Expand All @@ -1570,9 +1567,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
if axes(A) != axes(B)
return false
end
if isa(A,AbstractRange) != isa(B,AbstractRange)
return false
end
anymissing = false
for (a, b) in zip(A, B)
eq = (a == b)
Expand Down Expand Up @@ -1944,19 +1938,92 @@ unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)

const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
function hash(a::AbstractArray, h::UInt)
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087

# Efficient O(1) method equivalent to the O(N) AbstractArray fallback,
# which works only for ranges with regular step (RangeStepRegular)
function hash_range(r::AbstractRange, h::UInt)
h += hashaa_seed
h += hash(size(r))

length(r) == 0 && return h
h = hash(first(r), h)
length(r) == 1 && return h
length(r) == 2 && return hash(last(r), h)

h += hashr_seed
h = hash(length(r), h)
h = hash(last(r), h)
end

function hash(a::AbstractArray{T}, h::UInt) where T
# O(1) hashing for types with regular step
if isa(a, AbstractRange) && isa(TypeRangeStep(a), RangeStepRegular)
return hash_range(a, h)
end

h += hashaa_seed
h += hash(size(a))

state = start(a)
done(a, state) && return h
x1, state = next(a, state)
done(a, state) && return hash(x1, h)
x2, state = next(a, state)
done(a, state) && return hash(x2, h)
done(a, state) && return hash(x2, hash(x1, h))

# Check whether the array is equal to a range, and hash the elements
# at the beginning of the array as such as long as they match this assumption
# This needs to be done even for non-RangeStepRegular types since they may still be equal
# to RangeStepRegular values (e.g. 1.0:3.0 == 1:3)
if isa(a, AbstractVector) && applicable(-, x2, x1)
n = 1
local step, laststep, laststate
while true
# If overflow happens with entries of the same type, a cannot be equal
# to a range with more than two elements because more extreme values
# cannot be represented. We must still hash the two first values as a
# range since they can always be considered as such (in a wider type)
if isconcrete(T)
try
step = x2 - x1
catch err
isa(err, OverflowError) || rethrow(err)
break
end
# If true, wraparound overflow happened
sign(step) == cmp(x2, x1) || break
else
# widen() is here to ensure no overflow can happen
step = widen(x2) - widen(x1)
end
n > 1 && !isequal(step, laststep) && break
n += 1
x1 = x2
laststep = step
laststate = state
done(a, state) && break
x2, state = next(a, state)
end

x1 = x2
while !done(a, state)
x1 = x2
x2, state = next(a, state)
h = hash(first(a), h)
h += hashr_seed
# Always hash at least the two first elements as a range (even in case of overflow)
if n < 2
h = hash(2, h)
h = hash(x2, h)
done(a, state) && return h
x1 = x2
x2, state = next(a, state)
else
h = hash(n, h)
h = hash(x1, h)
done(a, laststate) && return h
end
end

# Hash elements which do not correspond to a range (if any)
while true
if isequal(x2, x1)
# For repeated elements, use run length encoding
# This allows efficient hashing of sparse arrays
Expand All @@ -1970,7 +2037,10 @@ function hash(a::AbstractArray, h::UInt)
h = hash(runlength, h)
end
h = hash(x1, h)
done(a, state) && break
x1 = x2
x2, state = next(a, state)
end
!isequal(x2, x1) && (h = hash(x2, h))
return h
end
end
1 change: 1 addition & 0 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ in(x::Char, y::Char) = x == y
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) UInt64(h))
widen(::Type{Char}) = Char

-(x::Char, y::Char) = Int(x) - Int(y)
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
Expand Down
13 changes: 3 additions & 10 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ optional second argument `h` is a hash code to be mixed with the result.
New types should implement the 2-argument form, typically by calling the 2-argument `hash`
method recursively in order to mix hashes of the contents with each other (and with `h`).
Typically, any type that implements `hash` should also implement its own `==` (hence
`isequal`) to guarantee the property mentioned above.
`isequal`) to guarantee the property mentioned above. Types supporting subtraction
(operator `-`) should also implement [`widen`](@ref), which is required to hash
values inside heterogeneous arrays.
"""
hash(x::Any) = hash(x, zero(UInt))
hash(w::WeakRef, h::UInt) = hash(w.value, h)
Expand Down Expand Up @@ -73,12 +75,3 @@ else
end

hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))

# hashing ranges by component at worst leads to collisions for very similar ranges
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
function hash(r::AbstractRange, h::UInt)
h += hashr_seed
h = hash(first(r), h)
h = hash(step(r), h)
h = hash(last(r), h)
end
2 changes: 1 addition & 1 deletion base/hashing2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,4 +178,4 @@ function hash(s::Union{String,SubString{String}}, h::UInt)
# note: use pointer(s) here (see #6058).
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), pointer(s), sizeof(s), h % UInt32) + h
end
hash(s::AbstractString, h::UInt) = hash(String(s), h)
hash(s::AbstractString, h::UInt) = hash(String(s), h)
2 changes: 2 additions & 0 deletions base/irrationals.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ isone(::AbstractIrrational) = false

hash(x::Irrational, h::UInt) = 3*object_id(x) - h

widen(::Type{T}) where {T<:Irrational} = T

-(x::AbstractIrrational) = -Float64(x)
for op in Symbol[:+, :-, :*, :/, :^]
@eval $op(x::AbstractIrrational, y::AbstractIrrational) = $op(Float64(x),Float64(y))
Expand Down
10 changes: 6 additions & 4 deletions base/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -771,9 +771,11 @@ conj(x) = x
"""
widen(x)
If `x` is a type, return a "larger" type (for numeric types, this will be
a type with at least as much range and precision as the argument, and usually more).
Otherwise `x` is converted to `widen(typeof(x))`.
If `x` is a type, return a "larger" type, defined so that arithmetic operations
`+` and `-` are guaranteed not to overflow nor lose precision for any combination
of values that type `x` can hold.
If `x` is a value, it is converted to `widen(typeof(x))`.
# Examples
```jldoctest
Expand All @@ -784,7 +786,7 @@ julia> widen(1.5f0)
1.5
```
"""
widen(x::T) where {T<:Number} = convert(widen(T), x)
widen(x::T) where {T} = convert(widen(T), x)

# function pipelining

Expand Down
3 changes: 3 additions & 0 deletions base/range.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ range(a::AbstractFloat, st::Real, len::Integer) = range(a, float(st), len)

abstract type AbstractRange{T} <: AbstractArray{T,1} end

TypeRangeStep(::Type{<:AbstractRange}) = RangeStepIrregular()
TypeRangeStep(::Type{<:AbstractRange{<:Integer}}) = RangeStepRegular()

## ordinal ranges

abstract type OrdinalRange{T,S} <: AbstractRange{T} end
Expand Down
37 changes: 37 additions & 0 deletions base/traits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,40 @@ TypeArithmetic(instance) = TypeArithmetic(typeof(instance))
TypeArithmetic(::Type{<:AbstractFloat}) = ArithmeticRounds()
TypeArithmetic(::Type{<:Integer}) = ArithmeticOverflows()
TypeArithmetic(::Type{<:Any}) = ArithmeticUnknown()

# trait for objects that support ranges with regular step
"""
TypeRangeStep(instance)
TypeRangeStep(T::Type)
Indicate whether an instance or a type supports constructing a range with
a perfectly regular step or not. A regular step means that
[`step`](@ref) will always be exactly equal to the difference between two
subsequent elements in a range, i.e. for a range `r::AbstractRange{T}`:
```julia
all(diff(r) .== step(r))
```
When a type `T` always leads to ranges with regular steps, it should
define the following method:
```julia
Base.TypeRangeStep(::Type{<:AbstractRange{<:T}}) = Base.RangeStepRegular()
```
This will allow [`hash`](@ref) to use an O(1) algorithm for `AbstractRange{T}`
objects instead of the default O(N) algorithm (with N the length of the range).
In some cases, whether the step will be regular depends not only on the
element type `T`, but also on the type of the step `S`. In that case, more
specific methods should be defined:
```julia
Base.TypeRangeStep(::Type{<:OrdinalRange{<:T, <:S}}) = Base.RangeStepRegular()
```
By default, all range types are assumed to be `RangeStepIrregular`, except
ranges with an element type which is a subtype of `Integer`.
"""
abstract type TypeRangeStep end
struct RangeStepRegular <: TypeRangeStep end # range with regular step
struct RangeStepIrregular <: TypeRangeStep end # range with rounding error

TypeRangeStep(instance) = TypeRangeStep(typeof(instance))
1 change: 1 addition & 0 deletions stdlib/Dates/src/periods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ end
# intfuncs
Base.gcdx(a::T, b::T) where {T<:Period} = ((g, x, y) = gcdx(value(a), value(b)); return T(g), x, y)
Base.abs(a::T) where {T<:Period} = T(abs(value(a)))
Base.sign(x::Period) = sign(value(x))

periodisless(::Period,::Year) = true
periodisless(::Period,::Month) = true
Expand Down
4 changes: 4 additions & 0 deletions stdlib/Dates/src/ranges.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ Base.done(r::StepRange{<:TimeType,<:Period}, i::Integer) = length(r) <= i
+(x::Period, r::AbstractRange{<:TimeType}) = (x + first(r)):step(r):(x + last(r))
+(r::AbstractRange{<:TimeType}, x::Period) = x + r
-(r::AbstractRange{<:TimeType}, x::Period) = (first(r)-x):step(r):(last(r)-x)

# Combinations of types and periods for which the range step is regular
Base.TypeRangeStep(::Type{<:OrdinalRange{<:TimeType, <:FixedPeriod}}) =
Base.RangeStepRegular()
3 changes: 3 additions & 0 deletions stdlib/Dates/test/periods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ using Test
@test mod.([t, t, t, t, t], Dates.Year(2)) == ([t, t, t, t, t])
@test [t, t, t] / t2 == [0.5, 0.5, 0.5]
@test abs(-t) == t
@test sign(t) == sign(t2) == 1
@test sign(-t) == sign(-t2) == -1
@test sign(Dates.Year(0)) == 0
end
@testset "div/mod/gcd/lcm/rem" begin
@test Dates.Year(10) % Dates.Year(4) == Dates.Year(2)
Expand Down
16 changes: 16 additions & 0 deletions stdlib/Dates/test/ranges.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ let
@test !(l1 in dr)
@test !(f1 - pos_step in dr)
@test !(l1 + pos_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = f:pos_step:l
Expand All @@ -58,6 +60,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -92,6 +96,8 @@ let
@test !(l1 in dr)
@test !(l1 - neg_step in dr)
@test !(l1 + neg_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = l:neg_step:f
Expand All @@ -112,6 +118,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -148,6 +156,8 @@ let
@test !(l1 in dr)
@test !(f1 - pos_step in dr)
@test !(l1 + pos_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = f:pos_step:l
Expand All @@ -168,6 +178,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down Expand Up @@ -202,6 +214,8 @@ let
@test !(l1 in dr)
@test !(l1 - neg_step in dr)
@test !(l1 + neg_step in dr)
@test dr == []
@test hash(dr) == hash([])

for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
dr = l:neg_step:f
Expand All @@ -222,6 +236,8 @@ let
@test length(dr1) == len
@test findin(dr, dr) == [1:len;]
@test length([dr;]) == len
@test dr == dr1
@test hash(dr) == hash(dr1)
end
@test !isempty(reverse(dr))
@test length(reverse(dr)) == len
Expand Down
4 changes: 2 additions & 2 deletions test/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -388,8 +388,8 @@ function test_vector_indexing(::Type{T}, shape, ::Type{TestAbstractArray}) where
idxs = rand(1:N, 3, 3, 3)
@test B[idxs] == A[idxs] == idxs
@test B[vec(idxs)] == A[vec(idxs)] == vec(idxs)
@test B[:] == A[:] == collect(1:N)
@test B[1:end] == A[1:end] == collect(1:N)
@test B[:] == A[:] == 1:N
@test B[1:end] == A[1:end] == 1:N
@test B[:,:,trailing2] == A[:,:,trailing2] == B[:,:,1,trailing3] == A[:,:,1,trailing3]
B[1:end,1:end,trailing2] == A[1:end,1:end,trailing2] == B[1:end,1:end,1,trailing3] == A[1:end,1:end,1,trailing3]

Expand Down
6 changes: 3 additions & 3 deletions test/arrayops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ end
# base case w/ Vector
a = collect(1:10)
filter!(x -> x > 5, a)
@test a == collect(6:10)
@test a == 6:10

# different subtype of AbstractVector
ba = rand(10) .> 0.5
Expand Down Expand Up @@ -1761,8 +1761,8 @@ end
for A in (reshape(collect(1:20), 4, 5),
reshape(1:20, 4, 5))
local A
@test slicedim(A, 1, 2) == collect(2:4:20)
@test slicedim(A, 2, 2) == collect(5:8)
@test slicedim(A, 1, 2) == 2:4:20
@test slicedim(A, 2, 2) == 5:8
@test_throws ArgumentError slicedim(A,0,1)
@test slicedim(A, 3, 1) == A
@test_throws BoundsError slicedim(A, 3, 2)
Expand Down
Loading

1 comment on commit 8872f90

@vchuravy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier run("array", vs="@b57a59256fbe646e63153cb35fc09dd6bc3d9ae9")

Please sign in to comment.