Make arrays and ranges hash and compare equal (#16401)

* Make vectors and ranges hash and compare equal When hashing AbstractVectors, first check whether their first elements are equal to a range, and hash them as a such if that's the case. This allows for O(1) hashing of (some) ranges consistent with AbstractArrays, which means they can now compare equal. Types which have a regular range step have to use the new TypeRangeStep trait to enable O(1) hashing rather than the O(N) AbstractArray fallback. Apply the new trait to date ranges which have a regular step. Add tests for the new behaviors.
JuliaLang · Dec 21, 2017 · 8872f90 · 8872f90 · vchuravy · Dec 22, 2017
1 parent bd04c13
commit 8872f90
Show file tree

Hide file tree

Showing 28 changed files with 329 additions and 82 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -344,6 +344,13 @@ This section lists changes that do not have deprecation warnings.
   * `eachindex(A, B...)` now requires that all inputs have the same number of elements.
     When the chosen indexing is Cartesian, they must have the same axes.
 
+  * `AbstractRange` objects are now considered as equal to other `AbstractArray` objects
+    by `==` and `isequal` if all of their elements are equal ([#16401]).
+    This has required changing the hashing algorithm: ranges now use an O(N) fallback
+    instead of a O(1) specialized method unless they define the `Base.TypeRangeStep`
+    trait; see its documentation for details. Types which support subtraction (operator
+    `-`) must now implement `widen` for hashing to work inside heterogeneous arrays.
+
 Library improvements
 --------------------
 

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
@@ -1547,9 +1547,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
     if axes(A) != axes(B)
         return false
     end
-    if isa(A,AbstractRange) != isa(B,AbstractRange)
-        return false
-    end
     for (a, b) in zip(A, B)
         if !isequal(a, b)
             return false
@@ -1570,9 +1567,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
     if axes(A) != axes(B)
         return false
     end
-    if isa(A,AbstractRange) != isa(B,AbstractRange)
-        return false
-    end
     anymissing = false
     for (a, b) in zip(A, B)
         eq = (a == b)
@@ -1944,19 +1938,92 @@ unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)
 
 const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
 const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
-function hash(a::AbstractArray, h::UInt)
+const hashr_seed   = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
+
+# Efficient O(1) method equivalent to the O(N) AbstractArray fallback,
+# which works only for ranges with regular step (RangeStepRegular)
+function hash_range(r::AbstractRange, h::UInt)
+    h += hashaa_seed
+    h += hash(size(r))
+
+    length(r) == 0 && return h
+    h = hash(first(r), h)
+    length(r) == 1 && return h
+    length(r) == 2 && return hash(last(r), h)
+
+    h += hashr_seed
+    h = hash(length(r), h)
+    h = hash(last(r), h)
+end
+
+function hash(a::AbstractArray{T}, h::UInt) where T
+    # O(1) hashing for types with regular step
+    if isa(a, AbstractRange) && isa(TypeRangeStep(a), RangeStepRegular)
+        return hash_range(a, h)
+    end
+
     h += hashaa_seed
     h += hash(size(a))
 
     state = start(a)
     done(a, state) && return h
+    x1, state = next(a, state)
+    done(a, state) && return hash(x1, h)
     x2, state = next(a, state)
-    done(a, state) && return hash(x2, h)
+    done(a, state) && return hash(x2, hash(x1, h))
+
+    # Check whether the array is equal to a range, and hash the elements
+    # at the beginning of the array as such as long as they match this assumption
+    # This needs to be done even for non-RangeStepRegular types since they may still be equal
+    # to RangeStepRegular values (e.g. 1.0:3.0 == 1:3)
+    if isa(a, AbstractVector) && applicable(-, x2, x1)
+        n = 1
+        local step, laststep, laststate
+        while true
+            # If overflow happens with entries of the same type, a cannot be equal
+            # to a range with more than two elements because more extreme values
+            # cannot be represented. We must still hash the two first values as a
+            # range since they can always be considered as such (in a wider type)
+            if isconcrete(T)
+                try
+                    step = x2 - x1
+                catch err
+                    isa(err, OverflowError) || rethrow(err)
+                    break
+                end
+                # If true, wraparound overflow happened
+                sign(step) == cmp(x2, x1) || break
+            else
+                # widen() is here to ensure no overflow can happen
+                step = widen(x2) - widen(x1)
+            end
+            n > 1 && !isequal(step, laststep) && break
+            n += 1
+            x1 = x2
+            laststep = step
+            laststate = state
+            done(a, state) && break
+            x2, state = next(a, state)
+        end
 
-    x1 = x2
-    while !done(a, state)
-        x1 = x2
-        x2, state = next(a, state)
+        h = hash(first(a), h)
+        h += hashr_seed
+        # Always hash at least the two first elements as a range (even in case of overflow)
+        if n < 2
+            h = hash(2, h)
+            h = hash(x2, h)
+            done(a, state) && return h
+            x1 = x2
+            x2, state = next(a, state)
+        else
+            h = hash(n, h)
+            h = hash(x1, h)
+            done(a, laststate) && return h
+        end
+    end
+
+    # Hash elements which do not correspond to a range (if any)
+    while true
         if isequal(x2, x1)
             # For repeated elements, use run length encoding
             # This allows efficient hashing of sparse arrays
@@ -1970,7 +2037,10 @@ function hash(a::AbstractArray, h::UInt)
             h = hash(runlength, h)
         end
         h = hash(x1, h)
+        done(a, state) && break
+        x1 = x2
+        x2, state = next(a, state)
     end
     !isequal(x2, x1) && (h = hash(x2, h))
     return h
-end
+end
diff --git a/base/char.jl b/base/char.jl
@@ -88,6 +88,7 @@ in(x::Char, y::Char) = x == y
 isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
 hash(x::Char, h::UInt) =
     hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
+widen(::Type{Char}) = Char
 
 -(x::Char, y::Char) = Int(x) - Int(y)
 -(x::Char, y::Integer) = Char(Int32(x) - Int32(y))

diff --git a/base/hashing.jl b/base/hashing.jl
@@ -11,7 +11,9 @@ optional second argument `h` is a hash code to be mixed with the result.
 New types should implement the 2-argument form, typically by calling the 2-argument `hash`
 method recursively in order to mix hashes of the contents with each other (and with `h`).
 Typically, any type that implements `hash` should also implement its own `==` (hence
-`isequal`) to guarantee the property mentioned above.
+`isequal`) to guarantee the property mentioned above. Types supporting subtraction
+(operator `-`) should also implement [`widen`](@ref), which is required to hash
+values inside heterogeneous arrays.
 """
 hash(x::Any) = hash(x, zero(UInt))
 hash(w::WeakRef, h::UInt) = hash(w.value, h)
@@ -73,12 +75,3 @@ else
 end
 
 hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))
-
-# hashing ranges by component at worst leads to collisions for very similar ranges
-const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
-function hash(r::AbstractRange, h::UInt)
-    h += hashr_seed
-    h = hash(first(r), h)
-    h = hash(step(r), h)
-    h = hash(last(r), h)
-end
diff --git a/base/hashing2.jl b/base/hashing2.jl
@@ -178,4 +178,4 @@ function hash(s::Union{String,SubString{String}}, h::UInt)
     # note: use pointer(s) here (see #6058).
     ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), pointer(s), sizeof(s), h % UInt32) + h
 end
-hash(s::AbstractString, h::UInt) = hash(String(s), h)
+hash(s::AbstractString, h::UInt) = hash(String(s), h)
diff --git a/base/irrationals.jl b/base/irrationals.jl
@@ -116,6 +116,8 @@ isone(::AbstractIrrational) = false
 
 hash(x::Irrational, h::UInt) = 3*object_id(x) - h
 
+widen(::Type{T}) where {T<:Irrational} = T
+
 -(x::AbstractIrrational) = -Float64(x)
 for op in Symbol[:+, :-, :*, :/, :^]
     @eval $op(x::AbstractIrrational, y::AbstractIrrational) = $op(Float64(x),Float64(y))

diff --git a/base/operators.jl b/base/operators.jl
@@ -771,9 +771,11 @@ conj(x) = x
 """
     widen(x)
 
-If `x` is a type, return a "larger" type (for numeric types, this will be
-a type with at least as much range and precision as the argument, and usually more).
-Otherwise `x` is converted to `widen(typeof(x))`.
+If `x` is a type, return a "larger" type, defined so that arithmetic operations
+`+` and `-` are guaranteed not to overflow nor lose precision for any combination
+of values that type `x` can hold.
+
+If `x` is a value, it is converted to `widen(typeof(x))`.
 
 # Examples
 ```jldoctest
@@ -784,7 +786,7 @@ julia> widen(1.5f0)
 1.5
 ```
 """
-widen(x::T) where {T<:Number} = convert(widen(T), x)
+widen(x::T) where {T} = convert(widen(T), x)
 
 # function pipelining
 

diff --git a/base/range.jl b/base/range.jl
@@ -79,6 +79,9 @@ range(a::AbstractFloat, st::Real, len::Integer) = range(a, float(st), len)
 
 abstract type AbstractRange{T} <: AbstractArray{T,1} end
 
+TypeRangeStep(::Type{<:AbstractRange}) = RangeStepIrregular()
+TypeRangeStep(::Type{<:AbstractRange{<:Integer}}) = RangeStepRegular()
+
 ## ordinal ranges
 
 abstract type OrdinalRange{T,S} <: AbstractRange{T} end

diff --git a/base/traits.jl b/base/traits.jl
@@ -20,3 +20,40 @@ TypeArithmetic(instance) = TypeArithmetic(typeof(instance))
 TypeArithmetic(::Type{<:AbstractFloat}) = ArithmeticRounds()
 TypeArithmetic(::Type{<:Integer}) = ArithmeticOverflows()
 TypeArithmetic(::Type{<:Any}) = ArithmeticUnknown()
+
+# trait for objects that support ranges with regular step
+"""
+    TypeRangeStep(instance)
+    TypeRangeStep(T::Type)
+
+Indicate whether an instance or a type supports constructing a range with
+a perfectly regular step or not. A regular step means that
+[`step`](@ref) will always be exactly equal to the difference between two
+subsequent elements in a range, i.e. for a range `r::AbstractRange{T}`:
+```julia
+all(diff(r) .== step(r))
+```
+
+When a type `T` always leads to ranges with regular steps, it should
+define the following method:
+```julia
+Base.TypeRangeStep(::Type{<:AbstractRange{<:T}}) = Base.RangeStepRegular()
+```
+This will allow [`hash`](@ref) to use an O(1) algorithm for `AbstractRange{T}`
+objects instead of the default O(N) algorithm (with N the length of the range).
+
+In some cases, whether the step will be regular depends not only on the
+element type `T`, but also on the type of the step `S`. In that case, more
+specific methods should be defined:
+```julia
+Base.TypeRangeStep(::Type{<:OrdinalRange{<:T, <:S}}) = Base.RangeStepRegular()
+```
+
+By default, all range types are assumed to be `RangeStepIrregular`, except
+ranges with an element type which is a subtype of `Integer`.
+"""
+abstract type TypeRangeStep end
+struct RangeStepRegular   <: TypeRangeStep end # range with regular step
+struct RangeStepIrregular <: TypeRangeStep end # range with rounding error
+
+TypeRangeStep(instance) = TypeRangeStep(typeof(instance))
diff --git a/stdlib/Dates/src/periods.jl b/stdlib/Dates/src/periods.jl
@@ -101,6 +101,7 @@ end
 # intfuncs
 Base.gcdx(a::T, b::T) where {T<:Period} = ((g, x, y) = gcdx(value(a), value(b)); return T(g), x, y)
 Base.abs(a::T) where {T<:Period} = T(abs(value(a)))
+Base.sign(x::Period) = sign(value(x))
 
 periodisless(::Period,::Year)        = true
 periodisless(::Period,::Month)       = true

diff --git a/stdlib/Dates/src/ranges.jl b/stdlib/Dates/src/ranges.jl
@@ -38,3 +38,7 @@ Base.done(r::StepRange{<:TimeType,<:Period}, i::Integer) = length(r) <= i
 +(x::Period, r::AbstractRange{<:TimeType}) = (x + first(r)):step(r):(x + last(r))
 +(r::AbstractRange{<:TimeType}, x::Period) = x + r
 -(r::AbstractRange{<:TimeType}, x::Period) = (first(r)-x):step(r):(last(r)-x)
+
+# Combinations of types and periods for which the range step is regular
+Base.TypeRangeStep(::Type{<:OrdinalRange{<:TimeType, <:FixedPeriod}}) =
+    Base.RangeStepRegular()
diff --git a/stdlib/Dates/test/periods.jl b/stdlib/Dates/test/periods.jl
@@ -26,6 +26,9 @@ using Test
     @test mod.([t, t, t, t, t], Dates.Year(2)) == ([t, t, t, t, t])
     @test [t, t, t] / t2 == [0.5, 0.5, 0.5]
     @test abs(-t) == t
+    @test sign(t) == sign(t2) == 1
+    @test sign(-t) == sign(-t2) == -1
+    @test sign(Dates.Year(0)) == 0
 end
 @testset "div/mod/gcd/lcm/rem" begin
     @test Dates.Year(10) % Dates.Year(4) == Dates.Year(2)

diff --git a/stdlib/Dates/test/ranges.jl b/stdlib/Dates/test/ranges.jl
@@ -38,6 +38,8 @@ let
                 @test !(l1 in dr)
                 @test !(f1 - pos_step in dr)
                 @test !(l1 + pos_step in dr)
+                @test dr == []
+                @test hash(dr) == hash([])
 
                 for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
                     dr = f:pos_step:l
@@ -58,6 +60,8 @@ let
                         @test length(dr1) == len
                         @test findin(dr, dr) == [1:len;]
                         @test length([dr;]) == len
+                        @test dr == dr1
+                        @test hash(dr) == hash(dr1)
                     end
                     @test !isempty(reverse(dr))
                     @test length(reverse(dr)) == len
@@ -92,6 +96,8 @@ let
                 @test !(l1 in dr)
                 @test !(l1 - neg_step in dr)
                 @test !(l1 + neg_step in dr)
+                @test dr == []
+                @test hash(dr) == hash([])
 
                 for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
                     dr = l:neg_step:f
@@ -112,6 +118,8 @@ let
                         @test length(dr1) == len
                         @test findin(dr, dr) == [1:len;]
                         @test length([dr;]) == len
+                        @test dr == dr1
+                        @test hash(dr) == hash(dr1)
                     end
                     @test !isempty(reverse(dr))
                     @test length(reverse(dr)) == len
@@ -148,6 +156,8 @@ let
                     @test !(l1 in dr)
                     @test !(f1 - pos_step in dr)
                     @test !(l1 + pos_step in dr)
+                    @test dr == []
+                    @test hash(dr) == hash([])
 
                     for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
                         dr = f:pos_step:l
@@ -168,6 +178,8 @@ let
                             @test length(dr1) == len
                             @test findin(dr, dr) == [1:len;]
                             @test length([dr;]) == len
+                            @test dr == dr1
+                            @test hash(dr) == hash(dr1)
                         end
                         @test !isempty(reverse(dr))
                         @test length(reverse(dr)) == len
@@ -202,6 +214,8 @@ let
                     @test !(l1 in dr)
                     @test !(l1 - neg_step in dr)
                     @test !(l1 + neg_step in dr)
+                    @test dr == []
+                    @test hash(dr) == hash([])
 
                     for (f, l) in ((f2, l2), (f3, l3), (f4, l4))
                         dr = l:neg_step:f
@@ -222,6 +236,8 @@ let
                             @test length(dr1) == len
                             @test findin(dr, dr) == [1:len;]
                             @test length([dr;]) == len
+                            @test dr == dr1
+                            @test hash(dr) == hash(dr1)
                         end
                         @test !isempty(reverse(dr))
                         @test length(reverse(dr)) == len

diff --git a/test/abstractarray.jl b/test/abstractarray.jl
@@ -388,8 +388,8 @@ function test_vector_indexing(::Type{T}, shape, ::Type{TestAbstractArray}) where
         idxs = rand(1:N, 3, 3, 3)
         @test B[idxs] == A[idxs] == idxs
         @test B[vec(idxs)] == A[vec(idxs)] == vec(idxs)
-        @test B[:] == A[:] == collect(1:N)
-        @test B[1:end] == A[1:end] == collect(1:N)
+        @test B[:] == A[:] == 1:N
+        @test B[1:end] == A[1:end] == 1:N
         @test B[:,:,trailing2] == A[:,:,trailing2] == B[:,:,1,trailing3] == A[:,:,1,trailing3]
             B[1:end,1:end,trailing2] == A[1:end,1:end,trailing2] == B[1:end,1:end,1,trailing3] == A[1:end,1:end,1,trailing3]
 

diff --git a/test/arrayops.jl b/test/arrayops.jl
@@ -1125,7 +1125,7 @@ end
     # base case w/ Vector
     a = collect(1:10)
     filter!(x -> x > 5, a)
-    @test a == collect(6:10)
+    @test a == 6:10
 
     # different subtype of AbstractVector
     ba = rand(10) .> 0.5
@@ -1761,8 +1761,8 @@ end
     for A in (reshape(collect(1:20), 4, 5),
               reshape(1:20, 4, 5))
         local A
-        @test slicedim(A, 1, 2) == collect(2:4:20)
-        @test slicedim(A, 2, 2) == collect(5:8)
+        @test slicedim(A, 1, 2) == 2:4:20
+        @test slicedim(A, 2, 2) == 5:8
         @test_throws ArgumentError slicedim(A,0,1)
         @test slicedim(A, 3, 1) == A
         @test_throws BoundsError slicedim(A, 3, 2)