diff --git a/base/abstractarray.jl b/base/abstractarray.jl index 8f55e6a56eba8..94bf3170feb38 100644 --- a/base/abstractarray.jl +++ b/base/abstractarray.jl @@ -3567,81 +3567,6 @@ pushfirst!(A, a, b, c...) = pushfirst!(pushfirst!(A, c...), a, b) # sizehint! does not nothing by default sizehint!(a::AbstractVector, _) = a -## hashing AbstractArray ## - -const hash_abstractarray_seed = UInt === UInt64 ? 0x7e2d6fb6448beb77 : 0xd4514ce5 -function hash(A::AbstractArray, h::UInt) - h ⊻= hash_abstractarray_seed - # Axes are themselves AbstractArrays, so hashing them directly would stack overflow - # Instead hash the tuple of firsts and lasts along each dimension - h = hash(map(first, axes(A)), h) - h = hash(map(last, axes(A)), h) - - # For short arrays, it's not worth doing anything complicated - if length(A) < 8192 - for x in A - h = hash(x, h) - end - return h - end - - # Goal: Hash approximately log(N) entries with a higher density of hashed elements - # weighted towards the end and special consideration for repeated values. Colliding - # hashes will often subsequently be compared by equality -- and equality between arrays - # works elementwise forwards and is short-circuiting. This means that a collision - # between arrays that differ by elements at the beginning is cheaper than one where the - # difference is towards the end. Furthermore, choosing `log(N)` arbitrary entries from a - # sparse array will likely only choose the same element repeatedly (zero in this case). - - # To achieve this, we work backwards, starting by hashing the last element of the - # array. After hashing each element, we skip `fibskip` elements, where `fibskip` - # is pulled from the Fibonacci sequence -- Fibonacci was chosen as a simple - # ~O(log(N)) algorithm that ensures we don't hit a common divisor of a dimension - # and only end up hashing one slice of the array (as might happen with powers of - # two). Finally, we find the next distinct value from the one we just hashed. 
- - # This is a little tricky since skipping an integer number of values inherently works - # with linear indices, but `findprev` uses `keys`. Hoist out the conversion "maps": - ks = keys(A) - key_to_linear = LinearIndices(ks) # Index into this map to compute the linear index - linear_to_key = vec(ks) # And vice-versa - - # Start at the last index - keyidx = last(ks) - linidx = key_to_linear[keyidx] - fibskip = prevfibskip = oneunit(linidx) - first_linear = first(LinearIndices(linear_to_key)) - n = 0 - while true - n += 1 - # Hash the element - elt = A[keyidx] - h = hash(keyidx=>elt, h) - - # Skip backwards a Fibonacci number of indices -- this is a linear index operation - linidx = key_to_linear[keyidx] - linidx < fibskip + first_linear && break - linidx -= fibskip - keyidx = linear_to_key[linidx] - - # Only increase the Fibonacci skip once every N iterations. This was chosen - # to be big enough that all elements of small arrays get hashed while - # obscenely large arrays are still tractable. With a choice of N=4096, an - # entirely-distinct 8000-element array will have ~75% of its elements hashed, - # with every other element hashed in the first half of the array. At the same - # time, hashing a `typemax(Int64)`-length Float64 range takes about a second. - if rem(n, 4096) == 0 - fibskip, prevfibskip = fibskip + prevfibskip, fibskip - end - - # Find a key index with a value distinct from `elt` -- might be `keyidx` itself - keyidx = findprev(!isequal(elt), A, keyidx) - keyidx === nothing && break - end - - return h -end - # The semantics of `collect` are weird. Better to write our own function rest(a::AbstractArray{T}, state...) where {T} v = Vector{T}(undef, 0) @@ -3650,7 +3575,6 @@ function rest(a::AbstractArray{T}, state...) where {T} return foldl(push!, Iterators.rest(a, state...), init=v) end - ## keepat! 
## # NOTE: since these use `@inbounds`, they are actually only intended for Vector and BitVector diff --git a/base/hashing.jl b/base/hashing.jl index 1b323f2e9097e..897a0d73cb874 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -45,7 +45,7 @@ end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) # faster-but-weaker than hash_mix intended for small keys -hash_mix_linear(x::UInt64, h::UInt) = 3h - x +hash_mix_linear(x::Union{UInt64, UInt32}, h::UInt) = 3h - x function hash_finalizer(x::UInt64) x ⊻= (x >> 32) x *= 0x63652a4cd374b267 diff --git a/base/multidimensional.jl b/base/multidimensional.jl index edf71927661ca..83a03fc1b45bf 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -2017,3 +2017,105 @@ end getindex(b::Ref, ::CartesianIndex{0}) = getindex(b) setindex!(b::Ref, x, ::CartesianIndex{0}) = setindex!(b, x) + +## hashing AbstractArray ## can't be put in abstractarray.jl due to bootstrapping problems with the use of @nexprs + +function _hash_fib(A, h::UInt) + # Goal: Hash approximately log(N) entries with a higher density of hashed elements + # weighted towards the end and special consideration for repeated values. Colliding + # hashes will often subsequently be compared by equality -- and equality between arrays + # works elementwise forwards and is short-circuiting. This means that a collision + # between arrays that differ by elements at the beginning is cheaper than one where the + # difference is towards the end. Furthermore, choosing `log(N)` arbitrary entries from a + # sparse array will likely only choose the same element repeatedly (zero in this case). + + # To achieve this, we work backwards, starting by hashing the last element of the + # array. 
After hashing each element, we skip `fibskip` elements, where `fibskip` + # is pulled from the Fibonacci sequence -- Fibonacci was chosen as a simple + # ~O(log(N)) algorithm that ensures we don't hit a common divisor of a dimension + # and only end up hashing one slice of the array (as might happen with powers of + # two). Finally, we find the next distinct value from the one we just hashed. + + # This is a little tricky since skipping an integer number of values inherently works + # with linear indices, but `findprev` uses `keys`. Hoist out the conversion "maps": + ks = keys(A) + key_to_linear = LinearIndices(ks) # Index into this map to compute the linear index + linear_to_key = vec(ks) # And vice-versa + + # Start at the last index + keyidx = last(ks) + linidx = key_to_linear[keyidx] + fibskip = prevfibskip = oneunit(linidx) + first_linear = first(LinearIndices(linear_to_key)) + @nexprs 4 i -> p_i = h + + n = 0 + while true + n += 1 + # Hash the element + elt = A[keyidx] + + stream_idx = mod1(n, 4) + @nexprs 4 i -> stream_idx == i && (p_i = hash_mix_linear(hash(keyidx, p_i), hash(elt, p_i))) + + # Skip backwards a Fibonacci number of indices -- this is a linear index operation + linidx = key_to_linear[keyidx] + linidx < fibskip + first_linear && break + linidx -= fibskip + keyidx = linear_to_key[linidx] + + # Only increase the Fibonacci skip once every N iterations. This was chosen + # to be big enough that all elements of small arrays get hashed while + # obscenely large arrays are still tractable. With a choice of N=4096, an + # entirely-distinct 8000-element array will have ~75% of its elements hashed, + # with every other element hashed in the first half of the array. At the same + # time, hashing a `typemax(Int64)`-length Float64 range takes about a second. 
+ if rem(n, 4096) == 0 + fibskip, prevfibskip = fibskip + prevfibskip, fibskip + end + + # Find a key index with a value distinct from `elt` -- might be `keyidx` itself + keyidx = findprev(!isequal(elt), A, keyidx) + keyidx === nothing && break + end + + @nexprs 4 i -> h = hash_mix_linear(p_i, h) + return hash_uint(h) +end + +function hash_shaped(A, h::UInt) + # Axes are themselves AbstractArrays, so hashing them directly would stack overflow + # Instead hash the tuple of firsts and lasts along each dimension + h = hash(map(first, axes(A)), h) + h = hash(map(last, axes(A)), h) + len = length(A) + + if len < 8 + # for the shortest arrays we chain directly + for elt in A + h = hash(elt, h) + end + return h + elseif len < 32768 + # separate accumulator streams, unrolled + @nexprs 8 i -> p_i = h + n = 1 + limit = len - 7 + while n <= limit + @nexprs 8 i -> p_i = hash(A[n + i - 1], p_i) + n += 8 + end + while n <= len + p_1 = hash(A[n], p_1) + n += 1 + end + # fold all streams back together + @nexprs 8 i -> h = hash_mix_linear(p_i, h) + return hash_uint(h) + else + return _hash_fib(A, h) + end +end + +const hash_abstractarray_seed = UInt === UInt64 ? 0x7e2d6fb6448beb77 : 0xd4514ce5 +hash(A::AbstractArray, h::UInt) = hash_shaped(A, h ⊻ hash_abstractarray_seed)