Skip to content

Reduce spurious allocations in setindex and getindex #1301

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/MArray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ end
end

@propagate_inbounds function setindex!(v::MArray, val, i::Int)
@boundscheck checkbounds(v,i)
@inline
@boundscheck checkbounds((v),i)
T = eltype(v)

if isbitstype(T)
Expand Down
3 changes: 2 additions & 1 deletion src/SArray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ sacollect
####################

# Scalar linear indexing into an SArray: reads element `i` of the `data` field.
# NOTE(review): this span comes from a rendered diff ("2 additions & 1 deletion") with the
# +/- markers lost, so it appears to contain both the pre-change body line and the two
# lines replacing it -- confirm against the real src/SArray.jl before reading it as one body.
@propagate_inbounds function getindex(v::SArray, i::Int)
getfield(v,:data)[i]  # presumably the removed (pre-change) line
@boundscheck checkbounds(v, i)  # check `i` against `v` itself; skipped when bounds checks are elided
@inbounds getfield(v,:data)[i]  # index the `data` field with bounds checking turned off
end

# Converting an SArray to a Tuple hands back its `data` field directly.
@inline function Base.Tuple(v::SArray)
    return getfield(v, :data)
end
Expand Down
2 changes: 1 addition & 1 deletion src/SUnitRange.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ SUnitRange(a::Int, b::Int) = SUnitRange{a, max(0, b - a + 1)}()

# Linear indexing into an SUnitRange: element `i` evaluates to `Start + i - 1`.
# The explicit range test on `i` only runs when bounds checking is active (@boundscheck).
# NOTE(review): this span comes from a rendered diff ("1 addition & 1 deletion") with the
# +/- markers lost; the two consecutive throwing lines look like the old and the new
# version of the same statement -- confirm against the real src/SUnitRange.jl.
@propagate_inbounds function getindex(x::SUnitRange{Start, L}, i::Int) where {Start, L}
@boundscheck if i < 1 || i > L
throw(BoundsError(x, i))  # presumably the removed (pre-change) line
Base.throw_boundserror(x, i)  # presumably its replacement; never reached while the line above remains
end
return Start + i - 1
end
Expand Down
6 changes: 3 additions & 3 deletions src/deque.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ julia> insert(@SVector[6, 5, 4, 2, 1], 4, 3)
return quote
@_propagate_inbounds_meta
@boundscheck if (index < 1 || index > $newlen)
throw(BoundsError(vec, index))
Base.throw_boundserror(vec, index)
end
@inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
end
Expand Down Expand Up @@ -150,7 +150,7 @@ julia> deleteat(@SVector[6, 5, 4, 3, 2, 1], 2)
return quote
@_propagate_inbounds_meta
@boundscheck if (index < 1 || index > $(s[1]))
throw(BoundsError(vec, index))
Base.throw_boundserror(vec, index)
end
@inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
end
Expand Down Expand Up @@ -188,7 +188,7 @@ julia> setindex(@SMatrix[2 4; 6 8], 1, 2)
return quote
@_propagate_inbounds_meta
@boundscheck if (index < 1 || index > $(L))
throw(BoundsError(a, index))
Base.throw_boundserror(a, index)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is identical within a @noinline. Can we define the helper function in this package, instead of relying on Base internals?

end
@inbounds return typeof(a)(tuple($(exprs...)))
end
Expand Down
16 changes: 16 additions & 0 deletions src/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,22 @@ setindex!(a::StaticArray, value, i::Int) = error("setindex!(::$(typeof(a)), valu

# Note: all indexing behavior defaults to dense, linear indexing

# Summarise a static vector *type* as "<length>-element <type>".
function Base.summary(io::IO, T::Type{<:StaticVector})
    return print(io, length(T), "-element ", T)
end

# Summarise a static array *type* as "<d1>x<d2>x... <type>".
function Base.summary(io::IO, T::Type{<:StaticArray})
    dims = join(size(T), "x")
    return print(io, dims, " ", T)
end

"""
    Base.throw_boundserror(T::Type{<:StaticArray}, I)

Throw a `BoundsError` that carries only the *type* of the StaticArray whose
bounds check failed, together with the offending index `I`.

Storing the array itself would require boxing it (and hence an allocation)
whenever bounds checking is active, because the array would have to fit into
`BoundsError.a::Any`. Recording just the type avoids that.
"""
@noinline function Base.throw_boundserror(T::Type{<:StaticArray}, I)
    throw(BoundsError(T, I))
end

# Instance entry point: forward to the `Type`-based method using `typeof(A)`,
# so the array value itself is not passed on to the error constructor.
@inline function Base.throw_boundserror(A::StaticArray, I)
    return Base.throw_boundserror(typeof(A), I)
end

@propagate_inbounds function getindex(a::StaticArray, inds::Int...)
@boundscheck checkbounds(a, inds...)
_getindex_scalar(Size(a), a, inds...)
Expand Down
57 changes: 23 additions & 34 deletions test/matrix_multiply_add.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,6 @@ using LinearAlgebra
using BenchmarkTools
using Test

# Evaluate `ex` once, then `@test` that evaluating it a second time allocates zero bytes.
# The whole expansion is escaped, so `ex` runs in the caller's scope.
# NOTE(review): the enclosing hunk header (-3,13 +3,6) suggests this definition is the
# removed side of the diff -- confirm before relying on it.
macro test_noalloc(ex)
esc(quote
$ex  # first evaluation, outside the measurement (presumably a warm-up so compilation is not counted)
@test(@allocated($ex) == 0)
end)
end

mul_add_wrappers = [
m -> m,
m -> Symmetric(m, :U),
Expand Down Expand Up @@ -94,22 +87,18 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(b,At,c,1.0,2.0)
@test b ≈ 5A'c

if !(ArrayType <: SizedArray)
@test_noalloc mul!(c,A,b)
else
mul!(c,A,b)
@test_broken(@allocated(mul!(c,A,b)) == 0)
end
@test_noalloc mul!(c,A,b)

expected_transpose_allocs = 0
bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10
@test minimum(bmark).allocs == 0
# @test_noalloc mul!(c, A, b, α, β) # records 32 bytes
@test_noalloc mul!(c, A, b, α, β)
bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs
# @test_noalloc mul!(b, Transpose(A), c) # records 16 bytes
@test_noalloc mul!(b, Transpose(A), c)
bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs
# @test_noalloc mul!(b, Transpose(A), c, α, β) # records 48 bytes
@test_noalloc mul!(b, Transpose(A), c, α, β)

# outer product
C = rand(Mat{N1,N2})
Expand All @@ -122,9 +111,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(C,a,b',1.,1.)
@test C ≈ 3a*b'

b = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
@test minimum(b).allocs <= expected_transpose_allocs
# @test_noalloc mul!(C, a, b') # records 16 bytes
bmark = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs
@test_noalloc mul!(C, a, b')

# A × B
A = rand(Mat{N1,N2})
Expand All @@ -137,9 +126,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(C,A,B,2.0,1.0)
@test C ≈ 4A*B

b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
@test minimum(b).allocs == 0
# @test_noalloc mul!(C, A, B, α, β) # records 32 bytes
bmark = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
@test minimum(bmark).allocs == 0
@test_noalloc mul!(C, A, B, α, β)

# A'B
At = Transpose(A)
Expand All @@ -150,9 +139,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(B,At,C,2.0,1.0)
@test B ≈ 4A'C

b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
@test minimum(b).allocs <= expected_transpose_allocs
# @test_noalloc mul!(B, Transpose(A), C, α, β) # records 48 bytes
bmark = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs
@test_noalloc mul!(B, Transpose(A), C, α, β)

# A*B'
Bt = Transpose(B)
Expand All @@ -163,9 +152,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(C,A,Bt,2.0,1.0)
@test C ≈ 4A*B'

b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
@test minimum(b).allocs <= expected_transpose_allocs
# @test_noalloc mul!(C, A, Transpose(B), α, β) # records 48 bytes
bmark = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs
@test_noalloc mul!(C, A, Transpose(B), α, β)

# A'B'
B = rand(Mat{N1,N1})
Expand All @@ -177,17 +166,17 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
mul!(C,Transpose(A),Transpose(B),2.0,1.0)
@test C ≈ 4A'B'

b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
@test minimum(b).allocs <= 2*expected_transpose_allocs
# @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # records 64 bytes
bmark = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
@test minimum(bmark).allocs <= 2*expected_transpose_allocs
@test_noalloc mul!(C, Transpose(A), Transpose(B), α, β)

# Transpose Output
C = rand(Mat{N1,N2})
mul!(Transpose(C),Transpose(A),Transpose(B))
@test C' ≈ A'B'
b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
@test minimum(b).allocs <= expected_transpose_allocs*3
# @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) # records 80 bytes
bmark = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
@test minimum(bmark).allocs <= expected_transpose_allocs*3
@test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β)
end

# Test the three different
Expand Down
115 changes: 115 additions & 0 deletions test/testutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,103 @@ macro test_was_once_broken(good_version, ex)
end)
end


@doc raw"""
    @allocated_barrier f(b, g(c))

Return the number of bytes allocated while evaluating an expression from
behind a function barrier.

The call above is effectively translated to:

    (function(b, c)
        @allocated f(b, g(c))
    end)(b, c)

The function barrier improves type stability, which helps the compiler
avoid unnecessary heap allocations.

Functions/functors are not captured as local variables by default, but
they can be captured by prepending the function call with a `$` sign.
Values can also be interpolated with a `$`:

    @allocated_barrier $f(b, $(g(c)))

which effectively translates to:

    (function(f, b, gc)
        @allocated f(b, gc)
    end)(f, b, g(c))

This is useful if `f` is a local variable or if `g(..)` causes allocations
that should be excluded.

Another approach is to wrap each call to `@allocated` with `@eval`:

    @eval @allocated f($a,g($b,$c))
    @eval @allocated $a .= $f.($b, $c)

The number of allocated bytes reported is similar to this macro.
"""
macro allocated_barrier(ex)
    # Maps every captured caller-side symbol/expression to the gensym'd
    # parameter that stands in for it inside the barrier function.
    captured = Dict{Any,Symbol}()

    # Plain identifiers become barrier parameters; operators and other
    # non-identifier symbols are left in place.
    function capture(s::Symbol)
        Base.isidentifier(s) || return s
        return get!(captured, s) do
            gensym(s)
        end
    end

    function capture(expr::Expr)
        args = expr.args
        # Nothing to capture in an argument-less expression such as `()`.
        isempty(args) && return Expr(expr.head)

        if expr.head === :$
            # `$x` / `$(...)`: evaluate in the caller and pass the value in.
            return get!(captured, args[1]) do
                gensym(string(args[1]))
            end
        elseif expr.head === :. && last(args) isa QuoteNode
            # Field access `a.b` is captured as a single value.
            return get!(captured, expr) do
                gensym(join(args, "."))
            end
        else
            # Build a fresh Expr rather than mutating the caller's AST.
            # The callee of an ordinary call is kept as written unless it was
            # explicitly `$`-interpolated.
            head_arg = first(args)
            keep_callee = expr.head === :call &&
                          !(head_arg isa Expr && head_arg.head === :$)
            new_first = keep_callee ? head_arg : capture(head_arg)
            return Expr(expr.head, new_first, map(capture, args[2:end])...)
        end
    end

    # Literals, LineNumberNodes, QuoteNodes, ... pass through untouched.
    capture(x) = x

    inner_ex = capture(ex)

    # A gensym'd thunk name cannot collide with a user function called `f`
    # (or any other name) appearing un-interpolated in `inner_ex`.
    thunk = gensym("thunk")

    quote
        (function($(values(captured)...))

            $thunk() = $inner_ex
            Base.precompile($thunk, ())
            @allocated $thunk()

        end)($(esc.(keys(captured))...))
    end
end


# `@test_noalloc ex` expands to `@test 0 == @allocated_barrier(ex)`, reported at
# the location of the `@test_noalloc` call itself.
macro test_noalloc(ex)
    # The line-number slot of the inner macro call is blanked with `()` so the
    # expression printed by `@test` stays tidy.
    barrier_call = Expr(:macrocall, Symbol("@allocated_barrier"), (), ex)

    # Point `@test` at the caller's source line rather than at this file.
    caller_loc = LineNumberNode(__source__.line, __source__.file)
    test_call = Expr(:macrocall, Symbol("@test"), caller_loc, :(0 == $barrier_call))

    return esc(test_call)
end


@testset "test utils" begin
@testset "@testinf" begin
@testinf [1,2] == [1,2]
Expand All @@ -121,4 +218,22 @@ end
end
@test ts.errorcount == 0 && ts.failcount == 2 && ts.passcount == 0
end

a = rand(3)
z = ones(3)

@testset "@allocated_barrier" begin
@test @allocated_barrier(z .= a .+ z) == 0
@test z ≈ a .+ 1
@test @allocated_barrier(z .= a + z) > 0
@test @allocated_barrier(z .= $(a + z)) == 0

@test @allocated_barrier(z .= abs.(a + z)) > 0
@test @allocated_barrier(z .= $(abs.(a + z))) == 0
@test @allocated_barrier(z .= abs.(a .+ z)) == 0
end

@testset "@test_noalloc" begin
@test_noalloc z .= abs.(a .+ z)
end
end
Loading