diff --git a/src/MArray.jl b/src/MArray.jl index eeadb7ef..d9ed9c3c 100644 --- a/src/MArray.jl +++ b/src/MArray.jl @@ -28,7 +28,8 @@ end end @propagate_inbounds function setindex!(v::MArray, val, i::Int) - @boundscheck checkbounds(v,i) + @inline + @boundscheck checkbounds((v),i) T = eltype(v) if isbitstype(T) diff --git a/src/SArray.jl b/src/SArray.jl index 8f02e256..fcf68a66 100644 --- a/src/SArray.jl +++ b/src/SArray.jl @@ -62,7 +62,8 @@ sacollect #################### @propagate_inbounds function getindex(v::SArray, i::Int) - getfield(v,:data)[i] + @boundscheck checkbounds(v, i) + @inbounds getfield(v,:data)[i] end @inline Base.Tuple(v::SArray) = getfield(v,:data) diff --git a/src/SUnitRange.jl b/src/SUnitRange.jl index 32255887..0d74376d 100644 --- a/src/SUnitRange.jl +++ b/src/SUnitRange.jl @@ -20,7 +20,7 @@ SUnitRange(a::Int, b::Int) = SUnitRange{a, max(0, b - a + 1)}() @propagate_inbounds function getindex(x::SUnitRange{Start, L}, i::Int) where {Start, L} @boundscheck if i < 1 || i > L - throw(BoundsError(x, i)) + Base.throw_boundserror(x, i) end return Start + i - 1 end diff --git a/src/deque.jl b/src/deque.jl index 9ffb2e94..401651b9 100644 --- a/src/deque.jl +++ b/src/deque.jl @@ -75,7 +75,7 @@ julia> insert(@SVector[6, 5, 4, 2, 1], 4, 3) return quote @_propagate_inbounds_meta @boundscheck if (index < 1 || index > $newlen) - throw(BoundsError(vec, index)) + Base.throw_boundserror(vec, index) end @inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...))) end @@ -150,7 +150,7 @@ julia> deleteat(@SVector[6, 5, 4, 3, 2, 1], 2) return quote @_propagate_inbounds_meta @boundscheck if (index < 1 || index > $(s[1])) - throw(BoundsError(vec, index)) + Base.throw_boundserror(vec, index) end @inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...))) end @@ -188,7 +188,7 @@ julia> setindex(@SMatrix[2 4; 6 8], 1, 2) return quote @_propagate_inbounds_meta @boundscheck if (index < 1 || index > $(L)) - throw(BoundsError(a, index)) + 
Base.throw_boundserror(a, index) end @inbounds return typeof(a)(tuple($(exprs...))) end diff --git a/src/indexing.jl b/src/indexing.jl index a42fedf4..ee92f26d 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -8,6 +8,22 @@ setindex!(a::StaticArray, value, i::Int) = error("setindex!(::$(typeof(a)), valu # Note: all indexing behavior defaults to dense, linear indexing +Base.summary(io::IO, T::Type{<:StaticVector}) = + print(io, length(T), "-element ", T) + +Base.summary(io::IO, T::Type{<:StaticArray}) = + print(io, join(size(T), "x"), " ", T) + +""" +Only store the type upon failed bounds checking of a StaticArray to prevent +boxing and the corresponding allocation. Boxing is otherwise needed when +bounds checking is active to make it possible to potentially put the +StaticArray in BoundsError.a::Any. +""" +Base.throw_boundserror(T::Type{<:StaticArray},I) = (@noinline;throw(BoundsError(T,I))) + +Base.throw_boundserror(A::StaticArray,I) = (@inline;Base.throw_boundserror(typeof(A),I)) + @propagate_inbounds function getindex(a::StaticArray, inds::Int...) @boundscheck checkbounds(a, inds...) _getindex_scalar(Size(a), a, inds...) 
diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index 7d5bd8c7..467223ca 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -3,13 +3,6 @@ using LinearAlgebra using BenchmarkTools using Test -macro test_noalloc(ex) - esc(quote - $ex - @test(@allocated($ex) == 0) - end) -end - mul_add_wrappers = [ m -> m, m -> Symmetric(m, :U), @@ -94,22 +87,18 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(b,At,c,1.0,2.0) @test b ≈ 5A'c - if !(ArrayType <: SizedArray) - @test_noalloc mul!(c,A,b) - else - mul!(c,A,b) - @test_broken(@allocated(mul!(c,A,b)) == 0) - end + @test_noalloc mul!(c,A,b) + expected_transpose_allocs = 0 bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10 @test minimum(bmark).allocs == 0 - # @test_noalloc mul!(c, A, b, α, β) # records 32 bytes + @test_noalloc mul!(c, A, b, α, β) bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 @test minimum(bmark).allocs <= expected_transpose_allocs - # @test_noalloc mul!(b, Transpose(A), c) # records 16 bytes + @test_noalloc mul!(b, Transpose(A), c) bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10 @test minimum(bmark).allocs <= expected_transpose_allocs - # @test_noalloc mul!(b, Transpose(A), c, α, β) # records 48 bytes + @test_noalloc mul!(b, Transpose(A), c, α, β) # outer product C = rand(Mat{N1,N2}) @@ -122,9 +111,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,a,b',1.,1.) 
@test C ≈ 3a*b' - b = @benchmark mul!($C,$a,$(b')) samples=10 evals=10 - @test minimum(b).allocs <= expected_transpose_allocs - # @test_noalloc mul!(C, a, b') # records 16 bytes + bmark = @benchmark mul!($C,$a,$(b')) samples=10 evals=10 + @test minimum(bmark).allocs <= expected_transpose_allocs + @test_noalloc mul!(C, a, b') # A × B A = rand(Mat{N1,N2}) @@ -137,9 +126,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,A,B,2.0,1.0) @test C ≈ 4A*B - b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10 - @test minimum(b).allocs == 0 - # @test_noalloc mul!(C, A, B, α, β) # records 32 bytes + bmark = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + @test_noalloc mul!(C, A, B, α, β) # A'B At = Transpose(A) @@ -150,9 +139,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(B,At,C,2.0,1.0) @test B ≈ 4A'C - b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10 - @test minimum(b).allocs <= expected_transpose_allocs - # @test_noalloc mul!(B, Transpose(A), C, α, β) # records 48 bytes + bmark = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs <= expected_transpose_allocs + @test_noalloc mul!(B, Transpose(A), C, α, β) # A*B' Bt = Transpose(B) @@ -163,9 +152,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,A,Bt,2.0,1.0) @test C ≈ 4A*B' - b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10 - @test minimum(b).allocs <= expected_transpose_allocs - # @test_noalloc mul!(C, A, Transpose(B), α, β) # records 48 bytes + bmark = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs <= expected_transpose_allocs + @test_noalloc mul!(C, A, Transpose(B), α, β) # A'B' B = rand(Mat{N1,N1}) @@ -177,17 +166,17 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,Transpose(A),Transpose(B),2.0,1.0) @test C ≈ 4A'B' - b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 - @test 
minimum(b).allocs <= 2*expected_transpose_allocs - # @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # records 64 bytes + bmark = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs <= 2*expected_transpose_allocs + @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # Transpose Output C = rand(Mat{N1,N2}) mul!(Transpose(C),Transpose(A),Transpose(B)) @test C' ≈ A'B' - b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 - @test minimum(b).allocs <= expected_transpose_allocs*3 - # @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) # records 80 bytes + bmark = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs <= expected_transpose_allocs*3 + @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) end # Test the three different diff --git a/test/testutil.jl b/test/testutil.jl index f3faad7c..d95f6119 100644 --- a/test/testutil.jl +++ b/test/testutil.jl @@ -104,6 +104,103 @@ macro test_was_once_broken(good_version, ex) end) end + +r""" + @allocated_barrier f(b, g(c)) + +Is effectively translated to: + + (function(b, c) + @allocated f(b, g(c)) + end)(b, c) + +The function barrier improves type stability which helps the compiler +avoid unnecessary heap allocations. + +Functions/functors are not captured as local variables by default but +they can be wrapped by prepending each function call with a $ sign. +Values can also be interpolated with a $: + + @allocated_barrier $f(b, $(g(c))) + +Which effectively translates to: + + (function(f, b, gc) + @allocated f(b, gc) + end)(f, b, g(c)) + +This is useful if `f` is a local variable or if `g(..)` causes allocations +that should be excluded. 
+ +Another approach is to wrap each call to `@allocated` with `@eval`: + + @eval @allocated f($a,g($b,$c)) + @eval @allocated $a .= $f.($b, $c) + +The number of allocated bytes reported is similar to this macro. +""" +macro allocated_barrier(ex) + captured = Dict{Any,Symbol}() + + function capture(s::Symbol) + if Base.isidentifier(s) + get!(captured, s) do + gensym(s) + end + else + s + end + end + + function capture(expr::Expr) + if expr.head == :$ + get!(captured, expr.args[1]) do + gensym(string(expr.args[1])) + end + elseif expr.head == :. && last(expr.args) isa QuoteNode + get!(captured, expr) do + gensym(join(expr.args, ".")) + end + else + # Expr(expr.head, capture.(expr.args)...) + arg1 = popfirst!(expr.args) + Expr(expr.head, + expr.head == :call && !(arg1 isa Expr && arg1.head==:$) ? arg1 : capture(arg1), + capture.(expr.args)...) + end + end + + capture(x) = x + + inner_ex = capture(ex) + + quote + (function($(values(captured)...)) + + f() = $inner_ex + Base.precompile(f, ()) + @allocated f() + + end)($(esc.(keys(captured))...)) + end +end + + +macro test_noalloc(ex) + a = :( + @allocated_barrier($ex) + ) + a.args[2] = () # tidy output + + q = :( + @test 0 == $a + ) + q.args[2] = LineNumberNode(__source__.line, __source__.file) + + esc(q) +end + + @testset "test utils" begin @testset "@testinf" begin @testinf [1,2] == [1,2] @@ -121,4 +218,22 @@ end end @test ts.errorcount == 0 && ts.failcount == 2 && ts.passcount == 0 end + + a = rand(3) + z = ones(3) + + @testset "@allocated_barrier" begin + @test @allocated_barrier(z .= a .+ z) == 0 + @test z ≈ a .+ 1 + @test @allocated_barrier(z .= a + z) > 0 + @test @allocated_barrier(z .= $(a + z)) == 0 + + @test @allocated_barrier(z .= abs.(a + z)) > 0 + @test @allocated_barrier(z .= $(abs.(a + z))) == 0 + @test @allocated_barrier(z .= abs.(a .+ z)) == 0 + end + + @testset "@test_noalloc" begin + @test_noalloc z .= abs.(a .+ z) + end end