Description
Array broadcast fusion not working as expected with CuArrays
using RecursiveArrayTools, CuArrays, CUDAnative
a = ArrayPartition(([1.0f0] |> cu,[2.0f0] |> cu,[3.0f0] |> cu))
b = ArrayPartition(([0.0f0] |> cu,[0.0f0] |> cu,[0.0f0] |> cu))
@. a + CUDAnative.pow(b, 2f0)
throws:
ERROR: LoadError: GPU compilation failed, try inspecting generated code with any of the @device_code_... macros
CompilerError: could not compile #19(CuArrays.CuKernelState, CuDeviceArray{Float32,1,CUDAnative.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}},Tuple{Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Float32}}); passing and using non-bitstype argument
- argument_type = Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}},Tuple{Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Float32}}
- argument = 4
Stacktrace:
[1] check_invocation(::CUDAnative.CompilerContext, ::LLVM.Function) at /home/jack/.julia/packages/CUDAnative/EsWDI/src/compiler/validation.jl:30
[2] #compile_function#78(::Bool, ::Function, ::CUDAnative.CompilerContext) at ./logging.jl:319
[3] compile_function at /home/jack/.julia/packages/CUDAnative/EsWDI/src/compiler/driver.jl:56 [inlined]
[4] #cufunction#77(::Base.Iterators.Pairs{Symbol,getfield(GPUArrays, Symbol("##19#20")),Tuple{Symbol},NamedTuple{(:inner_f,),Tuple{getfield(GPUArrays, Symbol("##19#20"))}}}, ::Function, ::CUDAdrv.CuDevice, ::Any, ::Any) at /home/jack/.julia/packages/CUDAnative/EsWDI/src/compiler/driver.jl:22
[5] (::getfield(CUDAnative, Symbol("#kw##cufunction")))(::NamedTuple{(:inner_f,),Tuple{getfield(GPUArrays, Symbol("##19#20"))}}, ::typeof(cufunction), ::CUDAdrv.CuDevice, ::Function, ::Type) at ./none:0
[6] macro expansion at /home/jack/.julia/packages/CUDAnative/EsWDI/src/execution.jl:219 [inlined]
[7] _cuda(::CUDAnative.KernelWrapper{getfield(GPUArrays, Symbol("##19#20"))}, ::getfield(GPUArrays, Symbol("##19#20")), ::Tuple{}, ::NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64},Tuple{Int64}}}, ::CuArrays.CuKernelState, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}},Tuple{Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}},Float32}}) at /home/jack/.julia/packages/CUDAnative/EsWDI/src/execution.jl:177
[8] macro expansion at ./gcutils.jl:87 [inlined]
[9] _gpu_call(::Function, ::CuArray{Float32,1}, ::Tuple{CuArray{Float32,1},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}},Tuple{Base.Broadcast.Extruded{CuArray{Float32,1},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuArray{Float32,1},Tuple{Bool},Tuple{Int64}},Float32}}}, ::Tuple{Tuple{Int64},Tuple{Int64}}) at /home/jack/.julia/packages/CuArrays/F96Gk/src/gpuarray_interface.jl:68
[10] gpu_call(::Function, ::CuArray{Float32,1}, ::Tuple{CuArray{Float32,1},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}},Tuple{Base.Broadcast.Extruded{CuArray{Float32,1},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuArray{Float32,1},Tuple{Bool},Tuple{Int64}},Float32}}}, ::Int64) at /home/jack/.julia/packages/GPUArrays/3E1qk/src/abstract_gpu_interface.jl:151
[11] gpu_call at /home/jack/.julia/packages/GPUArrays/3E1qk/src/abstract_gpu_interface.jl:128 [inlined]
[12] copyto! at /home/jack/.julia/packages/GPUArrays/3E1qk/src/broadcast.jl:14 [inlined]
[13] copyto! at ./broadcast.jl:768 [inlined]
[14] copy at ./broadcast.jl:744 [inlined]
[15] materialize at ./broadcast.jl:724 [inlined]
[16] broadcast(::getfield(Base.Broadcast, Symbol("##1#2")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Tuple{Base.OneTo{Int64}},typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##7#8")){Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}},getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##9#10")){getfield(Base.Broadcast, Symbol("##11#12"))}},getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##13#14")){getfield(Base.Broadcast, Symbol("##15#16"))}},getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##5#6")){getfield(Base.Broadcast, Symbol("##3#4"))}}}}}, ::CuArray{Float32,1}, ::CuArray{Float32,1}, ::Float32) at ./broadcast.jl:702
[17] materialize(::Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(+),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Base.Broadcast.Broadcasted{Base.Broadcast.ArrayStyle{ArrayPartition},Nothing,typeof(CUDAnative.pow),Tuple{ArrayPartition{Float32,Tuple{CuArray{Float32,1},CuArray{Float32,1},CuArray{Float32,1}}},Float32}}}}) at /home/jack/.julia/packages/RecursiveArrayTools/V9Xjw/src/array_partition.jl:293
[18] top-level scope at none:0
[19] include at ./boot.jl:317 [inlined]
[20] include_relative(::Module, ::String) at ./loading.jl:1038
[21] include(::Module, ::String) at ./sysimg.jl:29
[22] include(::String) at ./client.jl:388
[23] top-level scope at none:0
in expression starting at /home/jack/Dropbox/lab/Dendronotus/julia/scantest.jl:53
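For what it's worth, the same fused expression runs fine on the underlying CuArrays, which points at ArrayPartition's broadcast machinery: flattening the nested Broadcasted tree (frame [17], array_partition.jl:293) produces nested Base.Broadcast closures that are not isbits, so CUDAnative refuses to pass them to the kernel. A minimal sketch of a possible workaround, not a fix, assuming the partitions are stored in the x field of ArrayPartition (the name c below is just for illustration):

using RecursiveArrayTools, CuArrays, CUDAnative

a = ArrayPartition(([1.0f0] |> cu, [2.0f0] |> cu, [3.0f0] |> cu))
b = ArrayPartition(([0.0f0] |> cu, [0.0f0] |> cu, [0.0f0] |> cu))

# Broadcast partition-by-partition: each call compiles a plain CuArray
# broadcast kernel, whose arguments are all isbits, so it launches fine.
c = ArrayPartition(map((x, y) -> x .+ CUDAnative.pow.(y, 2f0), a.x, b.x))

This loses the single-kernel fusion across partitions (one launch per partition instead of one fused launch), but each per-partition broadcast is still fused internally.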