Opening an issue to track a 1.11 regression seen in the GPU stack (JuliaGPU/CUDA.jl#2240, JuliaGPU/GPUCompiler.jl#506), involving recursive functions. Such functions are common; `typejoin`, for example, is recursive, and it ends up breaking certain kwarg calls:
```llvm
; ││┌ @ iterators.jl:279 within `pairs`
; │││┌ @ essentials.jl:388 within `Pairs`
; ││││┌ @ namedtuple.jl:236 within `eltype`
; │││││┌ @ namedtuple.jl:238 within `nteltype`
; ││││││┌ @ tuple.jl:203 within `eltype`
; │││││││┌ @ tuple.jl:223 within `_compute_eltype`
; ││││││││┌ @ promotion.jl:174 within `promote_typejoin`
  %8 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 256) to {}**), align 8
  %gc_slot_addr_1 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 1)
  store {}* %8, {}** %gc_slot_addr_1, align 8
  %9 = call fastcc nonnull {}* @julia_typejoin_54511({}* readonly %8, {}* readonly inttoptr (i64 130372983896064 to {}*))
; │││││││││ @ promotion.jl:175 within `promote_typejoin`
  %10 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 64) to {}**), align 8
  %gc_slot_addr_2 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 2)
  store {}* %10, {}** %gc_slot_addr_2, align 8
  %gc_slot_addr_0 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 0)
  store {}* %9, {}** %gc_slot_addr_0, align 8
  store {}* %10, {}** %jlcallframe35.sub, align 8
  %11 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 1
  store {}* %8, {}** %11, align 8
  %12 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 2
  store {}* inttoptr (i64 130372983896064 to {}*), {}** %12, align 8
  %13 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 3
  store {}* %9, {}** %13, align 8
  %14 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %jlcallframe35.sub, i32 4)
```
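For context, a minimal sketch (not part of the original report) of the path this backtrace walks: keyword arguments are wrapped in a `Base.Pairs`, whose element type is computed from the NamedTuple's field types through `Base.promote_typejoin`, which in turn calls the recursive `typejoin`:

```julia
# Sketch only: the eltype/promote_typejoin chain visible in the IR above.
nt = (; foo=Ref(42), bar=1)   # the keyword arguments from the MWE below
kw = pairs(nt)                # kwarg calls wrap the NamedTuple in a Base.Pairs
eltype(typeof(nt))            # computed via _compute_eltype / promote_typejoin
Base.promote_typejoin(Base.RefValue{Int64}, Int64)  # which calls the recursive typejoin
```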
MWE without the GPU stack:

```julia
const CC = Core.Compiler
using Core: MethodInstance, CodeInstance, CodeInfo, MethodTable
## code instance cache
struct CodeCache
    dict::IdDict{MethodInstance,Vector{CodeInstance}}

    CodeCache() = new(IdDict{MethodInstance,Vector{CodeInstance}}())
end
function CC.setindex!(cache::CodeCache, ci::CodeInstance, mi::MethodInstance)
    cis = get!(cache.dict, mi, CodeInstance[])
    push!(cis, ci)
end
## world view of the cache
function CC.haskey(wvc::CC.WorldView{CodeCache}, mi::MethodInstance)
    CC.get(wvc, mi, nothing) !== nothing
end
function CC.get(wvc::CC.WorldView{CodeCache}, mi::MethodInstance, default)
    # check the cache
    for ci in get!(wvc.cache.dict, mi, CodeInstance[])
        if ci.min_world <= wvc.worlds.min_world && wvc.worlds.max_world <= ci.max_world
            # TODO: if (code && (code == jl_nothing || jl_ir_flag_inferred((jl_array_t*)code)))
            src = if ci.inferred isa Vector{UInt8}
                ccall(:jl_uncompress_ir, Any, (Any, Ptr{Cvoid}, Any),
                      mi.def, C_NULL, ci.inferred)
            else
                ci.inferred
            end
            return ci
        end
    end

    return default
end
function CC.getindex(wvc::CC.WorldView{CodeCache}, mi::MethodInstance)
    r = CC.get(wvc, mi, nothing)
    r === nothing && throw(KeyError(mi))
    return r::CodeInstance
end
function CC.setindex!(wvc::CC.WorldView{CodeCache}, ci::CodeInstance, mi::MethodInstance)
    src = if ci.inferred isa Vector{UInt8}
        ccall(:jl_uncompress_ir, Any, (Any, Ptr{Cvoid}, Any),
              mi.def, C_NULL, ci.inferred)
    else
        ci.inferred
    end
    CC.setindex!(wvc.cache, ci, mi)
end
## interpreter
if isdefined(CC, :CachedMethodTable)
    const ExternalMethodTableView = CC.CachedMethodTable{CC.OverlayMethodTable}
    get_method_table_view(world::UInt, mt::MethodTable) =
        CC.CachedMethodTable(CC.OverlayMethodTable(world, mt))
else
    const ExternalMethodTableView = CC.OverlayMethodTable
    get_method_table_view(world::UInt, mt::MethodTable) = CC.OverlayMethodTable(world, mt)
end
struct ExternalInterpreter <: CC.AbstractInterpreter
    world::UInt
    method_table::ExternalMethodTableView

    code_cache
    inf_cache::Vector{CC.InferenceResult}
end
function ExternalInterpreter(world::UInt=Base.get_world_counter(); method_table, code_cache)
    @assert world <= Base.get_world_counter()
    method_table = get_method_table_view(world, method_table)
    inf_cache = Vector{CC.InferenceResult}()

    return ExternalInterpreter(world, method_table, code_cache, inf_cache)
end
CC.InferenceParams(interp::ExternalInterpreter) = CC.InferenceParams()
CC.OptimizationParams(interp::ExternalInterpreter) = CC.OptimizationParams()
CC.get_world_counter(interp::ExternalInterpreter) = interp.world
CC.get_inference_cache(interp::ExternalInterpreter) = interp.inf_cache
CC.code_cache(interp::ExternalInterpreter) = CC.WorldView(interp.code_cache, interp.world)
# No need to do any locking since we're not putting our results into the runtime cache
CC.lock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing
CC.unlock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing
function CC.add_remark!(interp::ExternalInterpreter, sv::CC.InferenceState, msg)
    @debug "Inference remark during External compilation of $(sv.linfo): $msg"
end
CC.may_optimize(interp::ExternalInterpreter) = true
CC.may_compress(interp::ExternalInterpreter) = true
CC.may_discard_trees(interp::ExternalInterpreter) = true
CC.verbose_stmt_info(interp::ExternalInterpreter) = false
CC.method_table(interp::ExternalInterpreter) = interp.method_table
# main
Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)
inner(f, types::Type, args...; kwargs...) = nothing
outer(f) = @inline inner(f, Tuple{}; foo=Ref(42), bar=1)
function main()
    println("Native:")
    display(Base.code_ircode(outer, Tuple{Nothing}))
    println()

    println("External:")
    interp = ExternalInterpreter(; method_table=GLOBAL_METHOD_TABLE, code_cache=CodeCache())
    display(Base.code_ircode(outer, Tuple{Nothing}; interp))

    return
end
isinteractive() || main()
```

This prints:

```
Native:
1-element Vector{Any}:
115 1 ─ return nothing │
=> Nothing
External:
1-element Vector{Any}:
115 1 ─ %1 = invoke Base.typejoin(Int64::Any, Base.RefValue{Int64}::Any)::Any
│ Core.apply_type(Base.Union, Int64, Base.RefValue{Int64}, %1)::Type
└── return nothing │
=> Nothing
```
Bisected to #51092; cc @vtjnash. There's a solution proposed by @aviatesk in #51092 (comment).