
Inference regression with external interpreters + recursive functions #52938

Closed

Description

@maleadt

Opening an issue to track a 1.11 regression seen in the GPU stack (JuliaGPU/CUDA.jl#2240, JuliaGPU/GPUCompiler.jl#506), involving recursive functions compiled through an external AbstractInterpreter. Such functions are common; typejoin, for example, is recursive, and the regression breaks certain kwarg calls:

; ││┌ @ iterators.jl:279 within `pairs`
; │││┌ @ essentials.jl:388 within `Pairs`
; ││││┌ @ namedtuple.jl:236 within `eltype`
; │││││┌ @ namedtuple.jl:238 within `nteltype`
; ││││││┌ @ tuple.jl:203 within `eltype`
; │││││││┌ @ tuple.jl:223 within `_compute_eltype`
; ││││││││┌ @ promotion.jl:174 within `promote_typejoin`
           %8 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 256) to {}**), align 8
           %gc_slot_addr_1 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 1)
           store {}* %8, {}** %gc_slot_addr_1, align 8
           %9 = call fastcc nonnull {}* @julia_typejoin_54511({}* readonly %8, {}* readonly inttoptr (i64 130372983896064 to {}*))
; │││││││││ @ promotion.jl:175 within `promote_typejoin`
           %10 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 64) to {}**), align 8
           %gc_slot_addr_2 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 2)
           store {}* %10, {}** %gc_slot_addr_2, align 8
           %gc_slot_addr_0 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 0)
           store {}* %9, {}** %gc_slot_addr_0, align 8
           store {}* %10, {}** %jlcallframe35.sub, align 8
           %11 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 1
           store {}* %8, {}** %11, align 8
           %12 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 2
           store {}* inttoptr (i64 130372983896064 to {}*), {}** %12, align 8
           %13 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe35, i64 0, i64 3
           store {}* %9, {}** %13, align 8
           %14 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %jlcallframe35.sub, i32 4)
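
For context, a rough sketch (not from the original report) of why a kwarg call ends up in typejoin: keyword-argument lowering wraps the kwargs in a Base.Pairs, and when the keyword values have different types, computing that wrapper's eltype goes through promote_typejoin and hence the recursive typejoin, matching the call chain in the IR above:

kw = (foo = Ref(42), bar = 1)                    # keyword arguments as a NamedTuple
p = pairs(kw)                                    # Base.Pairs wrapper; its eltype needs a typejoin
eltype(p)                                        # Pair{Symbol, Any}
Base.promote_typejoin(Base.RefValue{Int}, Int)   # Any; typejoin recurses up the supertype chain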

MWE without the GPU stack:

const CC = Core.Compiler
using Core: MethodInstance, CodeInstance, CodeInfo, MethodTable


## code instance cache

struct CodeCache
    dict::IdDict{MethodInstance,Vector{CodeInstance}}

    CodeCache() = new(IdDict{MethodInstance,Vector{CodeInstance}}())
end

function CC.setindex!(cache::CodeCache, ci::CodeInstance, mi::MethodInstance)
    cis = get!(cache.dict, mi, CodeInstance[])
    push!(cis, ci)
end


## world view of the cache

function CC.haskey(wvc::CC.WorldView{CodeCache}, mi::MethodInstance)
    CC.get(wvc, mi, nothing) !== nothing
end

function CC.get(wvc::CC.WorldView{CodeCache}, mi::MethodInstance, default)
    # check the cache
    for ci in get!(wvc.cache.dict, mi, CodeInstance[])
        if ci.min_world <= wvc.worlds.min_world && wvc.worlds.max_world <= ci.max_world
            # TODO: if (code && (code == jl_nothing || jl_ir_flag_inferred((jl_array_t*)code)))
            src = if ci.inferred isa Vector{UInt8}
                ccall(:jl_uncompress_ir, Any, (Any, Ptr{Cvoid}, Any),
                       mi.def, C_NULL, ci.inferred)
            else
                ci.inferred
            end
            return ci
        end
    end

    return default
end

function CC.getindex(wvc::CC.WorldView{CodeCache}, mi::MethodInstance)
    r = CC.get(wvc, mi, nothing)
    r === nothing && throw(KeyError(mi))
    return r::CodeInstance
end

function CC.setindex!(wvc::CC.WorldView{CodeCache}, ci::CodeInstance, mi::MethodInstance)
    src = if ci.inferred isa Vector{UInt8}
        ccall(:jl_uncompress_ir, Any, (Any, Ptr{Cvoid}, Any),
              mi.def, C_NULL, ci.inferred)
    else
        ci.inferred
    end
    CC.setindex!(wvc.cache, ci, mi)
end


## interpreter

if isdefined(CC, :CachedMethodTable)
    const ExternalMethodTableView = CC.CachedMethodTable{CC.OverlayMethodTable}
    get_method_table_view(world::UInt, mt::MethodTable) =
        CC.CachedMethodTable(CC.OverlayMethodTable(world, mt))
else
    const ExternalMethodTableView = CC.OverlayMethodTable
    get_method_table_view(world::UInt, mt::MethodTable) = CC.OverlayMethodTable(world, mt)
end

# minimal external AbstractInterpreter carrying its own code cache and (overlay) method table
struct ExternalInterpreter <: CC.AbstractInterpreter
    world::UInt
    method_table::ExternalMethodTableView

    code_cache
    inf_cache::Vector{CC.InferenceResult}
end

function ExternalInterpreter(world::UInt=Base.get_world_counter(); method_table, code_cache)
    @assert world <= Base.get_world_counter()
    method_table = get_method_table_view(world, method_table)
    inf_cache = Vector{CC.InferenceResult}()

    return ExternalInterpreter(world, method_table, code_cache, inf_cache)
end

CC.InferenceParams(interp::ExternalInterpreter) = CC.InferenceParams()
CC.OptimizationParams(interp::ExternalInterpreter) = CC.OptimizationParams()
CC.get_world_counter(interp::ExternalInterpreter) = interp.world
CC.get_inference_cache(interp::ExternalInterpreter) = interp.inf_cache
CC.code_cache(interp::ExternalInterpreter) = CC.WorldView(interp.code_cache, interp.world)

# No need to do any locking since we're not putting our results into the runtime cache
CC.lock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing
CC.unlock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing

function CC.add_remark!(interp::ExternalInterpreter, sv::CC.InferenceState, msg)
    @debug "Inference remark during External compilation of $(sv.linfo): $msg"
end

CC.may_optimize(interp::ExternalInterpreter) = true
CC.may_compress(interp::ExternalInterpreter) = true
CC.may_discard_trees(interp::ExternalInterpreter) = true
CC.verbose_stmt_info(interp::ExternalInterpreter) = false
CC.method_table(interp::ExternalInterpreter) = interp.method_table


# main

Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)

# kwarg call with keyword values of different types; computing the kwargs'
# eltype exercises the recursive typejoin shown in the IR above
inner(f, types::Type, args...; kwargs...) = nothing
outer(f) = @inline inner(f, Tuple{}; foo=Ref(42), bar=1)

function main()
    println("Native:")
    display(Base.code_ircode(outer, Tuple{Nothing}))

    println()

    println("External:")
    interp = ExternalInterpreter(; method_table=GLOBAL_METHOD_TABLE, code_cache=CodeCache())
    display(Base.code_ircode(outer, Tuple{Nothing}; interp))

    return
end

isinteractive() || main()
Output:

Native:
1-element Vector{Any}:
115 1 ─     return nothing                                                  │
     => Nothing

External:
1-element Vector{Any}:
115 1 ─ %1 = invoke Base.typejoin(Int64::Any, Base.RefValue{Int64}::Any)::Any
    │        Core.apply_type(Base.Union, Int64, Base.RefValue{Int64}, %1)::Type
    └──      return nothing                                  │
     => Nothing

With the external interpreter, the typejoin call is left as an uninferred invoke returning Any, so the kwarg machinery cannot be folded away as it is natively. Bisected to #51092; cc @vtjnash. A solution has been proposed by @aviatesk in #51092 (comment).


Labels

gpu (Affects running Julia on a GPU), regression (Regression in behavior compared to a previous version)
