Skip to content

reinterpret of a tuple fails on GPUs #712

@simeonschaub

Description

@simeonschaub

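Calling reinterpret on a tuple generates GPU-incompatible code: the IR below contains dynamic runtime calls (ijl_apply_generic, ijl_invoke, jl_f_tuple) and exception paths instead of a plain bit-level conversion: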

julia> foo(x,y) = reinterpret(UInt64, (x,y))
foo (generic function with 1 method)

julia> OpenCL.code_llvm(foo, (Float32, Float32))
;  @ REPL[3]:1 within `foo`
define i64 @julia_foo_81905(float %"x::Float32", float %"y::Float32") local_unnamed_addr {
top:
  %jlcallframe10 = alloca [9 x ptr], align 8
  %"new::Tuple" = alloca [2 x float], align 4
  store float %"x::Float32", ptr %"new::Tuple", align 4
  %0 = getelementptr inbounds i8, ptr %"new::Tuple", i64 4
  store float %"y::Float32", ptr %0, align 4
; ┌ @ essentials.jl:736 within `reinterpret`
; │┌ @ reinterpretarray.jl:857 within `_reinterpret`
    %1 = call fastcc nonnull ptr @julia_packedsize_81937()
; ││ @ reinterpretarray.jl:859 within `_reinterpret`
    store ptr %1, ptr %jlcallframe10, align 8
    %2 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 1
    store ptr inttoptr (i64 139924789764000 to ptr), ptr %2, align 8
    %3 = call nonnull ptr @ijl_apply_generic(ptr inttoptr (i64 139924866051216 to ptr), ptr nonnull %jlcallframe10, i32 2)
    %.tag_addr = getelementptr inbounds i64, ptr %3, i64 -1
    %.tag = load atomic i64, ptr %.tag_addr unordered, align 8
    %4 = and i64 %.tag, -16
    %5 = inttoptr i64 %4 to ptr
    %exactly_isa = icmp eq ptr %5, inttoptr (i64 192 to ptr)
    br i1 %exactly_isa, label %pass, label %fail

L5:                                               ; preds = %pass
; ││ @ reinterpretarray.jl:862 within `_reinterpret`
; ││┌ @ refpointer.jl:147 within `Ref`
; │││┌ @ refvalue.jl:8 within `RefValue`
      %"new::RefValue.sroa.0.0.copyload" = load i64, ptr %"new::Tuple", align 1
; ││└└
; ││ @ reinterpretarray.jl:864 within `_reinterpret`
; ││┌ @ reinterpretarray.jl:811 within `struct_subpadding`
     %6 = load ptr, ptr addrspacecast (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @jl_small_typeof, i64 320) to ptr), align 8
     store ptr %6, ptr %jlcallframe10, align 8
     %7 = call nonnull ptr @ijl_invoke(ptr inttoptr (i64 139924884548304 to ptr), ptr nonnull %jlcallframe10, i32 1, ptr inttoptr (i64 139924885003056 to ptr))
; │││┌ @ reinterpretarray.jl:755 within `padding`
      store ptr inttoptr (i64 139924922404224 to ptr), ptr %jlcallframe10, align 8
      store ptr inttoptr (i64 139924789760416 to ptr), ptr %2, align 8
      %8 = call nonnull ptr @ijl_invoke(ptr inttoptr (i64 139924884548304 to ptr), ptr nonnull %jlcallframe10, i32 2, ptr inttoptr (i64 139924884994992 to ptr))
; │││└
     %9 = call fastcc i8 @julia____81955(ptr %7, ptr %8)
; ││└
    %10 = and i8 %9, 1
    %.not = icmp eq i8 %10, 0
    br i1 %.not, label %L22, label %L31

L22:                                              ; preds = %L5
; ││ @ reinterpretarray.jl:874 within `_reinterpret`
    %11 = call fastcc i64 @julia__reinterpret_padding_81925(ptr nocapture readonly %"new::Tuple")
    br label %L31

L24:                                              ; preds = %pass
    %12 = load ptr, ptr addrspacecast (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @jl_small_typeof, i64 320) to ptr), align 8
    store ptr inttoptr (i64 139924981334912 to ptr), ptr %jlcallframe10, align 8
    store ptr %12, ptr %2, align 8
    %13 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 2
    store ptr inttoptr (i64 139924954237824 to ptr), ptr %13, align 8
    %14 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 3
    store ptr inttoptr (i64 139924922404224 to ptr), ptr %14, align 8
    %15 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 4
    store ptr inttoptr (i64 139924981334864 to ptr), ptr %15, align 8
    %16 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 5
    store ptr inttoptr (i64 139924789764000 to ptr), ptr %16, align 8
    %17 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 6
    store ptr inttoptr (i64 139924954237824 to ptr), ptr %17, align 8
    %18 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 7
    store ptr %1, ptr %18, align 8
    %19 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 8
    store ptr inttoptr (i64 139924981334832 to ptr), ptr %19, align 8
    %jl_f_tuple_ret = call nonnull ptr @jl_f_tuple(ptr null, ptr nonnull %jlcallframe10, i32 9)
; ││ @ reinterpretarray.jl:859 within `_reinterpret`
    call fastcc void @gpu_report_exception()
    call fastcc void @gpu_signal_exception()
    call void @llvm.trap()
    unreachable

L31:                                              ; preds = %L22, %L5
    %value_phi = phi i64 [ %11, %L22 ], [ %"new::RefValue.sroa.0.0.copyload", %L5 ]
; │└
   ret i64 %value_phi

fail:                                             ; preds = %top
; │┌ @ reinterpretarray.jl:859 within `_reinterpret`
    call fastcc void @gpu_report_exception()
    call fastcc void @gpu_signal_exception()
    call void @llvm.trap()
    unreachable

pass:                                             ; preds = %top
    %jl_false = load ptr, ptr addrspace(1) @jl_false, align 8
    %20 = icmp eq ptr %3, %jl_false
    br i1 %20, label %L24, label %L5
; └└
}

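By contrast, the same function compiles to good code on the CPU (two bitcasts combined with a zero-extend, shift, and or, with no runtime calls):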

julia> code_llvm(foo, (Float32, Float32))
; Function Signature: foo(Float32, Float32)
;  @ REPL[3]:1 within `foo`
define i64 @julia_foo_85335(float %"x::Float32", float %"y::Float32") #0 {
top:
; ┌ @ essentials.jl:736 within `reinterpret`
; │┌ @ reinterpretarray.jl:862 within `_reinterpret`
; ││┌ @ refpointer.jl:147 within `Ref`
; │││┌ @ refvalue.jl:8 within `RefValue`
      %0 = bitcast float %"x::Float32" to i32
      %1 = bitcast float %"y::Float32" to i32
; ││└└
; ││ @ reinterpretarray.jl:871 within `_reinterpret`
; ││┌ @ refvalue.jl:59 within `getindex`
; │││┌ @ Base_compiler.jl:54 within `getproperty`
      %"new::RefValue3.sroa.5.0.insert.ext10" = zext i32 %1 to i64
      %"new::RefValue3.sroa.5.0.insert.shift11" = shl nuw i64 %"new::RefValue3.sroa.5.0.insert.ext10", 32
      %"new::RefValue3.sroa.0.0.insert.ext7" = zext i32 %0 to i64
      %"new::RefValue3.sroa.0.0.insert.insert9" = or disjoint i64 %"new::RefValue3.sroa.5.0.insert.shift11", %"new::RefValue3.sroa.0.0.insert.ext7"
      ret i64 %"new::RefValue3.sroa.0.0.insert.insert9"
; └└└└
}

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions