Skip to content

NVPTX backend generating instructions only available on SM80 and above by default.  #64606

Open
@MaheshRavishankar

Description

@MaheshRavishankar

Starting with https://github.com/llvm/llvm-project/commit/dad9de0ae536, it seems that the NVPTX backend is generating instructions that are only available on SM80 and above. Specifically, starting from this input:

; ModuleID = 'module__max_sub_exp_dispatch_0_cuda_nvptx_fb.optimized.bc'
source_filename = "_max_sub_exp_dispatch_0"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

@__dynamic_shared_memory__ = external local_unnamed_addr addrspace(3) global [0 x i8], align 16

; Function Attrs: nounwind
; Reproducer kernel for the issue. Per thread it:
;   1. loads 4 floats from %0 and reduces them with llvm.vector.reduce.fmaximum,
;   2. combines that value across lanes with butterfly shuffles (offsets 1..16),
;   3. has the thread with tid.x == 0 store the result to shared memory,
;   4. reloads the same 4 floats, subtracts the shared value, evaluates what
;      looks like a polynomial exp() approximation (NOTE(review): inferred from
;      the constants and the function name), and stores 4 floats to %1.
define void @_max_sub_exp_dispatch_0_generic_12x128x128_f32(ptr noalias readonly align 16 %0, ptr noalias align 16 %1) local_unnamed_addr #0 {
.lr.ph:
  ; Tell the optimizer that (ptr & 48) == 0 for both pointer arguments.
  %2 = ptrtoint ptr %0 to i64
  %3 = and i64 %2, 48
  %4 = icmp eq i64 %3, 0
  tail call void @llvm.assume(i1 %4)
  %5 = ptrtoint ptr %1 to i64
  %6 = and i64 %5, 48
  %7 = icmp eq i64 %6, 0
  tail call void @llvm.assume(i1 %7)
  ; Element index %18 = ((ctaid.x << 14) + (ctaid.y << 7)) | (tid.x << 2).
  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
  %9 = zext i32 %8 to i64
  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range !7
  %11 = zext i32 %10 to i64
  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
  %13 = shl nuw nsw i32 %12, 2
  %14 = zext i32 %13 to i64
  %15 = shl nuw nsw i64 %9, 14
  %16 = shl nuw nsw i64 %11, 7
  %17 = add nuw nsw i64 %15, %16
  %18 = or i64 %17, %14
  %19 = getelementptr float, ptr %0, i64 %18
  %20 = load <4 x float>, ptr %19, align 16
  ; This fmaximum reduction is the operation that appears as max.NaN.f32 in the
  ; PTX quoted in the issue.
  %21 = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %20)
  tail call void @llvm.nvvm.barrier0()
  ; Butterfly shuffle step, lane offset 1. After each shuffle the shuffled value
  ; is selected when it is NaN (fcmp uno) or when current <= shuffled; otherwise
  ; the current value is kept.
  %22 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %21, i32 1, i32 31)
  %23 = extractvalue { float, i1 } %22, 0
  %.inv = fcmp ole float %21, %23
  %24 = fcmp uno float %23, 0.000000e+00
  %25 = select i1 %24, i1 true, i1 %.inv
  %26 = select i1 %25, float %23, float %21
  ; Lane offset 2.
  %27 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %26, i32 2, i32 31)
  %28 = extractvalue { float, i1 } %27, 0
  %.inv1 = fcmp ole float %26, %28
  %29 = fcmp uno float %28, 0.000000e+00
  %30 = select i1 %29, i1 true, i1 %.inv1
  %31 = select i1 %30, float %28, float %26
  ; Lane offset 4.
  %32 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %31, i32 4, i32 31)
  %33 = extractvalue { float, i1 } %32, 0
  %.inv2 = fcmp ole float %31, %33
  %34 = fcmp uno float %33, 0.000000e+00
  %35 = select i1 %34, i1 true, i1 %.inv2
  %36 = select i1 %35, float %33, float %31
  ; Lane offset 8.
  %37 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %36, i32 8, i32 31)
  %38 = extractvalue { float, i1 } %37, 0
  %.inv3 = fcmp ole float %36, %38
  %39 = fcmp uno float %38, 0.000000e+00
  %40 = select i1 %39, i1 true, i1 %.inv3
  %41 = select i1 %40, float %38, float %36
  ; Lane offset 16. The shuffle is executed by every thread, but its result is
  ; only consumed in block 44, i.e. by the thread with tid.x == 0.
  %42 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %41, i32 16, i32 31)
  %43 = icmp eq i32 %12, 0
  br i1 %43, label %44, label %50

44:                                               ; preds = %.lr.ph
  ; Final combine with the offset-16 shuffle result (same selection rule as above).
  %45 = extractvalue { float, i1 } %42, 0
  %46 = fcmp uno float %45, 0.000000e+00
  %.inv4 = fcmp ole float %41, %45
  %47 = select i1 %46, i1 true, i1 %.inv4
  %48 = select i1 %47, float %45, float %41
  ; Values <= 0xC7EFFFFFE0000000 (the most negative finite float; printed as
  ; 0fFF7FFFFF in the PTX) are replaced by that constant.
  %.inv5 = fcmp ole float %48, 0xC7EFFFFFE0000000
  %49 = select i1 %.inv5, float 0xC7EFFFFFE0000000, float %48
  ; Publish the result at the start of the dynamic shared-memory buffer.
  store float %49, ptr addrspace(3) @__dynamic_shared_memory__, align 16
  br label %50

50:                                               ; preds = %44, %.lr.ph
  ; Barrier placed between the shared-memory store in block 44 and the load below.
  tail call void @llvm.nvvm.barrier0()
  ; %51..%54 recompute the same element index as %15..%18.
  %51 = shl nuw nsw i64 %9, 14
  %52 = shl nuw nsw i64 %11, 7
  %53 = add nuw nsw i64 %51, %52
  %54 = or i64 %53, %14
  %55 = getelementptr float, ptr %0, i64 %54
  %56 = load <4 x float>, ptr %55, align 16
  ; Broadcast the shared scalar to all 4 lanes and subtract it from the input.
  %57 = load float, ptr addrspace(3) @__dynamic_shared_memory__, align 16
  %58 = insertelement <4 x float> undef, float %57, i64 0
  %59 = shufflevector <4 x float> %58, <4 x float> undef, <4 x i32> zeroinitializer
  %60 = fsub <4 x float> %56, %59
  ; Clamp to [0xC055F33340000000, 0x4056333340000000] (about [-87.8, 88.8]).
  ; The compares are ordered, so a NaN lane is passed through unchanged.
  %.inv6 = fcmp olt <4 x float> %60, <float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000>
  %61 = select <4 x i1> %.inv6, <4 x float> <float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000>, <4 x float> %60
  %.inv7 = fcmp ogt <4 x float> %61, <float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000>
  %62 = select <4 x i1> %.inv7, <4 x float> <float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000>, <4 x float> %61
  ; n = floor(x * 0x3FF7154760000000 + 0.5), computed per lane
  ; (the multiplier is about 1.4427; presumably log2(e) — TODO confirm).
  %63 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %62, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>)
  %64 = extractelement <4 x float> %63, i64 0
  %65 = tail call float @llvm.floor.f32(float %64)
  %66 = insertelement <4 x float> poison, float %65, i64 0
  %67 = extractelement <4 x float> %63, i64 1
  %68 = tail call float @llvm.floor.f32(float %67)
  %69 = insertelement <4 x float> %66, float %68, i64 1
  %70 = extractelement <4 x float> %63, i64 2
  %71 = tail call float @llvm.floor.f32(float %70)
  %72 = insertelement <4 x float> %69, float %71, i64 2
  %73 = extractelement <4 x float> %63, i64 3
  %74 = tail call float @llvm.floor.f32(float %73)
  %75 = insertelement <4 x float> %72, float %74, i64 3
  ; Clamp n to [-127, 127].
  %.inv8 = fcmp olt <4 x float> %75, <float -1.270000e+02, float -1.270000e+02, float -1.270000e+02, float -1.270000e+02>
  %76 = select <4 x i1> %.inv8, <4 x float> <float -1.270000e+02, float -1.270000e+02, float -1.270000e+02, float -1.270000e+02>, <4 x float> %75
  %.inv9 = fcmp ogt <4 x float> %76, <float 1.270000e+02, float 1.270000e+02, float 1.270000e+02, float 1.270000e+02>
  %77 = select <4 x i1> %.inv9, <4 x float> <float 1.270000e+02, float 1.270000e+02, float 1.270000e+02, float 1.270000e+02>, <4 x float> %76
  ; r = x + n * 0xBFE6300000000000 + n * 0x3F2BD01060000000, as two fused steps
  ; (looks like a two-constant reduction by n*ln(2) — TODO confirm).
  %78 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %77, <4 x float> <float 0xBFE6300000000000, float 0xBFE6300000000000, float 0xBFE6300000000000, float 0xBFE6300000000000>, <4 x float> %62)
  %79 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %77, <4 x float> <float 0x3F2BD01060000000, float 0x3F2BD01060000000, float 0x3F2BD01060000000, float 0x3F2BD01060000000>, <4 x float> %78)
  ; Horner evaluation of a degree-5 polynomial p(r) via chained fma.
  %80 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %79, <4 x float> <float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000>, <4 x float> <float 0x3F56E879C0000000, float 0x3F56E879C0000000, float 0x3F56E879C0000000, float 0x3F56E879C0000000>)
  %81 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %80, <4 x float> %79, <4 x float> <float 0x3F81112100000000, float 0x3F81112100000000, float 0x3F81112100000000, float 0x3F81112100000000>)
  %82 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %81, <4 x float> %79, <4 x float> <float 0x3FA5553820000000, float 0x3FA5553820000000, float 0x3FA5553820000000, float 0x3FA5553820000000>)
  %83 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %82, <4 x float> %79, <4 x float> <float 0x3FC5555540000000, float 0x3FC5555540000000, float 0x3FC5555540000000, float 0x3FC5555540000000>)
  %84 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %83, <4 x float> %79, <4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>)
  ; %87 = p(r) * r^2 + r + 1.
  %85 = fmul <4 x float> %79, %79
  %86 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %84, <4 x float> %85, <4 x float> %79)
  %87 = fadd <4 x float> %86, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  ; Build the scale factor directly from bits: (int(n) << 23) + 0x3F800000
  ; (1065353216) reinterpreted as float, i.e. n is added to the exponent field
  ; of 1.0f, which yields 2^n for normal exponents.
  %88 = fptosi <4 x float> %77 to <4 x i32>
  %89 = shl <4 x i32> %88, <i32 23, i32 23, i32 23, i32 23>
  %90 = add <4 x i32> %89, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %91 = bitcast <4 x i32> %90 to <4 x float>
  %92 = fmul <4 x float> %87, %91
  ; Store the 4 results to %1 at the same element index used for the load.
  %93 = getelementptr float, ptr %1, i64 %54
  store <4 x float> %92, ptr %93, align 16
  tail call void @llvm.nvvm.barrier0()
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #3

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32) #4

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>) #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.floor.f32(float) #2

attributes #0 = { nounwind }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { convergent nocallback nounwind }
attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }

!nvvm.annotations = !{!0, !1, !2, !3}
!llvm.ident = !{!4}
!nvvmir.version = !{!5}

!0 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"kernel", i32 1}
!1 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidx", i32 32}
!2 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidy", i32 1}
!3 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidz", i32 1}
!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!5 = !{i32 2, i32 0}
!6 = !{i32 0, i32 2147483647}
!7 = !{i32 0, i32 65535}
!8 = !{i32 0, i32 32}

I get this PTX for all architectures (here sm_60):

//                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
// Generated by LLVM NVPTX Back-End                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
//                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

.version 7.6
.target sm_60
.address_size 64

        // .globl       _max_sub_exp_dispatch_0_generic_12x128x128_f32                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
.extern .shared .align 16 .b8 __dynamic_shared_memory__[];

// PTX emitted by the NVPTX backend for the IR kernel quoted earlier in the
// issue. The three max.NaN.f32 instructions near the top of the body are the
// ones the issue is about: they are emitted even though the module header
// says ".target sm_60".
.visible .entry _max_sub_exp_dispatch_0_generic_12x128x128_f32(
        .param .u64 _max_sub_exp_dispatch_0_generic_12x128x128_f32_param_0,
        .param .u64 _max_sub_exp_dispatch_0_generic_12x128x128_f32_param_1
)
.maxntid 32, 1, 1
{
        .reg .pred      %p<34>;
        .reg .b32       %r<17>;
        .reg .f32       %f<105>;
        .reg .b64       %rd<14>;

        // Load both pointer parameters and convert them to global addresses
        // (%rd6 = input, %rd1 = output).
        ld.param.u64    %rd4, [_max_sub_exp_dispatch_0_generic_12x128x128_f32_param_0];
        ld.param.u64    %rd5, [_max_sub_exp_dispatch_0_generic_12x128x128_f32_param_1];
        cvta.to.global.u64      %rd1, %rd5;
        cvta.to.global.u64      %rd6, %rd4;
        // Element index %rd2 = (ctaid.x * 16384 + ctaid.y * 128) | (tid.x << 2);
        // %rd11 is that index scaled by 4 (shl 2) and %rd3 the input address.
        mov.u32         %r1, %ctaid.x;
        mov.u32         %r2, %ctaid.y;
        mov.u32         %r3, %tid.x;
        shl.b32         %r4, %r3, 2;
        cvt.u64.u32     %rd7, %r4;
        mul.wide.u32    %rd8, %r1, 16384;
        mul.wide.u32    %rd9, %r2, 128;
        add.s64         %rd10, %rd8, %rd9;
        or.b64          %rd2, %rd10, %rd7;
        shl.b64         %rd11, %rd2, 2;
        add.s64         %rd3, %rd6, %rd11;
        ld.global.nc.v4.f32     {%f2, %f3, %f4, %f5}, [%rd3];
        // Reduction of the 4 loaded floats into %f8. These max.NaN.f32
        // instructions are the subject of the issue.
        max.NaN.f32     %f6, %f3, %f5;
        max.NaN.f32     %f7, %f2, %f4;
        max.NaN.f32     %f8, %f7, %f6;
        bar.sync        0;
        // Butterfly shuffle, lane offset 1. After each shuffle the shuffled
        // value is selected when it is NaN or when current <= shuffled;
        // otherwise the current value is kept.
        shfl.sync.bfly.b32      %f9|%p1, %f8, 1, 31, -1;
        setp.le.f32     %p2, %f8, %f9;
        setp.nan.f32    %p3, %f9, %f9;
        selp.f32        %f10, %f9, %f8, %p2;
        selp.f32        %f11, %f9, %f10, %p3;
        // Lane offset 2.
        shfl.sync.bfly.b32      %f12|%p4, %f11, 2, 31, -1;
        setp.le.f32     %p5, %f11, %f12;
        setp.nan.f32    %p6, %f12, %f12;
        selp.f32        %f13, %f12, %f11, %p5;
        selp.f32        %f14, %f12, %f13, %p6;
        // Lane offset 4.
        shfl.sync.bfly.b32      %f15|%p7, %f14, 4, 31, -1;
        setp.le.f32     %p8, %f14, %f15;
        setp.nan.f32    %p9, %f15, %f15;
        selp.f32        %f16, %f15, %f14, %p8;
        selp.f32        %f17, %f15, %f16, %p9;
        // Lane offset 8.
        shfl.sync.bfly.b32      %f18|%p10, %f17, 8, 31, -1;
        setp.le.f32     %p11, %f17, %f18;
        setp.nan.f32    %p12, %f18, %f18;
        selp.f32        %f19, %f18, %f17, %p11;
        selp.f32        %f20, %f18, %f19, %p12;
        // Lane offset 16; the result is only combined by the thread with tid.x == 0.
        shfl.sync.bfly.b32      %f21|%p13, %f20, 16, 31, -1;
        setp.ne.s32     %p14, %r3, 0;
        // Threads with tid.x != 0 skip the final combine and shared-memory store.
        @%p14 bra       $L__BB0_2;
        setp.nan.f32    %p15, %f21, %f21;
        setp.le.f32     %p16, %f20, %f21;
        selp.f32        %f22, %f21, %f20, %p16;
        selp.f32        %f23, %f21, %f22, %p15;
        // Values <= 0fFF7FFFFF (the most negative finite float) are replaced
        // by that constant before being stored to shared memory.
        setp.le.f32     %p17, %f23, 0fFF7FFFFF;
        selp.f32        %f1, 0fFF7FFFFF, %f23, %p17;
        st.shared.f32   [__dynamic_shared_memory__], %f1;
$L__BB0_2:
        // Barrier between the shared-memory store above and the load below.
        bar.sync        0;
        // Reload the same 4 input floats and subtract the shared value (%f28).
        ld.global.nc.v4.f32     {%f24, %f25, %f26, %f27}, [%rd3];
        ld.shared.f32   %f28, [__dynamic_shared_memory__];
        sub.rn.f32      %f29, %f24, %f28;
        sub.rn.f32      %f30, %f25, %f28;
        sub.rn.f32      %f31, %f26, %f28;
        sub.rn.f32      %f32, %f27, %f28;
        // Lower clamp at 0fC2AF999A (about -87.8).
        setp.lt.f32     %p18, %f32, 0fC2AF999A;
        setp.lt.f32     %p19, %f31, 0fC2AF999A;
        setp.lt.f32     %p20, %f30, 0fC2AF999A;
        setp.lt.f32     %p21, %f29, 0fC2AF999A;
        selp.f32        %f33, 0fC2AF999A, %f29, %p21;
        selp.f32        %f34, 0fC2AF999A, %f30, %p20;
        selp.f32        %f35, 0fC2AF999A, %f31, %p19;
        selp.f32        %f36, 0fC2AF999A, %f32, %p18;
        // Upper clamp at 0f42B1999A (about 88.8).
        setp.gt.f32     %p22, %f36, 0f42B1999A;
        setp.gt.f32     %p23, %f35, 0f42B1999A;
        setp.gt.f32     %p24, %f34, 0f42B1999A;
        setp.gt.f32     %p25, %f33, 0f42B1999A;
        selp.f32        %f37, 0f42B1999A, %f33, %p25;
        selp.f32        %f38, 0f42B1999A, %f34, %p24;
        selp.f32        %f39, 0f42B1999A, %f35, %p23;
        selp.f32        %f40, 0f42B1999A, %f36, %p22;
        // n = floor(x * 0f3FB8AA3B + 0.5); cvt.rmi rounds toward minus infinity.
        fma.rn.f32      %f41, %f40, 0f3FB8AA3B, 0f3F000000;
        fma.rn.f32      %f42, %f39, 0f3FB8AA3B, 0f3F000000;
        fma.rn.f32      %f43, %f38, 0f3FB8AA3B, 0f3F000000;
        fma.rn.f32      %f44, %f37, 0f3FB8AA3B, 0f3F000000;
        cvt.rmi.f32.f32         %f45, %f44;
        cvt.rmi.f32.f32         %f46, %f43;
        cvt.rmi.f32.f32         %f47, %f42;
        cvt.rmi.f32.f32         %f48, %f41;
        // Clamp n to [-127, 127] (0fC2FE0000 / 0f42FE0000).
        setp.lt.f32     %p26, %f45, 0fC2FE0000;
        setp.lt.f32     %p27, %f46, 0fC2FE0000;
        setp.lt.f32     %p28, %f47, 0fC2FE0000;
        setp.lt.f32     %p29, %f48, 0fC2FE0000;
        selp.f32        %f49, 0fC2FE0000, %f48, %p29;
        selp.f32        %f50, 0fC2FE0000, %f47, %p28;
        selp.f32        %f51, 0fC2FE0000, %f46, %p27;
        selp.f32        %f52, 0fC2FE0000, %f45, %p26;
        setp.gt.f32     %p30, %f52, 0f42FE0000;
        setp.gt.f32     %p31, %f51, 0f42FE0000;
        setp.gt.f32     %p32, %f50, 0f42FE0000;
        setp.gt.f32     %p33, %f49, 0f42FE0000;
        selp.f32        %f53, 0f42FE0000, %f49, %p33;
        selp.f32        %f54, 0f42FE0000, %f50, %p32;
        selp.f32        %f55, 0f42FE0000, %f51, %p31;
        selp.f32        %f56, 0f42FE0000, %f52, %p30;
        // r = x + n * 0fBF318000 + n * 0f395E8083 (two fused steps per lane).
        fma.rn.f32      %f57, %f56, 0fBF318000, %f37;
        fma.rn.f32      %f58, %f55, 0fBF318000, %f38;
        fma.rn.f32      %f59, %f54, 0fBF318000, %f39;
        fma.rn.f32      %f60, %f53, 0fBF318000, %f40;
        fma.rn.f32      %f61, %f53, 0f395E8083, %f60;
        fma.rn.f32      %f62, %f54, 0f395E8083, %f59;
        fma.rn.f32      %f63, %f55, 0f395E8083, %f58;
        fma.rn.f32      %f64, %f56, 0f395E8083, %f57;
        // Horner evaluation of a degree-5 polynomial p(r), one fma per coefficient.
        fma.rn.f32      %f65, %f64, 0f39506967, 0f3AB743CE;
        fma.rn.f32      %f66, %f63, 0f39506967, 0f3AB743CE;
        fma.rn.f32      %f67, %f62, 0f39506967, 0f3AB743CE;
        fma.rn.f32      %f68, %f61, 0f39506967, 0f3AB743CE;
        fma.rn.f32      %f69, %f68, %f61, 0f3C088908;
        fma.rn.f32      %f70, %f67, %f62, 0f3C088908;
        fma.rn.f32      %f71, %f66, %f63, 0f3C088908;
        fma.rn.f32      %f72, %f65, %f64, 0f3C088908;
        fma.rn.f32      %f73, %f72, %f64, 0f3D2AA9C1;
        fma.rn.f32      %f74, %f71, %f63, 0f3D2AA9C1;
        fma.rn.f32      %f75, %f70, %f62, 0f3D2AA9C1;
        fma.rn.f32      %f76, %f69, %f61, 0f3D2AA9C1;
        fma.rn.f32      %f77, %f76, %f61, 0f3E2AAAAA;
        fma.rn.f32      %f78, %f75, %f62, 0f3E2AAAAA;
        fma.rn.f32      %f79, %f74, %f63, 0f3E2AAAAA;
        fma.rn.f32      %f80, %f73, %f64, 0f3E2AAAAA;
        fma.rn.f32      %f81, %f80, %f64, 0f3F000000;
        fma.rn.f32      %f82, %f79, %f63, 0f3F000000;
        fma.rn.f32      %f83, %f78, %f62, 0f3F000000;
        fma.rn.f32      %f84, %f77, %f61, 0f3F000000;
        // p(r) * r^2 + r + 1.
        mul.rn.f32      %f85, %f64, %f64;
        mul.rn.f32      %f86, %f63, %f63;
        mul.rn.f32      %f87, %f62, %f62;
        mul.rn.f32      %f88, %f61, %f61;
        fma.rn.f32      %f89, %f84, %f88, %f61;
        fma.rn.f32      %f90, %f83, %f87, %f62;
        fma.rn.f32      %f91, %f82, %f86, %f63;
        fma.rn.f32      %f92, %f81, %f85, %f64;
        add.rn.f32      %f93, %f92, 0f3F800000;
        add.rn.f32      %f94, %f91, 0f3F800000;
        add.rn.f32      %f95, %f90, 0f3F800000;
        add.rn.f32      %f96, %f89, 0f3F800000;
        // Scale factor built from bits: (int(n) << 23) + 1065353216 (0x3F800000),
        // reinterpreted as f32 via mov.b32.
        cvt.rzi.s32.f32         %r5, %f53;
        cvt.rzi.s32.f32         %r6, %f54;
        cvt.rzi.s32.f32         %r7, %f55;
        cvt.rzi.s32.f32         %r8, %f56;
        shl.b32         %r9, %r8, 23;
        shl.b32         %r10, %r7, 23;
        shl.b32         %r11, %r6, 23;
        shl.b32         %r12, %r5, 23;
        add.s32         %r13, %r12, 1065353216;
        add.s32         %r14, %r11, 1065353216;
        add.s32         %r15, %r10, 1065353216;
        add.s32         %r16, %r9, 1065353216;
        mov.b32         %f97, %r16;
        mov.b32         %f98, %r15;
        mov.b32         %f99, %r14;
        mov.b32         %f100, %r13;
        mul.rn.f32      %f101, %f96, %f100;
        mul.rn.f32      %f102, %f95, %f99;
        mul.rn.f32      %f103, %f94, %f98;
        mul.rn.f32      %f104, %f93, %f97;
        // Store the 4 results to the output buffer at the same byte offset (%rd11).
        add.s64         %rd13, %rd1, %rd11;
        st.global.v4.f32        [%rd13], {%f104, %f103, %f102, %f101};
        bar.sync        0;
        ret;

}

As specified here https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max, the max.NaN.f32 instruction is available only on sm_80 or higher.

The issue seems to be here

defm FMAXNAN : F3<"max.NaN", fmaximum>;
There is no hasSM<80> predicate guarding these definitions, and one is needed.

I tried MaheshRavishankar@5f0385d, and that just gave me a compilation error indicating that instruction selection failed. I am wondering whether someone could either fix this or point me to how to fix it.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions