Open
Description
Starting with commit https://github.com/llvm/llvm-project/commit/dad9de0ae536 ,
the NVPTX backend appears to generate instructions that are only available on sm_80 and above, regardless of the target architecture. Specifically, starting from this input:
; ModuleID = 'module__max_sub_exp_dispatch_0_cuda_nvptx_fb.optimized.bc'
source_filename = "_max_sub_exp_dispatch_0"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@__dynamic_shared_memory__ = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
; Function Attrs: nounwind
; Reproducer kernel. Each thread loads four floats from %0, the threads combine
; them into one NaN-propagating maximum (vector reduce + butterfly shuffles),
; thread 0 publishes that maximum through shared memory, and every thread then
; writes a value derived from (x - max) to %1. Judging by the function name and
; the constants used, the second half looks like a polynomial exp()
; approximation -- that reading is an inference, not something the IR states.
;   %0: read-only input pointer (noalias, align 16)
;   %1: output pointer (noalias, align 16)
define void @_max_sub_exp_dispatch_0_generic_12x128x128_f32(ptr noalias readonly align 16 %0, ptr noalias align 16 %1) local_unnamed_addr #0 {
.lr.ph:
; Tell the optimizer that (ptr & 48) == 0 for both arguments.
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, 48
%4 = icmp eq i64 %3, 0
tail call void @llvm.assume(i1 %4)
%5 = ptrtoint ptr %1 to i64
%6 = and i64 %5, 48
%7 = icmp eq i64 %6, 0
tail call void @llvm.assume(i1 %7)
; Element index (in floats): ((ctaid.x << 14) + (ctaid.y << 7)) | (tid.x << 2).
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
%9 = zext i32 %8 to i64
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range !7
%11 = zext i32 %10 to i64
%12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
%13 = shl nuw nsw i32 %12, 2
%14 = zext i32 %13 to i64
%15 = shl nuw nsw i64 %9, 14
%16 = shl nuw nsw i64 %11, 7
%17 = add nuw nsw i64 %15, %16
%18 = or i64 %17, %14
%19 = getelementptr float, ptr %0, i64 %18
%20 = load <4 x float>, ptr %19, align 16
; Per-thread maximum of the four loaded lanes. This is the only "maximum"
; intrinsic in the module; in the PTX quoted later in this report it shows up
; as the three max.NaN.f32 instructions the report is about.
%21 = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %20)
tail call void @llvm.nvvm.barrier0()
; Butterfly-shuffle reduction, lane masks 1, 2, 4, 8, 16. Each step keeps the
; shuffled-in value when it is NaN (fcmp uno) or when current <= shuffled
; (fcmp ole); otherwise it keeps the current value.
%22 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %21, i32 1, i32 31)
%23 = extractvalue { float, i1 } %22, 0
%.inv = fcmp ole float %21, %23
%24 = fcmp uno float %23, 0.000000e+00
%25 = select i1 %24, i1 true, i1 %.inv
%26 = select i1 %25, float %23, float %21
; Step with lane mask 2.
%27 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %26, i32 2, i32 31)
%28 = extractvalue { float, i1 } %27, 0
%.inv1 = fcmp ole float %26, %28
%29 = fcmp uno float %28, 0.000000e+00
%30 = select i1 %29, i1 true, i1 %.inv1
%31 = select i1 %30, float %28, float %26
; Step with lane mask 4.
%32 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %31, i32 4, i32 31)
%33 = extractvalue { float, i1 } %32, 0
%.inv2 = fcmp ole float %31, %33
%34 = fcmp uno float %33, 0.000000e+00
%35 = select i1 %34, i1 true, i1 %.inv2
%36 = select i1 %35, float %33, float %31
; Step with lane mask 8.
%37 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %36, i32 8, i32 31)
%38 = extractvalue { float, i1 } %37, 0
%.inv3 = fcmp ole float %36, %38
%39 = fcmp uno float %38, 0.000000e+00
%40 = select i1 %39, i1 true, i1 %.inv3
%41 = select i1 %40, float %38, float %36
; Last shuffle (lane mask 16) is executed by every thread; its result is only
; consumed inside block %44, which runs when tid.x == 0.
%42 = tail call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 -1, float %41, i32 16, i32 31)
%43 = icmp eq i32 %12, 0
br i1 %43, label %44, label %50
44: ; preds = %.lr.ph
; Final combine step, same NaN-or-greater selection as above.
%45 = extractvalue { float, i1 } %42, 0
%46 = fcmp uno float %45, 0.000000e+00
%.inv4 = fcmp ole float %41, %45
%47 = select i1 %46, i1 true, i1 %.inv4
%48 = select i1 %47, float %45, float %41
; Lower-bound the result at 0xC7EFFFFFE0000000 (printed as 0fFF7FFFFF in the
; PTX); a NaN fails the ordered compare and is passed through unchanged.
%.inv5 = fcmp ole float %48, 0xC7EFFFFFE0000000
%49 = select i1 %.inv5, float 0xC7EFFFFFE0000000, float %48
; Publish the reduced value at the start of the addrspace(3) buffer.
store float %49, ptr addrspace(3) @__dynamic_shared_memory__, align 16
br label %50
50: ; preds = %44, %.lr.ph
; Barrier so the store above is visible before any thread reads it back.
tail call void @llvm.nvvm.barrier0()
; Same element index as %18, recomputed; the input vector is loaded again.
%51 = shl nuw nsw i64 %9, 14
%52 = shl nuw nsw i64 %11, 7
%53 = add nuw nsw i64 %51, %52
%54 = or i64 %53, %14
%55 = getelementptr float, ptr %0, i64 %54
%56 = load <4 x float>, ptr %55, align 16
; Broadcast the shared maximum to all four lanes and subtract it.
%57 = load float, ptr addrspace(3) @__dynamic_shared_memory__, align 16
%58 = insertelement <4 x float> undef, float %57, i64 0
%59 = shufflevector <4 x float> %58, <4 x float> undef, <4 x i32> zeroinitializer
%60 = fsub <4 x float> %56, %59
; Clamp (x - max) to [0xC055F33340000000, 0x4056333340000000]
; (approximately [-87.8, 88.8]).
%.inv6 = fcmp olt <4 x float> %60, <float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000>
%61 = select <4 x i1> %.inv6, <4 x float> <float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000, float 0xC055F33340000000>, <4 x float> %60
%.inv7 = fcmp ogt <4 x float> %61, <float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000>
%62 = select <4 x i1> %.inv7, <4 x float> <float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000, float 0x4056333340000000>, <4 x float> %61
; %63 = %62 * 0x3FF7154760000000 + 0.5 (the multiplier is approximately
; 1.442695, presumably log2(e)); floor is then applied lane by lane.
%63 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %62, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>)
%64 = extractelement <4 x float> %63, i64 0
%65 = tail call float @llvm.floor.f32(float %64)
%66 = insertelement <4 x float> poison, float %65, i64 0
%67 = extractelement <4 x float> %63, i64 1
%68 = tail call float @llvm.floor.f32(float %67)
%69 = insertelement <4 x float> %66, float %68, i64 1
%70 = extractelement <4 x float> %63, i64 2
%71 = tail call float @llvm.floor.f32(float %70)
%72 = insertelement <4 x float> %69, float %71, i64 2
%73 = extractelement <4 x float> %63, i64 3
%74 = tail call float @llvm.floor.f32(float %73)
%75 = insertelement <4 x float> %72, float %74, i64 3
; Clamp the floored value to [-127, 127]; call the result n (%77).
%.inv8 = fcmp olt <4 x float> %75, <float -1.270000e+02, float -1.270000e+02, float -1.270000e+02, float -1.270000e+02>
%76 = select <4 x i1> %.inv8, <4 x float> <float -1.270000e+02, float -1.270000e+02, float -1.270000e+02, float -1.270000e+02>, <4 x float> %75
%.inv9 = fcmp ogt <4 x float> %76, <float 1.270000e+02, float 1.270000e+02, float 1.270000e+02, float 1.270000e+02>
%77 = select <4 x i1> %.inv9, <4 x float> <float 1.270000e+02, float 1.270000e+02, float 1.270000e+02, float 1.270000e+02>, <4 x float> %76
; r (%79) = %62 + n * 0xBFE6300000000000 + n * 0x3F2BD01060000000, computed
; with two fmas. The first constant is -0.693359375; presumably this is a
; two-constant ln(2) range reduction -- TODO confirm against the generator.
%78 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %77, <4 x float> <float 0xBFE6300000000000, float 0xBFE6300000000000, float 0xBFE6300000000000, float 0xBFE6300000000000>, <4 x float> %62)
%79 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %77, <4 x float> <float 0x3F2BD01060000000, float 0x3F2BD01060000000, float 0x3F2BD01060000000, float 0x3F2BD01060000000>, <4 x float> %78)
; Horner-style fma chain in r ending with the constant 0.5 (%84), followed by
; %87 = %84 * r*r + r + 1.0.
%80 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %79, <4 x float> <float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000, float 0x3F2A0D2CE0000000>, <4 x float> <float 0x3F56E879C0000000, float 0x3F56E879C0000000, float 0x3F56E879C0000000, float 0x3F56E879C0000000>)
%81 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %80, <4 x float> %79, <4 x float> <float 0x3F81112100000000, float 0x3F81112100000000, float 0x3F81112100000000, float 0x3F81112100000000>)
%82 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %81, <4 x float> %79, <4 x float> <float 0x3FA5553820000000, float 0x3FA5553820000000, float 0x3FA5553820000000, float 0x3FA5553820000000>)
%83 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %82, <4 x float> %79, <4 x float> <float 0x3FC5555540000000, float 0x3FC5555540000000, float 0x3FC5555540000000, float 0x3FC5555540000000>)
%84 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %83, <4 x float> %79, <4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>)
%85 = fmul <4 x float> %79, %79
%86 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %84, <4 x float> %85, <4 x float> %79)
%87 = fadd <4 x float> %86, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; Build the scale factor from n: convert to i32, shift into bit 23 and up, and
; add 1065353216 (= 127 << 23, the bit pattern of 1.0f), then reinterpret the
; bits as float and multiply.
%88 = fptosi <4 x float> %77 to <4 x i32>
%89 = shl <4 x i32> %88, <i32 23, i32 23, i32 23, i32 23>
%90 = add <4 x i32> %89, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%91 = bitcast <4 x i32> %90 to <4 x float>
%92 = fmul <4 x float> %87, %91
; Store the four results to %1 at the same element index used for the load.
%93 = getelementptr float, ptr %1, i64 %54
store <4 x float> %92, ptr %93, align 16
tail call void @llvm.nvvm.barrier0()
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #3
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32) #4
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>) #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.floor.f32(float) #2
attributes #0 = { nounwind }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { convergent nocallback nounwind }
attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
; Named metadata: NVVM annotations for the kernel, producer ident, NVVM IR version.
!nvvm.annotations = !{!0, !1, !2, !3}
!llvm.ident = !{!4}
!nvvmir.version = !{!5}
; !0 marks the function as a kernel; !1-!3 give maxntidx/y/z = 32, 1, 1
; (emitted as ".maxntid 32, 1, 1" in the PTX quoted in this report).
!0 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"kernel", i32 1}
!1 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidx", i32 32}
!2 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidy", i32 1}
!3 = !{ptr @_max_sub_exp_dispatch_0_generic_12x128x128_f32, !"maxntidz", i32 1}
!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!5 = !{i32 2, i32 0}
; !range operands attached to the ctaid.x (!6), ctaid.y (!7) and tid.x (!8)
; reads in the kernel; each pair is a half-open interval [lo, hi).
!6 = !{i32 0, i32 2147483647}
!7 = !{i32 0, i32 65535}
!8 = !{i32 0, i32 32}
I get this PTX for all architectures (here sm_60):
//
// Generated by LLVM NVPTX Back-End
//
.version 7.6
.target sm_60
.address_size 64
// .globl _max_sub_exp_dispatch_0_generic_12x128x128_f32
.extern .shared .align 16 .b8 __dynamic_shared_memory__[];
// Backend output for the kernel in the IR above, quoted verbatim as evidence.
// The instructions this report is about are the three max.NaN.f32 right after
// the first vector load: they are emitted even though the module header says
// ".target sm_60".
.visible .entry _max_sub_exp_dispatch_0_generic_12x128x128_f32(
.param .u64 _max_sub_exp_dispatch_0_generic_12x128x128_f32_param_0,
.param .u64 _max_sub_exp_dispatch_0_generic_12x128x128_f32_param_1
)
.maxntid 32, 1, 1
{
.reg .pred %p<34>;
.reg .b32 %r<17>;
.reg .f32 %f<105>;
.reg .b64 %rd<14>;
// Load both pointer parameters and convert them to global-space addresses
// (%rd6 = input, %rd1 = output).
ld.param.u64 %rd4, [_max_sub_exp_dispatch_0_generic_12x128x128_f32_param_0];
ld.param.u64 %rd5, [_max_sub_exp_dispatch_0_generic_12x128x128_f32_param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd6, %rd4;
// Element index %rd2 = (ctaid.x * 16384 + ctaid.y * 128) | (tid.x << 2);
// %rd11 is the byte offset (index << 2) and %rd3 the input address.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ctaid.y;
mov.u32 %r3, %tid.x;
shl.b32 %r4, %r3, 2;
cvt.u64.u32 %rd7, %r4;
mul.wide.u32 %rd8, %r1, 16384;
mul.wide.u32 %rd9, %r2, 128;
add.s64 %rd10, %rd8, %rd9;
or.b64 %rd2, %rd10, %rd7;
shl.b64 %rd11, %rd2, 2;
add.s64 %rd3, %rd6, %rd11;
ld.global.nc.v4.f32 {%f2, %f3, %f4, %f5}, [%rd3];
// Lowering of llvm.vector.reduce.fmaximum.v4f32: a two-level tree of
// max.NaN.f32. These are the instructions reported as invalid for sm_60.
max.NaN.f32 %f6, %f3, %f5;
max.NaN.f32 %f7, %f2, %f4;
max.NaN.f32 %f8, %f7, %f6;
bar.sync 0;
// Butterfly-shuffle reduction, lane masks 1, 2, 4, 8, 16. Each step: take the
// shuffled value if current <= shuffled, then override with the shuffled
// value if it is NaN (setp.nan of a register with itself).
shfl.sync.bfly.b32 %f9|%p1, %f8, 1, 31, -1;
setp.le.f32 %p2, %f8, %f9;
setp.nan.f32 %p3, %f9, %f9;
selp.f32 %f10, %f9, %f8, %p2;
selp.f32 %f11, %f9, %f10, %p3;
// Lane mask 2.
shfl.sync.bfly.b32 %f12|%p4, %f11, 2, 31, -1;
setp.le.f32 %p5, %f11, %f12;
setp.nan.f32 %p6, %f12, %f12;
selp.f32 %f13, %f12, %f11, %p5;
selp.f32 %f14, %f12, %f13, %p6;
// Lane mask 4.
shfl.sync.bfly.b32 %f15|%p7, %f14, 4, 31, -1;
setp.le.f32 %p8, %f14, %f15;
setp.nan.f32 %p9, %f15, %f15;
selp.f32 %f16, %f15, %f14, %p8;
selp.f32 %f17, %f15, %f16, %p9;
// Lane mask 8.
shfl.sync.bfly.b32 %f18|%p10, %f17, 8, 31, -1;
setp.le.f32 %p11, %f17, %f18;
setp.nan.f32 %p12, %f18, %f18;
selp.f32 %f19, %f18, %f17, %p11;
selp.f32 %f20, %f18, %f19, %p12;
// Lane mask 16; the combine for this step happens only when tid.x == 0.
shfl.sync.bfly.b32 %f21|%p13, %f20, 16, 31, -1;
setp.ne.s32 %p14, %r3, 0;
@%p14 bra $L__BB0_2;
// tid.x == 0 only: final combine, lower-bound at 0fFF7FFFFF, store to shared.
setp.nan.f32 %p15, %f21, %f21;
setp.le.f32 %p16, %f20, %f21;
selp.f32 %f22, %f21, %f20, %p16;
selp.f32 %f23, %f21, %f22, %p15;
setp.le.f32 %p17, %f23, 0fFF7FFFFF;
selp.f32 %f1, 0fFF7FFFFF, %f23, %p17;
st.shared.f32 [__dynamic_shared_memory__], %f1;
$L__BB0_2:
// All threads: barrier, reload the input vector, read the shared maximum.
bar.sync 0;
ld.global.nc.v4.f32 {%f24, %f25, %f26, %f27}, [%rd3];
ld.shared.f32 %f28, [__dynamic_shared_memory__];
// x - max for each of the four lanes.
sub.rn.f32 %f29, %f24, %f28;
sub.rn.f32 %f30, %f25, %f28;
sub.rn.f32 %f31, %f26, %f28;
sub.rn.f32 %f32, %f27, %f28;
// Clamp below at 0fC2AF999A.
setp.lt.f32 %p18, %f32, 0fC2AF999A;
setp.lt.f32 %p19, %f31, 0fC2AF999A;
setp.lt.f32 %p20, %f30, 0fC2AF999A;
setp.lt.f32 %p21, %f29, 0fC2AF999A;
selp.f32 %f33, 0fC2AF999A, %f29, %p21;
selp.f32 %f34, 0fC2AF999A, %f30, %p20;
selp.f32 %f35, 0fC2AF999A, %f31, %p19;
selp.f32 %f36, 0fC2AF999A, %f32, %p18;
// Clamp above at 0f42B1999A.
setp.gt.f32 %p22, %f36, 0f42B1999A;
setp.gt.f32 %p23, %f35, 0f42B1999A;
setp.gt.f32 %p24, %f34, 0f42B1999A;
setp.gt.f32 %p25, %f33, 0f42B1999A;
selp.f32 %f37, 0f42B1999A, %f33, %p25;
selp.f32 %f38, 0f42B1999A, %f34, %p24;
selp.f32 %f39, 0f42B1999A, %f35, %p23;
selp.f32 %f40, 0f42B1999A, %f36, %p22;
// clamped * 0f3FB8AA3B + 0.5 (0f3F000000), then round toward -inf to an
// integral float (cvt.rmi), matching the llvm.floor.f32 calls in the IR.
fma.rn.f32 %f41, %f40, 0f3FB8AA3B, 0f3F000000;
fma.rn.f32 %f42, %f39, 0f3FB8AA3B, 0f3F000000;
fma.rn.f32 %f43, %f38, 0f3FB8AA3B, 0f3F000000;
fma.rn.f32 %f44, %f37, 0f3FB8AA3B, 0f3F000000;
cvt.rmi.f32.f32 %f45, %f44;
cvt.rmi.f32.f32 %f46, %f43;
cvt.rmi.f32.f32 %f47, %f42;
cvt.rmi.f32.f32 %f48, %f41;
// Clamp the floored values to [0fC2FE0000, 0f42FE0000], i.e. [-127, 127].
setp.lt.f32 %p26, %f45, 0fC2FE0000;
setp.lt.f32 %p27, %f46, 0fC2FE0000;
setp.lt.f32 %p28, %f47, 0fC2FE0000;
setp.lt.f32 %p29, %f48, 0fC2FE0000;
selp.f32 %f49, 0fC2FE0000, %f48, %p29;
selp.f32 %f50, 0fC2FE0000, %f47, %p28;
selp.f32 %f51, 0fC2FE0000, %f46, %p27;
selp.f32 %f52, 0fC2FE0000, %f45, %p26;
setp.gt.f32 %p30, %f52, 0f42FE0000;
setp.gt.f32 %p31, %f51, 0f42FE0000;
setp.gt.f32 %p32, %f50, 0f42FE0000;
setp.gt.f32 %p33, %f49, 0f42FE0000;
selp.f32 %f53, 0f42FE0000, %f49, %p33;
selp.f32 %f54, 0f42FE0000, %f50, %p32;
selp.f32 %f55, 0f42FE0000, %f51, %p31;
selp.f32 %f56, 0f42FE0000, %f52, %p30;
// Two fmas per lane subtracting multiples of the clamped integer from the
// clamped input (constants 0fBF318000 and 0f395E8083).
fma.rn.f32 %f57, %f56, 0fBF318000, %f37;
fma.rn.f32 %f58, %f55, 0fBF318000, %f38;
fma.rn.f32 %f59, %f54, 0fBF318000, %f39;
fma.rn.f32 %f60, %f53, 0fBF318000, %f40;
fma.rn.f32 %f61, %f53, 0f395E8083, %f60;
fma.rn.f32 %f62, %f54, 0f395E8083, %f59;
fma.rn.f32 %f63, %f55, 0f395E8083, %f58;
fma.rn.f32 %f64, %f56, 0f395E8083, %f57;
// Horner-style fma chain per lane, ending with the constant 0.5 (0f3F000000).
fma.rn.f32 %f65, %f64, 0f39506967, 0f3AB743CE;
fma.rn.f32 %f66, %f63, 0f39506967, 0f3AB743CE;
fma.rn.f32 %f67, %f62, 0f39506967, 0f3AB743CE;
fma.rn.f32 %f68, %f61, 0f39506967, 0f3AB743CE;
fma.rn.f32 %f69, %f68, %f61, 0f3C088908;
fma.rn.f32 %f70, %f67, %f62, 0f3C088908;
fma.rn.f32 %f71, %f66, %f63, 0f3C088908;
fma.rn.f32 %f72, %f65, %f64, 0f3C088908;
fma.rn.f32 %f73, %f72, %f64, 0f3D2AA9C1;
fma.rn.f32 %f74, %f71, %f63, 0f3D2AA9C1;
fma.rn.f32 %f75, %f70, %f62, 0f3D2AA9C1;
fma.rn.f32 %f76, %f69, %f61, 0f3D2AA9C1;
fma.rn.f32 %f77, %f76, %f61, 0f3E2AAAAA;
fma.rn.f32 %f78, %f75, %f62, 0f3E2AAAAA;
fma.rn.f32 %f79, %f74, %f63, 0f3E2AAAAA;
fma.rn.f32 %f80, %f73, %f64, 0f3E2AAAAA;
fma.rn.f32 %f81, %f80, %f64, 0f3F000000;
fma.rn.f32 %f82, %f79, %f63, 0f3F000000;
fma.rn.f32 %f83, %f78, %f62, 0f3F000000;
fma.rn.f32 %f84, %f77, %f61, 0f3F000000;
// poly * r*r + r, then + 1.0 (0f3F800000).
mul.rn.f32 %f85, %f64, %f64;
mul.rn.f32 %f86, %f63, %f63;
mul.rn.f32 %f87, %f62, %f62;
mul.rn.f32 %f88, %f61, %f61;
fma.rn.f32 %f89, %f84, %f88, %f61;
fma.rn.f32 %f90, %f83, %f87, %f62;
fma.rn.f32 %f91, %f82, %f86, %f63;
fma.rn.f32 %f92, %f81, %f85, %f64;
add.rn.f32 %f93, %f92, 0f3F800000;
add.rn.f32 %f94, %f91, 0f3F800000;
add.rn.f32 %f95, %f90, 0f3F800000;
add.rn.f32 %f96, %f89, 0f3F800000;
// Scale factor: convert the clamped integer to s32, shift left by 23, add
// 1065353216, and move the bits into an f32 register.
cvt.rzi.s32.f32 %r5, %f53;
cvt.rzi.s32.f32 %r6, %f54;
cvt.rzi.s32.f32 %r7, %f55;
cvt.rzi.s32.f32 %r8, %f56;
shl.b32 %r9, %r8, 23;
shl.b32 %r10, %r7, 23;
shl.b32 %r11, %r6, 23;
shl.b32 %r12, %r5, 23;
add.s32 %r13, %r12, 1065353216;
add.s32 %r14, %r11, 1065353216;
add.s32 %r15, %r10, 1065353216;
add.s32 %r16, %r9, 1065353216;
mov.b32 %f97, %r16;
mov.b32 %f98, %r15;
mov.b32 %f99, %r14;
mov.b32 %f100, %r13;
// Multiply by the scale factor and store the four results to the output at
// the same byte offset (%rd11) used for the input.
mul.rn.f32 %f101, %f96, %f100;
mul.rn.f32 %f102, %f95, %f99;
mul.rn.f32 %f103, %f94, %f98;
mul.rn.f32 %f104, %f93, %f97;
add.s64 %rd13, %rd1, %rd11;
st.global.v4.f32 [%rd13], {%f104, %f103, %f102, %f101};
bar.sync 0;
ret;
}
As specified here https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max, the `max.NaN.f32`
instruction is available only on sm_80 or higher.
The issue seems to be here (the linked source location was not preserved in this text).
There is no `hasSM<80>` predicate,
which is needed to guard these instructions.
I tried MaheshRavishankar@5f0385d, but that just gave me a compilation error indicating that instruction selection failed. Could someone either fix this or point me to how to fix it?