diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index a5daf36cfac72..9a985e46e22da 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
     case 'l':
     case 'f':
     case 'd':
+    case 'q':
       Info.setAllowsRegister();
       return true;
     }
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 211fee5f008a0..e2c47204e628f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5381,6 +5381,7 @@ NVPTX:
 - ``c`` or ``h``: A 16-bit integer register.
 - ``r``: A 32-bit integer register.
 - ``l`` or ``N``: A 64-bit integer register.
+- ``q``: A 128-bit integer register.
 - ``f``: A 32-bit float register.
 - ``d``: A 64-bit float register.
 
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b7a20c351f5ff..380d878c1f532 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -60,6 +60,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const {
   case 6:
     OS << "%fd";
     break;
+  case 7:
+    OS << "%rq";
+    break;
   }
 
   unsigned VReg = Reg.id() & 0x0FFFFFFF;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ca077d41d36ba..1645261d74d06 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -315,6 +315,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
       Ret = (5 << 28);
     } else if (RC == &NVPTX::Float64RegsRegClass) {
       Ret = (6 << 28);
+    } else if (RC == &NVPTX::Int128RegsRegClass) {
+      Ret = (7 << 28);
     } else {
       report_fatal_error("Bad register class");
     }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..11193c11ede3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -519,6 +519,20 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryConstantFP(N))
       return;
     break;
+  case ISD::CopyToReg: {
+    if (N->getOperand(1).getValueType() == MVT::i128) {
+      SelectV2I64toI128(N);
+      return;
+    }
+    break;
+  }
+  case ISD::CopyFromReg: {
+    if (N->getOperand(1).getValueType() == MVT::i128) {
+      SelectI128toV2I64(N);
+      return;
+    }
+    break;
+  }
   default:
     break;
   }
@@ -3798,6 +3812,60 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
   return true;
 }
 
+void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
+  // Lower a CopyToReg with two 64-bit inputs
+  // Dst:i128, lo:i64, hi:i64
+  //
+  // CopyToReg Dst, lo, hi;
+  //
+  // ==>
+  //
+  // tmp = V2I64toI128 {lo, hi};
+  // CopyToReg Dst, tmp;
+  SDValue Dst = N->getOperand(1);
+  SDValue Lo = N->getOperand(2);
+  SDValue Hi = N->getOperand(3);
+
+  SDLoc DL(N);
+  SDNode *Mov =
+      CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
+
+  SmallVector<SDValue> NewOps(N->getNumOperands() - 1);
+  NewOps[0] = N->getOperand(0);
+  NewOps[1] = Dst;
+  NewOps[2] = SDValue(Mov, 0);
+  if (N->getNumOperands() == 5)
+    NewOps[3] = N->getOperand(4);
+  SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL,
+                                     SmallVector<EVT>(N->values()), NewOps);
+
+  ReplaceNode(N, NewValue.getNode());
+}
+
+void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
+  // Lower a CopyFromReg from a 128-bit reg to two 64-bit regs
+  // Dst:i128, Src:i128
+  //
+  // {lo, hi} = CopyFromReg Src
+  //
+  // ==>
+  //
+  // {lo, hi} = I128toV2I64 Src
+  //
+  SDValue Ch = N->getOperand(0);
+  SDValue Src = N->getOperand(1);
+  SDValue Glue = N->getOperand(2);
+  SDLoc DL(N);
+
+  // Add Glue and Ch to the operands and results to avoid breaking the
+  // execution order.
+  SDNode *Mov = CurDAG->getMachineNode(
+      NVPTX::I128toV2I64, DL,
+      {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
+      {Src, Ch, Glue});
+
+  ReplaceNode(N, Mov);
+}
+
 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
 /// conversion from \p SrcTy to \p DestTy.
 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c5524351f2ff9..49626d4051485 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -74,7 +74,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool SelectSETP_F16X2(SDNode *N);
   bool SelectSETP_BF16X2(SDNode *N);
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
-
+  void SelectV2I64toI128(SDNode *N);
+  void SelectI128toV2I64(SDNode *N);
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 476a532db0a37..26c16ee9fd18f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -859,6 +859,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
   }
 
+  // Custom lowering for inline asm with 128-bit operands
+  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
+  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
+
   // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
   // No FPOW or FREM in PTX.
 
@@ -2804,6 +2808,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVectorArith(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::CopyToReg:
+    return LowerCopyToReg_128(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
@@ -3094,6 +3100,54 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
   return Result;
 }
 
+SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
+  // operand so that it can pass legalization.
+
+  assert(Op.getOperand(1).getValueType() == MVT::i128 &&
+         "Custom lowering for 128-bit CopyToReg only");
+
+  SDNode *Node = Op.getNode();
+  SDLoc DL(Node);
+
+  SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
+                           DAG.getIntPtrConstant(0, DL));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
+                           DAG.getIntPtrConstant(1, DL));
+
+  SmallVector<SDValue> NewOps(Op->getNumOperands() + 1);
+  SmallVector<EVT> ResultsType(Node->values());
+
+  NewOps[0] = Op->getOperand(0); // Chain
+  NewOps[1] = Op->getOperand(1); // Dst Reg
+  NewOps[2] = Lo;                // Lower 64-bit
+  NewOps[3] = Hi;                // Higher 64-bit
+  if (Op.getNumOperands() == 4)
+    NewOps[4] = Op->getOperand(3); // Glue, if present
+
+  return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
+}
+
+unsigned NVPTXTargetLowering::getNumRegisters(
+    LLVMContext &Context, EVT VT,
+    std::optional<MVT> RegisterVT = std::nullopt) const {
+  if (VT == MVT::i128 && RegisterVT == MVT::i128)
+    return 1;
+  return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
+}
+
+bool NVPTXTargetLowering::splitValueIntoRegisterParts(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
+  if (Val.getValueType() == MVT::i128 && NumParts == 1) {
+    Parts[0] = Val;
+    return true;
+  }
+  return false;
+}
+
 // This creates target external symbol for a function parameter.
 // Name of the symbol is composed from its index and the function name.
 // Negative index corresponds to special parameter (unsized array) used for
@@ -5150,6 +5204,7 @@ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
     case 'l':
     case 'f':
     case 'd':
+    case 'q':
    case '0':
     case 'N':
       return C_RegisterClass;
@@ -5175,6 +5230,12 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case 'l':
     case 'N':
       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+    case 'q': {
+      if (STI.getSmVersion() < 70)
+        report_fatal_error("Inline asm with 128-bit operands is only "
+                           "supported for sm_70 and higher!");
+      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
+    }
     case 'f':
       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
     case 'd':
@@ -6261,6 +6322,30 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
   }
 }
 
+static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &Results) {
+  // Change the CopyFromReg to output two 64-bit results instead of a 128-bit
+  // result so that it can pass legalization.
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Reg = N->getOperand(1);
+  SDValue Glue = N->getOperand(2);
+
+  assert(Reg.getValueType() == MVT::i128 &&
+         "Custom lowering for CopyFromReg with 128-bit reg only");
+  SmallVector<EVT> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
+                                  N->getValueType(2)};
+  SmallVector<SDValue> NewOps = {Chain, Reg, Glue};
+
+  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
+                             {NewValue.getValue(0), NewValue.getValue(1)});
+
+  Results.push_back(Pair);
+  Results.push_back(NewValue.getValue(2));
+  Results.push_back(NewValue.getValue(3));
+}
+
 void NVPTXTargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
@@ -6272,6 +6357,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   case ISD::INTRINSIC_W_CHAIN:
     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
     return;
+  case ISD::CopyFromReg:
+    ReplaceCopyFromReg_128(N, DAG, Results);
+    return;
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e211286fcc556..63262961b363e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -640,6 +640,14 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerCopyToReg_128(SDValue Op, SelectionDAG &DAG) const;
+  unsigned getNumRegisters(LLVMContext &Context, EVT VT,
+                           std::optional<MVT> RegisterVT) const override;
+  bool
+  splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+                              SDValue *Parts, unsigned NumParts, MVT PartVT,
+                              std::optional<CallingConv::ID> CC) const override;
+
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index b0d792b5ee3fe..673858f92e7ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -51,6 +51,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   } else if (DestRC == &NVPTX::Int64RegsRegClass) {
     Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
                                              : NVPTX::BITCONVERT_64_F2I);
+  } else if (DestRC == &NVPTX::Int128RegsRegClass) {
+    Op = NVPTX::IMOV128rr;
   } else if (DestRC == &NVPTX::Float32RegsRegClass) {
     Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
                                                : NVPTX::BITCONVERT_32_I2F);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c4c35a1f74ba9..827febe845a4c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2097,6 +2097,8 @@ let IsSimpleMove=1, hasSideEffects=0 in {
                            "mov.u32 \t$dst, $sss;", []>;
   def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                            "mov.u64 \t$dst, $sss;", []>;
+  def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
+                            "mov.b128 \t$dst, $sss;", []>;
 
   def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                             "mov.b16 \t$dst, $sss;", []>;
@@ -3545,6 +3547,9 @@ let hasSideEffects = false in {
   def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
                              (ins Int32Regs:$s1, Int32Regs:$s2),
                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
+  def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
+                              (ins Int64Regs:$s1, Int64Regs:$s2),
+                              "mov.b128 \t$d, {{$s1, $s2}};", []>;
   def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
                              (ins Float32Regs:$s1, Float32Regs:$s2),
                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
@@ -3560,6 +3565,9 @@ let hasSideEffects = false in {
   def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                              (ins Int64Regs:$s),
                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+  def I128toV2I64 : NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
+                              (ins Int128Regs:$s),
+                              "mov.b128 \t{{$d1, $d2}}, $s;", []>;
   def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                              (ins Float64Regs:$s),
                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
@@ -3629,7 +3637,7 @@ def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
 // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
 // truncating back down to 32 bits.
 def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
+def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
 
 // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
 // result back to 16-bits if necessary.  We also need to subtract 16 because
@@ -3667,7 +3675,7 @@ def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
 // pattern that avoids the type conversion if we're truncating the result to
 // i32 anyway.
 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
+def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>;
 
 // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
 // If we know that we're storing into an i32, we can avoid the final trunc.
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index f1213f030bba7..a8a23f04c1249 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -31,6 +31,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
     return ".f32";
   if (RC == &NVPTX::Float64RegsRegClass)
     return ".f64";
+  if (RC == &NVPTX::Int128RegsRegClass)
+    return ".b128";
   if (RC == &NVPTX::Int64RegsRegClass)
     // We use untyped (.b) integer registers here as NVCC does.
     // Correctness of generated code does not depend on register type,
@@ -67,6 +69,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
     return "%f";
   if (RC == &NVPTX::Float64RegsRegClass)
     return "%fd";
+  if (RC == &NVPTX::Int128RegsRegClass)
+    return "%rq";
   if (RC == &NVPTX::Int64RegsRegClass)
     return "%rd";
   if (RC == &NVPTX::Int32RegsRegClass)
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index b5231a9cf67f9..2011f0f7e328f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -37,6 +37,7 @@ foreach i = 0...4 in {
   def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
   def R#i : NVPTXReg<"%r"#i>; // 32-bit
   def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+  def RQ#i : NVPTXReg<"%rq"#i>; // 128-bit
   def H#i : NVPTXReg<"%h"#i>; // 16-bit float
   def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
   def F#i : NVPTXReg<"%f"#i>; // 32-bit float
@@ -62,6 +63,8 @@ def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
                               (add (sequence "R%u", 0, 4),
                               VRFrame32, VRFrameLocal32)>;
 def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
+// 128-bit regs are not defined as general regs in NVPTX; they are used for inline asm only.
+def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>; def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll new file mode 100644 index 0000000000000..3232f40a40a70 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} + +target triple = "nvptx64-nvidia-cuda" + +@value = internal addrspace(1) global i128 0, align 16 + +define void @test_b128_input_from_const() { +; CHECK-LABEL: test_b128_input_from_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u64 %rd2, 0; +; CHECK-NEXT: mov.u64 %rd3, 42; +; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2}; +; CHECK-NEXT: mov.u32 %r1, value; +; CHECK-NEXT: cvta.global.u32 %r2, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; + + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) + ret void +} + +define void @test_b128_input_from_load(ptr nocapture readonly %data) { +; CHECK-LABEL: test_b128_input_from_load( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_load_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u64 %rd2, [%r2+8]; +; CHECK-NEXT: ld.global.u64 %rd3, [%r2]; +; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2}; +; CHECK-NEXT: mov.u32 %r3, value; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; + + %1 = addrspacecast ptr %data to ptr addrspace(1) + %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 + %3 = bitcast <2 x i64> %2 to i128 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) + ret void +} + +define void @test_b128_input_from_select(ptr nocapture readonly %flag) { +; CHECK-LABEL: test_b128_input_from_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_select_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u8 %rs1, [%r2]; +; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; +; CHECK-NEXT: selp.b64 %rd2, 24, 42, %p1; +; CHECK-NEXT: mov.u64 %rd3, 0; +; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3}; +; CHECK-NEXT: mov.u32 %r3, value; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { 
st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; + + %1 = addrspacecast ptr %flag to ptr addrspace(1) + %2 = load i8, ptr addrspace(1) %1, align 1 + %3 = icmp eq i8 %2, 0 + %4 = select i1 %3, i128 24, i128 42 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %4) + ret void +} + +define void @test_store_b128_output() { +; CHECK-LABEL: test_store_b128_output( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { mov.b128 %rq1, 41; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, %rq1; +; CHECK-NEXT: add.cc.s64 %rd3, %rd1, 1; +; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, 0; +; CHECK-NEXT: st.global.u64 [value+8], %rd4; +; CHECK-NEXT: st.global.u64 [value], %rd3; +; CHECK-NEXT: ret; + + %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() + %add = add nsw i128 %1, 1 + %2 = bitcast i128 %add to <2 x i64> + store <2 x i64> %2, ptr addrspace(1) @value, align 16 + ret void +} + +define void @test_use_of_b128_output(ptr nocapture readonly %data) { +; CHECK-LABEL: test_use_of_b128_output( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_use_of_b128_output_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u64 %rd1, [%r2+8]; +; CHECK-NEXT: ld.global.u64 %rd2, [%r2]; +; CHECK-NEXT: mov.b128 %rq2, {%rd2, %rd1}; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { mov.b128 %rq1, %rq2; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd3, %rd4}, %rq1; +; CHECK-NEXT: add.cc.s64 %rd5, %rd3, 1; +; CHECK-NEXT: addc.cc.s64 %rd6, %rd4, 0; +; CHECK-NEXT: st.global.u64 [value], %rd5; +; CHECK-NEXT: st.global.u64 [value+8], %rd6; +; CHECK-NEXT: ret; + + %1 = addrspacecast ptr %data to ptr addrspace(1) + %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 + %3 = bitcast <2 x i64> %2 to i128 + %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) + %add = add nsw i128 %4, 1 + %5 = bitcast i128 %add to <2 x i64> + store <2 x i64> %5, ptr addrspace(1) @value, align 16 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll new file mode 100644 index 0000000000000..3d1d7fbbe27e8 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} + +target triple = "nvptx64-nvidia-cuda" + +@u128_max = internal addrspace(1) global i128 0, align 16 +@u128_zero = internal addrspace(1) global i128 0, align 16 +@i128_max = internal addrspace(1) global i128 0, align 16 +@i128_min = internal addrspace(1) global i128 0, align 16 +@v_u128_max = internal addrspace(1) global i128 0, align 16 +@v_u128_zero = internal addrspace(1) global i128 0, align 16 +@v_i128_max = internal addrspace(1) global i128 0, align 16 +@v_i128_min = internal addrspace(1) global i128 0, align 16 +@v64 = internal addrspace(1) global ptr null, align 8 + +define void @test_corner_values() { +; CHECK-LABEL: test_corner_values( +; CHECK: { +; CHECK-NEXT: .reg .b32 
%r<20>; +; CHECK-NEXT: .reg .b64 %rd<17>; +; CHECK-NEXT: .reg .b128 %rq<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.u32 %r1, [v64]; +; CHECK-NEXT: add.s32 %r2, %r1, 8; +; CHECK-NEXT: mov.u64 %rd13, -1; +; CHECK-NEXT: mov.b128 %rq1, {%rd13, %rd13}; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r2; +; CHECK-NEXT: mov.u32 %r3, v_u128_max; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd3, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; +; CHECK-NEXT: st.b64 [%rd1], lo; +; CHECK-NEXT: st.b64 [%rd2], hi; +; CHECK-NEXT: st.b128 [%rd3], %rq1; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r5, [v64]; +; CHECK-NEXT: add.s32 %r6, %r5, 16; +; CHECK-NEXT: add.s32 %r7, %r5, 24; +; CHECK-NEXT: mov.u64 %rd14, 9223372036854775807; +; CHECK-NEXT: mov.b128 %rq2, {%rd13, %rd14}; +; CHECK-NEXT: mov.u32 %r8, v_i128_max; +; CHECK-NEXT: cvta.global.u32 %r9, %r8; +; CHECK-NEXT: cvt.u64.u32 %rd6, %r9; +; CHECK-NEXT: cvt.u64.u32 %rd4, %r6; +; CHECK-NEXT: cvt.u64.u32 %rd5, %r7; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq2; +; CHECK-NEXT: st.b64 [%rd4], lo; +; CHECK-NEXT: st.b64 [%rd5], hi; +; CHECK-NEXT: st.b128 [%rd6], %rq2; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r10, [v64]; +; CHECK-NEXT: add.s32 %r11, %r10, 32; +; CHECK-NEXT: add.s32 %r12, %r10, 40; +; CHECK-NEXT: mov.u64 %rd15, -9223372036854775808; +; CHECK-NEXT: mov.u64 %rd16, 0; +; CHECK-NEXT: mov.b128 %rq3, {%rd16, %rd15}; +; CHECK-NEXT: mov.u32 %r13, v_i128_min; +; CHECK-NEXT: cvta.global.u32 %r14, %r13; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r14; +; CHECK-NEXT: cvt.u64.u32 %rd7, %r11; +; CHECK-NEXT: cvt.u64.u32 %rd8, %r12; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq3; +; CHECK-NEXT: st.b64 [%rd7], lo; +; CHECK-NEXT: st.b64 [%rd8], hi; +; CHECK-NEXT: st.b128 [%rd9], %rq3; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r15, [v64]; +; CHECK-NEXT: add.s32 %r16, %r15, 48; +; CHECK-NEXT: add.s32 %r17, %r15, 56; +; CHECK-NEXT: mov.b128 %rq4, {%rd16, %rd16}; +; CHECK-NEXT: mov.u32 %r18, v_u128_zero; +; CHECK-NEXT: cvta.global.u32 %r19, %r18; +; CHECK-NEXT: cvt.u64.u32 %rd12, %r19; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r16; +; CHECK-NEXT: cvt.u64.u32 %rd11, %r17; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq4; +; CHECK-NEXT: st.b64 [%rd10], lo; +; CHECK-NEXT: st.b64 [%rd11], hi; +; CHECK-NEXT: st.b128 [%rd12], %rq4; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; + + %1 = load ptr, ptr addrspace(1) @v64, align 8 + %2 = getelementptr inbounds i64, ptr %1, i64 1 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %1, ptr nonnull %2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) + %3 = load ptr, ptr addrspace(1) @v64, align 8 + %4 = getelementptr inbounds i64, ptr %3, i64 2 + %5 = getelementptr inbounds i64, ptr %3, i64 3 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 
lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %4, ptr nonnull %5, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) + %6 = load ptr, ptr addrspace(1) @v64, align 8 + %7 = getelementptr inbounds i64, ptr %6, i64 4 + %8 = getelementptr inbounds i64, ptr %6, i64 5 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %7, ptr nonnull %8, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) + %9 = load ptr, ptr addrspace(1) @v64, align 8 + %10 = getelementptr inbounds i64, ptr %9, i64 6 + %11 = getelementptr inbounds i64, ptr %9, i64 7 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %10, ptr nonnull %11, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll new file mode 100644 index 0000000000000..ae453977123e0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} + +target triple = "nvptx64-nvidia-cuda" + +@size = internal addrspace(1) global i32 0, align 4 +@x = internal addrspace(1) global i128 0, align 16 + +define void @test_b128_in_loop() { +; CHECK-LABEL: test_b128_in_loop( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b64 %rd<15>; +; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.s32 %rd1, [size]; +; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; +; CHECK-NEXT: @%p1 bra $L__BB0_3; +; CHECK-NEXT: // %bb.1: // %BB1 +; CHECK-NEXT: ld.global.u64 %rd13, [x+8]; +; CHECK-NEXT: ld.global.u64 %rd12, [x]; +; CHECK-NEXT: mov.u64 %rd14, 0; +; CHECK-NEXT: $L__BB0_2: // %BB2 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; +; CHECK-NEXT: add.cc.u64 lo, lo, %rd14; +; CHECK-NEXT: mov.b128 %rq1, {lo, hi}; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1; +; CHECK-NEXT: st.global.u64 [x+8], %rd13; +; CHECK-NEXT: st.global.u64 [x], %rd12; +; CHECK-NEXT: add.s64 %rd14, %rd14, 1; +; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14; +; CHECK-NEXT: @%p2 bra $L__BB0_2; +; CHECK-NEXT: $L__BB0_3: // %BB3 +; CHECK-NEXT: ret; + + %1 = load i32, ptr addrspace(1) @size, align 4 + %2 = icmp eq i32 %1, 0 + br i1 %2, label %BB3, label %BB1 + +BB1: ; preds = %0 + %3 = load i128, ptr addrspace(1) @x, align 16 + %4 = sext i32 %1 to i64 + br label %BB2 + +BB2: ; preds = %BB2, %BB1 + %5 = phi i128 [ %7, %BB2 ], [ %3, %BB1 ] + %6 = phi i64 [ %9, %BB2 ], [ 0, %BB1 ] + %7 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 
$1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %6, i128 %5) + %8 = bitcast i128 %7 to <2 x i64> + store <2 x i64> %8, ptr addrspace(1) @x, align 16 + %9 = add nuw i64 %6, 1 + %10 = icmp eq i64 %9, %4 + br i1 %10, label %BB3, label %BB2 + +BB3: ; preds = %BB2, %0 + ret void +}
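
---

For context, a minimal CUDA-side sketch of what this patch enables once clang accepts the `q` constraint. This is illustrative, not part of the patch: the function name is hypothetical, and the PTX body is just one way to consume a `.b128` register; per the backend change it requires sm_70 or newer (the tests above use `-mcpu=sm_70 -mattr=+ptx83`).

    // Hypothetical example: add two unsigned __int128 values, passing them
    // through .b128 registers via the new "q" constraint.
    __device__ unsigned __int128 add_u128(unsigned __int128 a,
                                          unsigned __int128 b) {
      unsigned __int128 r;
      asm("{\n\t"
          ".reg .b64 alo, ahi, blo, bhi, rlo, rhi;\n\t"
          "mov.b128 {alo, ahi}, %1;\n\t"  // unpack 128-bit inputs into 64-bit halves
          "mov.b128 {blo, bhi}, %2;\n\t"
          "add.cc.u64 rlo, alo, blo;\n\t" // 128-bit add via the carry chain
          "addc.u64 rhi, ahi, bhi;\n\t"
          "mov.b128 %0, {rlo, rhi};\n\t"  // repack into the 128-bit output
          "}"
          : "=q"(r)
          : "q"(a), "q"(b));
      return r;
    }

This mirrors what the new tests exercise at the IR level: the i128 operand is split into two i64 halves around the asm node (V2I64toI128 / I128toV2I64), while the asm body itself sees a single %rq register.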