diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d5356d1be3d75..924144c5366f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit", "Support DPP (Data Parallel Primitives) extension in DP ALU" >; +def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr", + "HasDPPSrc1SGPR", + "true", + "Support SGPR for Src1 of DPP instructions" +>; + def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", "HasPackedFP32Ops", "true", @@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet< def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [FeatureSALUFloatInsts])>; + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, FeatureGFX11FullVGPRs])>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5032dd665b07e..35656bcaea1af 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, const OperandVector &Operands) { const unsigned Opc = Inst.getOpcode(); int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl); - if (DppCtrlIdx < 0) - return true; - unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); + if (DppCtrlIdx >= 0) { + unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); + + if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc))) { + // DP ALU DPP is supported for row_newbcast only on GFX9* + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); + Error(S, "DP ALU dpp only supports row_newbcast"); + return false; + } + } - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc))) { - // DP ALU DPP is supported for row_newbcast only on GFX9* - SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "DP ALU dpp only supports row_newbcast"); - return false; + int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8); + bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0; + + if (IsDPP && !hasDPPSrc1SGPR(getSTI())) { + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx >= 0) { + const MCOperand &Src1 = Inst.getOperand(Src1Idx); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (Src1.isImm() || + (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]); + Error(Op.getStartLoc(), "invalid operand for instruction"); + return false; + } + } } return true; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index e1cd0f0a732b9..b38e20b2708f2 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { return &OldOpnd; } +static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, + MachineRegisterInfo &MRI) { + int16_t RegClass = MI.getDesc().operands()[Idx].RegClass; + if (RegClass == -1) + return 0; + + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass)); +} + MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, @@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(Src0); + int Src0Idx = NumOperands; if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n"); Fail = true; @@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); if (Src1) { - if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { + int OpNum = NumOperands; + // If subtarget does not support SGPRs for src1 operand then the + // requirements are the same as for src0. We check src0 instead because + // pseudos are shared between subtargets and allow SGPR for src1 on all. + if (!ST->hasDPPSrc1SGPR()) { + assert(getOperandSize(*DPPInst, Src0Idx, *MRI) == + getOperandSize(*DPPInst, NumOperands, *MRI) && + "Src0 and Src1 operands should have the same size"); + OpNum = Src0Idx; + } + if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; break; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f60938d830fcc..e7d46c0df4346 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasDPP = false; bool HasDPP8 = false; bool HasDPALU_DPP = false; + bool HasDPPSrc1SGPR = false; bool HasPackedFP32Ops = false; bool HasImageInsts = false; bool HasExtendedImageInsts = false; @@ -916,6 +917,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasDPALU_DPP; } + bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } + bool hasPackedFP32Ops() const { return HasPackedFP32Ops; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index ea6df380153bb..60a6964c754ff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2296,7 +2296,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field RegisterClass Src1DPP = getVregSrcForVT.ret; field RegisterClass Src2DPP = getVregSrcForVT.ret; field RegisterOperand Src0VOP3DPP = VGPRSrc_32; - field RegisterOperand Src1VOP3DPP = VRegSrc_32; + field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT.ret; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT.ret; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index da664c93d1889..6d0ad763d9e6c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureVOPD); } +bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR); +} + unsigned hasKernargPreload(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureKernargPreload); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d5092333c2171..297a69f54d637 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); bool hasMAIInsts(const MCSubtargetInfo &STI); bool hasVOPD(const MCSubtargetInfo &STI); +bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); unsigned hasKernargPreload(const MCSubtargetInfo &STI); diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll index 4d979e2cf7496..a72bcbb529e7d 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; GCN-LABEL: {{^}}dpp_add: ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 91e97a36a597a..fe1345e29f133 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100 +# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150 --- @@ -6,7 +7,8 @@ # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec -# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec +# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec +# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec name: vop3 tracksRegLiveness: true body: | @@ -28,10 +30,54 @@ body: | %9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec - ; should not be combined because src1 imm is illegal + ; should not be combined on subtargets where src1 imm is illegal %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec ... +--- + +# GCN-label: name: vop3_sgpr_src1 +# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec +# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec +# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec +# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec +name: vop3_sgpr_src1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:sgpr_32 = COPY $sgpr0 + %3:sgpr_32 = COPY $sgpr1 + %4:vgpr_32 = IMPLICIT_DEF + + ; should be combined because src2 allows sgpr + %5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + + ; should be combined only on subtargets that allow sgpr for src1 + %7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec + %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + + ; should be combined only on subtargets that allow sgpr for src1 + %9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec + %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec + + ; should be combined only on subtargets that allow inlinable constants for src1 + %11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec + %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec + + ; should not be combined when literal constants are used + %13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec + %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec +... +--- # Regression test for src_modifiers on base u16 opcode # GCN-label: name: vop3_u16 diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s new file mode 100644 index 0000000000000..a4904c40b40ae --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s @@ -0,0 +1,19 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s + +// +// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable +// constant. +// + +v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s index e475a1d519077..5e508b466e830 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s @@ -45,6 +45,15 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0] v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0] // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt new file mode 100644 index 0000000000000..77161a49364b0 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1150 %s + +# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff + +# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff] +0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff + +# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05] +0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05 + +# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05] +0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05