diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 40d960e9b3a85..b88891ac4894b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5653,7 +5653,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && ST.hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) + AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm())) SplitSize = 64; if (Size == SplitSize) { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0d2feeb4edea3..0184075c2c909 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, if (DppCtrlIdx >= 0) { unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc))) { - // DP ALU DPP is supported for row_newbcast only on GFX9* + if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share + // only on GFX12. SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "DP ALU dpp only supports row_newbcast"); + Error(S, isGFX12() ? "DP ALU dpp only supports row_share" + : "DP ALU dpp only supports row_newbcast"); return false; } } diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index f9a907a644373..184929a5a50f6 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) { DPPInst.addImm(ByteSelOpr->getImm()); } + if (MachineOperand *BitOp3 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) { + assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3)); + DPPInst.add(*BitOp3); + } } DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); @@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || - MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { - auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); - assert(DppCtrl && DppCtrl->isImm()); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + unsigned DppCtrlVal = DppCtrl->getImm(); + if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) { + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n"); + // Split it. + return false; + } + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) { LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" " control value\n"); // Let it split, then control may become legal. @@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) && + AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: DPP ALU DPP is not supported\n"); + break; + } + + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: not valid 64-bit DPP control value\n"); + break; + } + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ee8683a549a80..aafbdc2e86a9b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { - O << " /* DP ALU dpp only supports row_newbcast */"; + if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && + AMDGPU::isDPALU_DPP(Desc, STI)) { + O << " /* DP ALU dpp only supports " + << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */"; return; } if (Imm <= DppCtrl::QUAD_PERM_LAST) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e866bd47e267d..25a1d615d48a8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6621,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && ST->hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) + AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3))) SplitSize = 64; auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 19e6bcf6a219d..41885e45b4101 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2616,9 +2616,9 @@ std::pair SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); - if (ST.hasMovB64() && + if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && AMDGPU::isLegalDPALU_DPPControl( - getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { + ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); } @@ -5433,7 +5433,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { + !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && + AMDGPU::isDPALU_DPP(Desc, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c552f1a2c90e4..9278b859a8067 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, f64) : VCSrc_f64, !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1e3e9a20afb2e..e0ac040bdd226 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3309,7 +3309,33 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { return false; } -bool isDPALU_DPP(const MCInstrDesc &OpDesc) { +bool isDPALU_DPP32BitOpc(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MUL_LO_U32_e64: + case AMDGPU::V_MUL_LO_U32_e64_dpp: + case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_U32_e64: + case AMDGPU::V_MUL_HI_U32_e64_dpp: + case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_I32_e64: + case AMDGPU::V_MUL_HI_I32_e64_dpp: + case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250: + case AMDGPU::V_MAD_U32_e64: + case AMDGPU::V_MAD_U32_e64_dpp: + case AMDGPU::V_MAD_U32_e64_dpp_gfx1250: + return true; + default: + return false; + } +} + +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { + if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) + return false; + + if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) + return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); + return hasAny64BitVGPROperands(OpDesc); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1bcd36cf6241c..704bf106ace76 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1750,15 +1750,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); LLVM_READNONE -inline bool isLegalDPALU_DPPControl(unsigned DC) { - return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { + if (isGFX12(ST)) + return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST; + if (isGFX90A(ST)) + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; + return false; } /// \returns true if an instruction may have a 64-bit VGPR operand. bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. +bool isDPALU_DPP32BitOpc(unsigned Opc); + /// \returns true if an instruction is a DP ALU DPP. -bool isDPALU_DPP(const MCInstrDesc &OpDesc); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f4b6af647ca1a..329d003cf2506 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12 op> : multiclass VOP3_Real_Base_gfx11_gfx12 op> : VOP3_Real_Base, VOP3_Real_Base; +multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250 op> : + VOP3_Real_Base, VOP3_Real_Base; + multiclass VOP3_Realtriple_with_name_gfx11_gfx12 op, string opName, string asmName> : VOP3_Realtriple_with_name, @@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>; defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">; defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">; @@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>; + defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll index bf37ccf3ac89f..43f6def22d981 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll @@ -1,12 +1,13 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A,DPP64-GFX9 -DCTL=row_newbcast +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,DPP64-GFX9,GFX942 -DCTL=row_newbcast +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10 -DCTL=row_share +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11 -DCTL=row_share +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX1250 -DCTL=row_share ; GCN-LABEL: {{^}}dpp64_ceil: ; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], -; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP64: v_ceil_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id @@ -21,8 +22,8 @@ define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) { ; GCN-LABEL: {{^}}dpp64_rcp: ; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], -; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP64-GFX9: v_rcp_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp64_rcp(ptr addrspace(1) %arg, i64 %in1) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id @@ -52,9 +53,9 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(ptr addrspace(1) %arg, i64 ; GCN-LABEL: {{^}}dpp64_div: ; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], -; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} ; GCN: v_div_scale_f64 ; GCN: v_rcp_f64_e32 define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) { @@ -69,6 +70,25 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) { ret void } +; On GFX9 it fails to combine because v_mul_lo_u32 has no e32 or dpp form. +; GCN-LABEL: {{^}}dpp_mul_row_share: +; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], +; DPP64-GFX9: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]] +; DPP64-GFX9: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; DPP64-GFX9: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}} +; GFX1250: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]] +; GFX1250: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX1250: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}} +define amdgpu_kernel void @dpp_mul_row_share(ptr addrspace(1) %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id + %load = load i32, ptr addrspace(1) %gep + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 336, i32 15, i32 15, i1 1) + %mul = mul i32 %tmp0, %load + store i32 %mul, ptr addrspace(1) %gep + ret void +} + ; GCN-LABEL: {{^}}dpp64_loop: ; GCN: v_mov_b32_dpp ; DPP64: v_mov_b32_dpp diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir new file mode 100644 index 0000000000000..9972ec82f63cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir @@ -0,0 +1,18 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=gcn-dpp-combine -o - %s | FileCheck %s -check-prefix=GFX1250 + +--- +name: v_bitop3_dpp +tracksRegLiveness: true +body: | + bb.0: + ; GFX1250-LABEL: name: v_bitop3_dpp + ; GFX1250: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX1250-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: [[V_BITOP3_B32_e64_dpp:%[0-9]+]]:vgpr_32 = V_BITOP3_B32_e64_dpp [[DEF]], [[V_MOV_B32_e32_]], 1, [[V_MOV_B32_dpp]], 128, 0, 15, 15, 1, implicit $exec + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 0, 15, 15, 0, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0, %0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_BITOP3_B32_e64 %1, 1, %2, 128, implicit $exec +...