From cc8ceabcce69b34af532e27eabea40d78b80ab36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 20 Sep 2024 15:31:49 +0200 Subject: [PATCH 1/2] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] create new pass (boilerplate only) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 ++ .../AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp | 54 +++++++++++++++++++ .../AMDGPU/AMDGPUDemoteSCCBranchToExecz.h | 31 +++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 ++ 7 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 342d55e828bca..e7515c16e44b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -373,6 +373,9 @@ extern char &AMDGPUCodeGenPrepareID; void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &); extern char &AMDGPURemoveIncompatibleFunctionsID; +void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &); +extern char &AMDGPUDemoteSCCBranchToExeczLegacyID; + void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &); extern char &AMDGPULateCodeGenPrepareLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp new file mode 100644 index 0000000000000..112de9f794342 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp @@ -0,0 +1,54 @@ +#include "llvm/CodeGen/MachineFunctionPass.h" + +#include "AMDGPU.h" +#include "AMDGPUDemoteSCCBranchToExecz.h" + +using namespace llvm; + +namespace { +#define DEBUG_TYPE "amdgpu-demote-scc-to-execz" +const char PassName[] = "AMDGPU if conversion"; + +class 
AMDGPUDemoteSCCBranchToExecz { +public: + AMDGPUDemoteSCCBranchToExecz() = default; + + bool run() { return false; } +}; + +class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + AMDGPUDemoteSCCBranchToExecz IfCvt{}; + return IfCvt.run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return PassName; } +}; + +char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0; + +} // namespace + +PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run( + MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + AMDGPUDemoteSCCBranchToExecz IfCvt{}; + if (!IfCvt.run()) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} + +char &llvm::AMDGPUDemoteSCCBranchToExeczLegacyID = + AMDGPUDemoteSCCBranchToExeczLegacy::ID; +INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName, + false, false) +INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName, + false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h new file mode 100644 index 0000000000000..3db3b639dd55f --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h @@ -0,0 +1,31 @@ +//===- AMDGPURDemoteSCCBranchToExecz.h --- demote s_cbranch_scc -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Pass used to demote s_cbranch_scc0/1 branches to s_cbranch_execz +/// branches. 
These can be later removed by SIPreEmitPeephole. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class AMDGPUDemoteSCCBranchToExeczPass + : public PassInfoMixin { +public: + AMDGPUDemoteSCCBranchToExeczPass() = default; + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 0ebf34c901c14..d968ac61eea39 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -95,6 +95,7 @@ FUNCTION_PASS_WITH_PARAMS( #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) +MACHINE_FUNCTION_PASS("amdgpu-demote-scc-to-execz", AMDGPUDemoteSCCBranchToExeczPass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 16e23879cd735..62caf8db2c81b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -18,6 +18,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUCtorDtorLowering.h" +#include "AMDGPUDemoteSCCBranchToExecz.h" #include "AMDGPUExportClustering.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" @@ -498,6 +499,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); 
initializeSIAnnotateControlFlowLegacyPass(*PR); + initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(*PR); initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); @@ -1336,7 +1338,7 @@ void GCNPassConfig::addMachineSSAOptimization() { bool GCNPassConfig::addILPOpts() { if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); - + addPass(&AMDGPUDemoteSCCBranchToExeczLegacyID); TargetPassConfig::addILPOpts(); return false; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index fed29c3e14aae..52bb7db3f8ef9 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp AMDGPUHSAMetadataStreamer.cpp + AMDGPUDemoteSCCBranchToExecz.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp AMDGPUInstrInfo.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 646b1264f5dea..28eb45bbc96c6 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -315,6 +315,7 @@ ; GCN-O1-NEXT: Merge disjoint stack slots ; GCN-O1-NEXT: Local Stack Slot Allocation ; GCN-O1-NEXT: Remove dead machine instructions +; GCN-O1-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion ; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: Machine Natural Loop Construction ; GCN-O1-NEXT: Machine Block Frequency Analysis @@ -617,6 +618,7 @@ ; GCN-O1-OPTS-NEXT: Merge disjoint stack slots ; GCN-O1-OPTS-NEXT: Local Stack Slot Allocation ; GCN-O1-OPTS-NEXT: Remove dead machine instructions +; GCN-O1-OPTS-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion ; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis @@ -932,6 +934,7 @@ ; 
GCN-O2-NEXT: Merge disjoint stack slots ; GCN-O2-NEXT: Local Stack Slot Allocation ; GCN-O2-NEXT: Remove dead machine instructions +; GCN-O2-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion ; GCN-O2-NEXT: MachineDominator Tree Construction ; GCN-O2-NEXT: Machine Natural Loop Construction ; GCN-O2-NEXT: Machine Block Frequency Analysis @@ -1260,6 +1263,7 @@ ; GCN-O3-NEXT: Merge disjoint stack slots ; GCN-O3-NEXT: Local Stack Slot Allocation ; GCN-O3-NEXT: Remove dead machine instructions +; GCN-O3-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion ; GCN-O3-NEXT: MachineDominator Tree Construction ; GCN-O3-NEXT: Machine Natural Loop Construction ; GCN-O3-NEXT: Machine Block Frequency Analysis From 4626fb4330ad0871bc0335d637dba02422e83f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 20 Sep 2024 15:37:38 +0200 Subject: [PATCH 2/2] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] Implementation: demote s_cbranch_scc branches into vcmp + s_cbranch_execz branches --- .../AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp | 207 +++++++++++++++++- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 82 +++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 18 +- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 85 +------ .../AMDGPU/amdgpu-demote-scc-branches.ll | 59 +++-- 5 files changed, 339 insertions(+), 112 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp index 112de9f794342..8131ed666f191 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp @@ -2,18 +2,215 @@ #include "AMDGPU.h" #include "AMDGPUDemoteSCCBranchToExecz.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" using namespace llvm; namespace { #define DEBUG_TYPE "amdgpu-demote-scc-to-execz" -const char PassName[] = "AMDGPU if conversion"; +const char PassName[] = "AMDGPU s_cbranch_scc to 
s_cbranch_execz conversion"; + +std::optional getVALUOpc(const MachineInstr &MI, + bool Reverse = false) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { +#define HandleOpcAndReverse(Opc, ReverseOpc, VOpc, ReverseVOpc) \ + case Opc: \ + return Reverse ? ReverseVOpc : VOpc; \ + case ReverseOpc: \ + return Reverse ? VOpc : ReverseVOpc + HandleOpcAndReverse(AMDGPU::S_CMP_EQ_I32, AMDGPU::S_CMP_LG_I32, + AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_NE_I32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U32, AMDGPU::S_CMP_LG_U32, + AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_NE_U32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_GT_I32, AMDGPU::S_CMP_LE_I32, + AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_LE_I32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_GT_U32, AMDGPU::S_CMP_LE_U32, + AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_LE_U32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_GE_I32, AMDGPU::S_CMP_LT_I32, + AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_LT_I32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_GE_U32, AMDGPU::S_CMP_LT_U32, + AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_LT_U32_e64); + HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U64, AMDGPU::S_CMP_LG_U64, + AMDGPU::V_CMP_EQ_U64_e64, AMDGPU::V_CMP_NE_U64_e64); +#undef HandleOpcAndReverse + default: + break; + } + return std::nullopt; +} + +bool isSCmpPromotableToVCmp(const MachineInstr &MI) { + return getVALUOpc(MI).has_value(); +} + +bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then, + MachineBasicBlock *&Tail) { + if (Head.succ_size() != 2) + return false; + + Then = Head.succ_begin()[0]; + Tail = Head.succ_begin()[1]; + + // Canonicalize so Succ0 has MBB as its single predecessor. 
+ if (Then->pred_size() != 1) + std::swap(Then, Tail); + + if (Then->pred_size() != 1 || Then->succ_size() != 1) + return false; + + return *Then->succ_begin() == Tail; +} + +bool hasPromotableCmpConditon(MachineInstr &Term, MachineInstr *&Cmp) { + auto CmpIt = std::next(Term.getReverseIterator()); + if (CmpIt == Term.getParent()->instr_rend()) + return false; + + if (!isSCmpPromotableToVCmp(*CmpIt)) + return false; + + Cmp = &*CmpIt; + return true; +} + +bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) { + auto TermIt = Head.getFirstInstrTerminator(); + if (TermIt == Head.end()) + return false; + + switch (TermIt->getOpcode()) { + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: + Term = &*TermIt; + return true; + default: + return false; + } +} + +bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term, + MachineInstr *&Cmp, MachineBasicBlock *&Then, + MachineBasicBlock *&Tail) { + + if (!hasCbranchSCCTerm(Head, Term)) + return false; + + bool SCCIsUsedOutsideHead = any_of( + Head.liveouts(), [](const auto &P) { return P.PhysReg == AMDGPU::SCC; }); + if (SCCIsUsedOutsideHead) + return false; + + if (!isTriangular(Head, Then, Tail)) + return false; + + // phi-nodes in the tail can prevent splicing the instructions of the then + // and tail blocks in the head + if (!Tail->empty() && Tail->begin()->isPHI()) + return false; + + if (!hasPromotableCmpConditon(*Term, Cmp)) + return false; + + return true; +} + +bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) { + MachineBasicBlock *TBB = Term.getOperand(0).getMBB(); + return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1); +} class AMDGPUDemoteSCCBranchToExecz { + MachineFunction &MF; + const GCNSubtarget &ST; + const SIInstrInfo &TII; + const SIRegisterInfo &RegInfo; + const TargetSchedModel &SchedModel; + public: - AMDGPUDemoteSCCBranchToExecz() = default; + AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF) + : MF(MF), 
ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), + RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {} + + bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp, + const MachineBasicBlock &Then, + const MachineBasicBlock &Tail) { + bool IsWave32 = TII.isWave32(); + unsigned AndSaveExecOpc = + IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)), + AndSaveExecOpc, Mov}; + unsigned NewOpsCost = 0; + for (unsigned Opc : NewOps) + NewOpsCost += SchedModel.computeInstrLatency(Opc); + unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false); + + assert(NewOpsCost >= OldCmpCost); + return !TII.mustRetainExeczBranch(Term, Then, Tail, + NewOpsCost - OldCmpCost); + } + + void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head, + MachineBasicBlock &Then, MachineBasicBlock &Tail) { + unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)); + Cmp.setDesc(TII.get(NewCmpOpc)); + + Cmp.removeOperand(2); + + auto VCC = RegInfo.getVCC(); + auto Exec = RegInfo.getExec(); - bool run() { return false; } + auto &MRI = MF.getRegInfo(); + MCRegister ExecBackup = + MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec)); + + Cmp.insert(Cmp.operands_begin(), MachineOperand::CreateReg(VCC, true)); + Cmp.addImplicitDefUseOperands(MF); + + TII.legalizeOperands(Cmp); + + bool IsWave32 = TII.isWave32(); + unsigned AndSaveExecOpc = + IsWave32 ? 
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(), + TII.get(AndSaveExecOpc), ExecBackup); + SaveAndMaskExec.addReg(VCC, RegState::Kill); + SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead + + DebugLoc DL = Term.getDebugLoc(); + TII.removeBranch(Head); + MachineOperand Cond[] = { + MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ), + MachineOperand::CreateReg(RegInfo.getExec(), false)}; + TII.insertBranch(Head, &Tail, &Then, Cond, DL); + + TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup); + } + + bool run() { + if (!SchedModel.hasInstrSchedModel()) + return false; + bool Changed = false; + + for (MachineBasicBlock &Head : MF) { + MachineInstr *Term; + MachineInstr *Cmp; + MachineBasicBlock *Then; + MachineBasicBlock *Tail; + if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail)) + continue; + + if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail)) + continue; + + demoteCmp(*Term, *Cmp, Head, *Then, *Tail); + Changed = true; + } + return Changed; + } }; class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass { @@ -23,7 +220,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass { AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { - AMDGPUDemoteSCCBranchToExecz IfCvt{}; + AMDGPUDemoteSCCBranchToExecz IfCvt{MF}; return IfCvt.run(); } @@ -40,7 +237,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0; PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run( MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - AMDGPUDemoteSCCBranchToExecz IfCvt{}; + AMDGPUDemoteSCCBranchToExecz IfCvt{MF}; if (!IfCvt.run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d676d561d0818..e957d737a9814 100644 --- 
a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4118,6 +4118,88 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE); } +namespace { +class BranchWeightCostModel { + const SIInstrInfo &TII; + const TargetSchedModel &SchedModel; + BranchProbability BranchProb; + static constexpr uint64_t BranchNotTakenCost = 1; + uint64_t BranchTakenCost; + uint64_t ThenCyclesCost; + +public: + BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch, + const MachineBasicBlock &Succ, + unsigned ExtraTransformationCosts) + : TII(TII), SchedModel(TII.getSchedModel()), + ThenCyclesCost(ExtraTransformationCosts) { + const MachineBasicBlock &Head = *Branch.getParent(); + const auto *FromIt = find(Head.successors(), &Succ); + assert(FromIt != Head.succ_end()); + + BranchProb = Head.getSuccProbability(FromIt); + if (BranchProb.isUnknown()) + BranchProb = BranchProbability::getZero(); + BranchTakenCost = SchedModel.computeInstrLatency(&Branch); + } + + bool isProfitable(const MachineInstr &MI) { + if (TII.isWaitcnt(MI.getOpcode())) + return false; + + ThenCyclesCost += SchedModel.computeInstrLatency(&MI); + + // Consider `P = N/D` to be the probability of execz being false (skipping + // the then-block) The transformation is profitable if always executing the + // 'then' block is cheaper than executing sometimes 'then' and always + // executing s_cbranch_execz: + // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost + // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost + // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D * + // BranchNotTakenCost + uint64_t Numerator = BranchProb.getNumerator(); + uint64_t Denominator = BranchProb.getDenominator(); + return (Denominator - Numerator) * ThenCyclesCost <= + ((Denominator - Numerator) * BranchTakenCost + + Numerator * BranchNotTakenCost); + } +}; +} // namespace + 
+bool SIInstrInfo::mustRetainExeczBranch( + const MachineInstr &Branch, const MachineBasicBlock &From, + const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const { + + assert(is_contained(Branch.getParent()->successors(), &From)); + BranchWeightCostModel CostModel{*this, Branch, From, + ExtraTransformationCosts}; + + const MachineFunction *MF = From.getParent(); + for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + + for (const MachineInstr &MI : MBB) { + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might never be taken when EXEC = 0. + // Hence we should retain cbranch out of the loop lest it become infinite. + if (MI.isConditionalBranch()) + return true; + + if (MI.isMetaInstruction()) + continue; + + if (hasUnwantedEffectsWhenEXECEmpty(MI)) + return true; + + if (!CostModel.isProfitable(MI)) + return true; + } + } + + return false; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 7041b59964645..863b661297323 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { TargetSchedModel SchedModel; mutable std::unique_ptr Formatter; +public: // The inverse predicate should have the negative value. 
 enum BranchPredicate { INVALID_BR = 0, @@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { EXECZ = 3 }; +private: using SetVectorType = SmallSetVector; static unsigned getBranchOpcode(BranchPredicate Cond); @@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if the instruction modifies the mode register.q static bool modifiesModeRegister(const MachineInstr &MI); + /// Returns true if it is unprofitable or unsafe to remove an execz branch + /// over the blocks from From to To, i.e. the branch must be retained + bool mustRetainExeczBranch(const MachineInstr &Branch, + const MachineBasicBlock &From, + const MachineBasicBlock &To, + unsigned ExtraTransformationCosts = 0) const; + /// This function is used to determine if an instruction can be safely /// executed under EXEC = 0 without hardware error, indeterminate results, /// and/or visible effects on future vector execution or outside the shader. - /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is - /// used in removing branches over short EXEC = 0 sequences. - /// As such it embeds certain assumptions which may not apply to every case - /// of EXEC = 0 execution. + /// Note: as of 2024 the only use of this is SIPreEmitPeephole and + /// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch) + /// where it is used in removing branches over short EXEC = 0 sequences. As + /// such it embeds certain assumptions which may not apply to every case of + /// EXEC = 0 execution. 
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; /// Returns true if the instruction could potentially depend on the value of diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 701084844cd9b..1c8beca086536 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -15,8 +15,6 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/Support/BranchProbability.h" using namespace llvm; @@ -35,9 +33,6 @@ class SIPreEmitPeephole : public MachineFunctionPass { MachineBasicBlock *&TrueMBB, MachineBasicBlock *&FalseMBB, SmallVectorImpl &Cond); - bool mustRetainExeczBranch(const MachineInstr &Branch, - const MachineBasicBlock &From, - const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); public: @@ -299,84 +294,6 @@ bool SIPreEmitPeephole::getBlockDestinations( return true; } -namespace { -class BranchWeightCostModel { - const SIInstrInfo &TII; - const TargetSchedModel &SchedModel; - BranchProbability BranchProb; - static constexpr uint64_t BranchNotTakenCost = 1; - uint64_t BranchTakenCost; - uint64_t ThenCyclesCost = 0; - -public: - BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch, - const MachineBasicBlock &Succ) - : TII(TII), SchedModel(TII.getSchedModel()) { - const MachineBasicBlock &Head = *Branch.getParent(); - const auto *FromIt = find(Head.successors(), &Succ); - assert(FromIt != Head.succ_end()); - - BranchProb = Head.getSuccProbability(FromIt); - if (BranchProb.isUnknown()) - BranchProb = BranchProbability::getZero(); - BranchTakenCost = SchedModel.computeInstrLatency(&Branch); - } - - bool isProfitable(const MachineInstr &MI) { - if (TII.isWaitcnt(MI.getOpcode())) - return false; - - ThenCyclesCost += SchedModel.computeInstrLatency(&MI); - 
- // Consider `P = N/D` to be the probability of execz being false (skipping - // the then-block) The transformation is profitable if always executing the - // 'then' block is cheaper than executing sometimes 'then' and always - // executing s_cbranch_execz: - // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost - // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost - // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D * - // BranchNotTakenCost - uint64_t Numerator = BranchProb.getNumerator(); - uint64_t Denominator = BranchProb.getDenominator(); - return (Denominator - Numerator) * ThenCyclesCost <= - ((Denominator - Numerator) * BranchTakenCost + - Numerator * BranchNotTakenCost); - } -}; - -bool SIPreEmitPeephole::mustRetainExeczBranch( - const MachineInstr &Branch, const MachineBasicBlock &From, - const MachineBasicBlock &To) const { - assert(is_contained(Branch.getParent()->successors(), &From)); - BranchWeightCostModel CostModel{*TII, Branch, From}; - - const MachineFunction *MF = From.getParent(); - for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - - for (const MachineInstr &MI : MBB) { - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might never be taken when EXEC = 0. - // Hence we should retain cbranch out of the loop lest it become infinite. - if (MI.isConditionalBranch()) - return true; - - if (MI.isMetaInstruction()) - continue; - - if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) - return true; - - if (!CostModel.isProfitable(MI)) - return true; - } - } - - return false; -} -} // namespace - // Returns true if the skip branch instruction is removed. 
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -396,7 +313,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return false; // Consider only when it is legal and profitable - if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB)) + if (TII->mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB)) return false; LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll index aa38f43368694..a305762cd4a55 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll @@ -101,8 +101,8 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no ; GFX9-LABEL: uniform_br_profitable: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s21, 1 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: v_cmp_ge_i32_e64 vcc, s21, 1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: ; %bb.1: ; %if.then ; GFX9-NEXT: s_mov_b32 s11, s18 ; GFX9-NEXT: s_mov_b32 s10, s17 @@ -111,26 +111,47 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX9-NEXT: .LBB2_2: ; %if.end +; GFX9-NEXT: ; %bb.2: ; %if.end +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: uniform_br_profitable: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s21, 1 -; GFX10-NEXT: s_cbranch_scc1 .LBB2_2 -; GFX10-NEXT: ; %bb.1: ; %if.then -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s19 -; GFX10-NEXT: s_mov_b32 s11, s18 -; GFX10-NEXT: s_mov_b32 s10, s17 -; GFX10-NEXT: s_mov_b32 
s9, s16 -; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX10-NEXT: .LBB2_2: ; %if.end -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX1010-LABEL: uniform_br_profitable: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ge_i32_e64 vcc_lo, s21, 1 +; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1010-NEXT: ; %bb.1: ; %if.then +; GFX1010-NEXT: v_mov_b32_e32 v0, s6 +; GFX1010-NEXT: v_mov_b32_e32 v1, s19 +; GFX1010-NEXT: s_mov_b32 s11, s18 +; GFX1010-NEXT: s_mov_b32 s10, s17 +; GFX1010-NEXT: s_mov_b32 s9, s16 +; GFX1010-NEXT: s_mov_b32 s8, s7 +; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1010-NEXT: ; %bb.2: ; %if.end +; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-NEXT: s_mov_b32 exec_lo, s4 +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-LABEL: uniform_br_profitable: +; GFX1030: ; %bb.0: ; %entry +; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-NEXT: s_mov_b32 s4, exec_lo +; GFX1030-NEXT: v_cmpx_ge_i32_e64 s21, 1 +; GFX1030-NEXT: ; %bb.1: ; %if.then +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s19 +; GFX1030-NEXT: s_mov_b32 s11, s18 +; GFX1030-NEXT: s_mov_b32 s10, s17 +; GFX1030-NEXT: s_mov_b32 s9, s16 +; GFX1030-NEXT: s_mov_b32 s8, s7 +; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1030-NEXT: ; %bb.2: ; %if.end +; GFX1030-NEXT: s_mov_b32 exec_lo, s4 +; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %cmp = icmp sgt i32 %flag, 0 br i1 %cmp, label %if.then, label %if.end, !prof !1