[AMDGPU] Change CF intrinsics lowering to reconverge on predecessors. #92809

Status: Closed · wants to merge 1 commit
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu
@@ -10,7 +10,7 @@
// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
// GFX90A-CAS: flat_atomic_cmpswap
// GFX90A-CAS: s_cbranch_execnz
// GFX90A-CAS: s_cbranch_scc1
__device__ float atomic_add_cas(float *p) {
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
}
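The updated CHECK line captures the core codegen effect of this patch: the CAS retry loop no longer branches on EXEC. A rough before/after sketch (illustrative only; register numbers are hypothetical and the surrounding EXEC-mask bookkeeping is elided):

  ; before: the remaining-lanes test reads EXEC itself
    s_andn2_b64 exec, exec, s[4:5]    ; drop lanes whose CAS succeeded
    s_cbranch_execnz .LBB0_1          ; retry while any lane is still active
  ; after: the mask arithmetic targets an ordinary SGPR pair, and the
  ; back edge branches on the SCC result of that SALU op
    s_andn2_b64 s[6:7], exec, s[4:5]  ; SCC := (remaining lanes != 0)
    s_cbranch_scc1 .LBB0_1            ; retry while SCC is set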
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
[llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
[IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
[Review comment, Contributor] Should document what this means

[Review comment, Contributor] I second that; all these control-flow pseudos need to have their semantics documented.

[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;
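Addressing the two review requests above, a documentation draft might read as follows (an illustrative sketch, not part of the patch; the semantics are paraphrased from this PR's reconverge-on-predecessors model and the old end_cf behavior):

  // llvm.amdgcn.wave.reconverge(mask): ends a divergent region on the current
  // path. Restores into EXEC the lanes recorded in 'mask' (the value produced
  // by the matching llvm.amdgcn.if/else), so the wave reconverges at the end
  // of each predecessor of the join block rather than at the start of the
  // join block, where the old llvm.amdgcn.end.cf performed the restore.
  def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
      [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;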
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1553,11 +1553,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
.add(MI.getOperand(1));

Register Reg = MI.getOperand(1).getReg();
@@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_wave_reconverge:
return selectWaveReconvergeIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;

bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
19 changes: 8 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned MovExecOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const unsigned MovExecTermOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
@@ -949,28 +947,27 @@

B.setInsertPt(*BodyBB, BodyBB->end());

Register LoopMask = MRI.createVirtualRegister(
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
.addDef(LoopMask)
.addReg(ExecReg)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?

// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
.addReg(LoopMask)
.addReg(NewExec)
.addMBB(LoopBB);

// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
.addReg(ExecReg);

// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
B.buildInstr(MovExecTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);

// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
B.setInsertPt(*RemainderBB, RemainderBB->begin());
@@ -4954,7 +4951,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_end_cf: {
case Intrinsic::amdgcn_wave_reconverge: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
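Summarizing the waterfall-loop rework above as pseudo-MIR (a sketch assuming wave64; virtual register names are invented): the terminator XOR now defines a virtual loop mask instead of clobbering EXEC, the explicit EXEC restore after the loop is dropped along with MovExecTermOpc, and SI_WATERFALL_LOOP receives both masks so its expansion can perform the EXEC update and the reconvergence itself:

  bb.loop:
    %NewExec = ...                        ; lanes covered by this trip's readfirstlane
    ; old: $exec = S_XOR_B64_term $exec, %NewExec, then SI_WATERFALL_LOOP %bb.loop,
    ;      followed by an S_MOV_B64_term restore of $exec after the loop
    %LoopMask = S_XOR_B64_term $exec, %NewExec
    SI_WATERFALL_LOOP %LoopMask, %NewExec, %bb.loop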
92 changes: 51 additions & 41 deletions llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass {
Function *Else;
Function *IfBreak;
Function *Loop;
Function *EndCf;
Function *WaveReconverge;

DominatorTree *DT;
StackVector Stack;
@@ -86,7 +87,7 @@

bool handleLoop(BranchInst *Term);

bool closeControlFlow(BasicBlock *BB);
bool tryWaveReconverge(BasicBlock *BB);

public:
static char ID;
@@ -141,7 +142,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
{ IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask });
}

/// Is the branch condition uniform or did the StructurizeCFG pass
Expand Down Expand Up @@ -203,8 +204,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {

/// Open a new "If" block
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return false;

IRBuilder<> IRB(Term);
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -305,43 +304,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
}

/// Close the last opened control flow
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {
[Review comment, Contributor] This function is a core part of this change. It would be nice to have more comments, with examples, before the function header showing when and where wave_reconverge is inserted.


assert(Stack.back().first == BB);
if (succ_empty(BB))
return false;

if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (Term->getNumSuccessors() == 1) {
// The current BB's single successor is the top of the stack. We need to
// reconverge over that path.
BasicBlock *SingleSucc = *succ_begin(BB);
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();

SmallVector<BasicBlock *, 2> Preds;
for (BasicBlock *Pred : predecessors(BB)) {
if (!is_contained(Latches, Pred))
Preds.push_back(Pred);
if (isTopOfStack(SingleSucc)) {
Value *Exec = Stack.back().second;
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
}

BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
false);
}

Value *Exec = popSaved();
BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
Instruction *ExecDef = cast<Instruction>(Exec);
BasicBlock *DefBB = ExecDef->getParent();
if (!DT->dominates(DefBB, BB)) {
// Split edge to make Def dominate Use
FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
} else {
// We have a uniform conditional branch terminating the block.
// THis block may be the last in the Then path of the enclosing divergent
[Review comment, Contributor] Typo 'THis'

// IF.
if (!isUniform(Term))
// Divergent loop is going to be further processed in another place
return false;

for (auto Succ : Term->successors()) {
if (isTopOfStack(Succ)) {
// Just split to make room for further WAVE_RECONVERGE insertion
SmallVector<BasicBlock*, 2> Preds;
for (auto P : predecessors(Succ)) {
if (DT->dominates(BB, P))
Preds.push_back(P);
}
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI,
nullptr, false);
}
}
IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
// TODO: StructurizeCFG 'Flow' blocks have debug locations from the
// condition, for now just avoid copying these DebugLocs so that stepping
// out of the then/else block in a debugger doesn't step to the condition.
IRB.SetCurrentDebugLocation(DebugLoc());
IRB.CreateCall(EndCf, {Exec});
}

return true;
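In response to the review request above, this is the placement tryWaveReconverge produces for the simple case where a block's single successor is the top of the stack (a hand-written sketch assuming wave64 and structurizer-shaped input; names are invented, and the handling of the %entry -> %flow edge and of else blocks is elided):

  declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
  declare void @llvm.amdgcn.wave.reconverge.i64(i64)

  define amdgpu_ps float @example(i32 %v, float %a, float %b) {
  entry:
    %cc = icmp eq i32 %v, 0               ; divergent condition
    %r = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
    %guard = extractvalue { i1, i64 } %r, 0
    %mask = extractvalue { i1, i64 } %r, 1
    br i1 %guard, label %then, label %flow

  then:                                   ; single successor %flow is the top
    %x = fadd float %a, %b                ; of the stack, so the call goes
                                          ; right before the terminator:
    call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
    br label %flow

  flow:                                   ; the old scheme put end.cf here
    %res = phi float [ %x, %then ], [ %a, %entry ]
    ret float %res
  }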
@@ -365,14 +364,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {

if (!Term || Term->isUnconditional()) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

Changed |= tryWaveReconverge(BB);

continue;
}

if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

// Let's take care of uniform loop latch that may be closing the Then
// path of the enclosing divergent branch.
Changed |= tryWaveReconverge(BB);

if (DT->dominates(Term->getSuccessor(1), BB))
Changed |= handleLoop(Term);
Expand All @@ -387,10 +392,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
continue;
}

Changed |= closeControlFlow(BB);
Stack.pop_back();
}

Changed |= openIf(Term);
if (isUniform(Term))
// Uniform conditional branch may be in the block that closes the Then
// path of the divergent conditional branch.
Changed |= tryWaveReconverge(BB);
else
Changed |= openIf(Term);
}

if (!Stack.empty()) {
34 changes: 30 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
return AMDGPUISD::ELSE;
case Intrinsic::amdgcn_loop:
return AMDGPUISD::LOOP;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
llvm_unreachable("should not occur");
default:
return 0;
@@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

return SDValue(Load, 0);
}
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
case Intrinsic::amdgcn_wave_reconverge:
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_join:
@@ -15740,6 +15740,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
}

// ISel inserts copies to registers for the successor PHIs at the BB end. We
// need to move the SI_WAVE_RECONVERGE right before the branch.

[Review comment, Contributor] Can you avoid this by gluing the pseudo to the root node? Also, I think you can avoid a second walk over the function by doing this in EmitInstrWithCustomInserter.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
MachineBasicBlock::iterator I(MI);
MachineBasicBlock::iterator Next = std::next(I);
bool NeedToMove = false;
while (Next != MBB.end() && !Next->isBranch()) {
NeedToMove = true;
Next++;
}

assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
"Malformed CFG detected!\n");
[Review comment, Contributor] No newline in the string, this isn't real printing


if (NeedToMove) {
MBB.splice(Next, &MBB, &MI);
}

break;
}
}
}
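For comparison, the alternative suggested in the review would live in SITargetLowering::EmitInstrWithCustomInserter rather than in a second walk over the function. An untested sketch of that suggestion (assumes SI_WAVE_RECONVERGE is marked usesCustomInserter; not code from this patch):

  case AMDGPU::SI_WAVE_RECONVERGE: {
    // Keep the pseudo immediately before the block's terminators so the
    // PHI copies that ISel appends to the block end up ahead of it.
    MachineBasicBlock::iterator FirstTerm = BB->getFirstTerminator();
    if (FirstTerm != std::next(MI.getIterator()))
      BB->splice(FirstTerm, BB, MI.getIterator());
    return BB;
  }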

// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
// classes if required. Ideally the register class constraints would differ
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16336,7 +16362,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
default:
Result = false;
break;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
case Intrinsic::amdgcn_loop:
Result = true;
break;