Skip to content

AMDGPU: Fix temporal divergence introduced by machine-sink and performance regression introduced by D155343 #67456

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/include/llvm/ADT/GenericCycleImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,25 @@ void GenericCycleInfo<ContextT>::compute(FunctionT &F) {
assert(validateTree());
}

template <typename ContextT>
void GenericCycleInfo<ContextT>::splitCriticalEdge(BlockT *Pred, BlockT *Succ,
BlockT *NewBlock) {
// Edge Pred-Succ is replaced by edges Pred-NewBlock and NewBlock-Succ, all
// cycles that had blocks Pred and Succ also get NewBlock.
CycleT *Cycle = this->getCycle(Pred);
if (Cycle && Cycle->contains(Succ)) {
while (Cycle) {
// FixMe: Appending NewBlock is fine as a set of blocks in a cycle. When
// printing cycle NewBlock is at the end of list but it should be in the
// middle to represent actual traversal of a cycle.
Cycle->appendBlock(NewBlock);
BlockMap.try_emplace(NewBlock, Cycle);
Cycle = Cycle->getParentCycle();
}
}
assert(validateTree());
}

/// \brief Find the innermost cycle containing a given block.
///
/// \returns the innermost cycle containing \p Block or nullptr if
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/ADT/GenericCycleInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ template <typename ContextT> class GenericCycleInfo {

void clear();
void compute(FunctionT &F);
void splitCriticalEdge(BlockT *Pred, BlockT *Succ, BlockT *New);

const FunctionT *getFunction() const { return Context.getFunction(); }
const ContextT &getSSAContext() const { return Context; }
Expand Down
9 changes: 9 additions & 0 deletions llvm/include/llvm/CodeGen/MachineBasicBlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,15 @@ class MachineBasicBlock
static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
}

/// Return the predecessor of this block if it has a single predecessor.
/// Otherwise return a null pointer.
///
const MachineBasicBlock *getSinglePredecessor() const;
MachineBasicBlock *getSinglePredecessor() {
return const_cast<MachineBasicBlock *>(
static_cast<const MachineBasicBlock *>(this)->getSinglePredecessor());
}

/// Return the fallthrough block if the block can implicitly
/// transfer control to the block after it by falling off the end of
/// it. If an explicit branch to the fallthrough block is not allowed,
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/TargetInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/ADT/Uniformity.h"
#include "llvm/CodeGen/MIRFormatter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
Expand Down Expand Up @@ -150,6 +151,11 @@ class TargetInstrInfo : public MCInstrInfo {
return false;
}

virtual bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
MachineCycleInfo *CI) const {
return true;
}

protected:
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
/// set, this hook lets the target specify whether the instruction is actually
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/CodeGen/MachineBasicBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,10 @@ const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
return Successors.size() == 1 ? Successors[0] : nullptr;
}

const MachineBasicBlock *MachineBasicBlock::getSinglePredecessor() const {
return Predecessors.size() == 1 ? Predecessors[0] : nullptr;
}

MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
Expand Down
15 changes: 6 additions & 9 deletions llvm/lib/CodeGen/MachineSink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
if (!Reg)
continue;
if (MO.isUse()) {
if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
if (Reg.isPhysical() &&
(TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
continue;
if (PI->modifiesRegister(Reg, TRI))
return true;
Expand Down Expand Up @@ -734,6 +735,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {

MadeChange = true;
++NumSplit;
CI->splitCriticalEdge(Pair.first, Pair.second, NewSucc);
} else
LLVM_DEBUG(dbgs() << " *** Not legal to break critical edge\n");
}
Expand Down Expand Up @@ -1250,24 +1252,19 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (MBB == SuccToSinkTo)
return nullptr;

if (!SuccToSinkTo)
return nullptr;

// It's not safe to sink instructions to EH landing pad. Control flow into
// landing pad is implicitly defined.
if (SuccToSinkTo->isEHPad())
if (SuccToSinkTo && SuccToSinkTo->isEHPad())
return nullptr;

// It ought to be okay to sink instructions into an INLINEASM_BR target, but
// only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
// the source block (which this code does not yet do). So for now, forbid
// doing so.
if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;

MachineBasicBlock::const_iterator InsertPos =
SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
if (SuccToSinkTo && !TII->isSafeToSink(MI, SuccToSinkTo, CI))
return nullptr;

return SuccToSinkTo;
Expand Down
42 changes: 42 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
MachineBasicBlock *SuccToSinkTo,
MachineCycleInfo *CI) const {
// Allow sinking if MI edits lane mask (divergent i1 in sgpr).
if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
return true;

MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
// Check if sinking of MI would create temporal divergent use.
for (auto Op : MI.uses()) {
if (Op.isReg() && Op.getReg().isVirtual() &&
RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

// SgprDef defined inside cycle
MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
if (FromCycle == nullptr)
continue;

MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
// Check if there is a FromCycle that contains SgprDef's basic block but
// does not contain SuccToSinkTo and also has divergent exit condition.
while (FromCycle && !FromCycle->contains(ToCycle)) {
// After structurize-cfg, there should be exactly one cycle exit.
SmallVector<MachineBasicBlock *, 1> ExitBlocks;
FromCycle->getExitBlocks(ExitBlocks);
assert(ExitBlocks.size() == 1);
assert(ExitBlocks[0]->getSinglePredecessor());

// FromCycle has divergent exit condition.
if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
return false;
}

FromCycle = FromCycle->getParentCycle();
}
}
}

return true;
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
int64_t &Offset0,
int64_t &Offset1) const {
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

bool isIgnorableUse(const MachineOperand &MO) const override;

bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
MachineCycleInfo *CI) const override;

bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
int64_t &Offset1) const override;

Expand Down
143 changes: 143 additions & 0 deletions llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s

---
name: multi_else_break
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: multi_else_break
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr4, $vgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6:
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.3
bb.0:
successors: %bb.1(0x80000000)
liveins: $vgpr4, $vgpr5

%21:vgpr_32 = COPY $vgpr5
%20:vgpr_32 = COPY $vgpr4
%23:sreg_32 = S_MOV_B32 0
%33:vgpr_32 = COPY %23, implicit $exec
%38:sreg_32 = IMPLICIT_DEF
%44:sreg_32 = IMPLICIT_DEF
%26:sreg_32 = IMPLICIT_DEF
%29:sreg_32 = S_MOV_B32 1

bb.1:
successors: %bb.2(0x80000000)

%0:sreg_32 = PHI %23, %bb.0, %12, %bb.6
%1:vgpr_32 = PHI %33, %bb.0, %13, %bb.6

bb.2:
successors: %bb.4(0x40000000), %bb.5(0x40000000)

%48:sreg_32 = PHI %44, %bb.1, %10, %bb.5
%42:sreg_32 = PHI %38, %bb.1, %8, %bb.5
%2:sreg_32 = PHI %23, %bb.1, %11, %bb.5
%3:vgpr_32 = PHI %1, %bb.1, %9, %bb.5
%27:sreg_32 = V_CMP_LT_I32_e64 %3, %20, implicit $exec
%36:vgpr_32 = COPY %26
%39:sreg_32 = S_OR_B32 %42, $exec_lo, implicit-def $scc
%45:sreg_32 = S_OR_B32 %48, $exec_lo, implicit-def $scc
%4:sreg_32 = SI_IF killed %27, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_BRANCH %bb.4

bb.3:
SI_END_CF %12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_ENDPGM 0

bb.4:
successors: %bb.5(0x80000000)

%6:vgpr_32 = V_ADD_U32_e64 %3, %29, 0, implicit $exec
%30:sreg_32 = V_CMP_NE_U32_e64 %21, %6, implicit $exec
%43:sreg_32 = S_ANDN2_B32 %39, $exec_lo, implicit-def $scc
%40:sreg_32 = COPY %43
%49:sreg_32 = S_ANDN2_B32 %45, $exec_lo, implicit-def $scc
%50:sreg_32 = S_AND_B32 %30, $exec_lo, implicit-def $scc
%46:sreg_32 = S_OR_B32 %49, %50, implicit-def $scc

bb.5:
successors: %bb.6(0x04000000), %bb.2(0x7c000000)

%10:sreg_32 = PHI %45, %bb.2, %46, %bb.4
%8:sreg_32 = PHI %39, %bb.2, %40, %bb.4
%9:vgpr_32 = PHI %36, %bb.2, %6, %bb.4
SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
%11:sreg_32 = SI_IF_BREAK %10, %2, implicit-def dead $scc
%12:sreg_32 = SI_IF_BREAK %8, %0, implicit-def dead $scc
SI_LOOP %11, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_BRANCH %bb.6

bb.6:
successors: %bb.3(0x04000000), %bb.1(0x7c000000)

%13:vgpr_32 = PHI %9, %bb.5
SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
SI_LOOP %12, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_BRANCH %bb.3
...
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
Expand Down Expand Up @@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0
; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.5
Expand All @@ -52,6 +51,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
Expand Down
Loading