Skip to content

Commit 2ec7959

Browse files
[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. (#157843)
Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave, the instructions that write to SCC asynchronously; the event is counted by KM_CNT. Also start tracking SCC for both reads and writes. An s_barrier_wait on the same barrier guarantees that the SCC write from s_barrier_signal_isfirst has landed, so in that case there is no need to insert s_wait_kmcnt.
1 parent 8dae17b commit 2ec7959

File tree

5 files changed

+428
-7
lines changed

5 files changed

+428
-7
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ struct HardwareLimits {
121121
DECL(LDS_ACCESS) /* lds read & write */ \
122122
DECL(GDS_ACCESS) /* gds read & write */ \
123123
DECL(SQ_MESSAGE) /* send message */ \
124+
DECL(SCC_WRITE) /* write to SCC from barrier */ \
124125
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
125126
DECL(SMEM_GROUP) /* scalar-memory group */ \
126127
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
@@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
149150
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
150151
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
151152
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153+
// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
152154
// We reserve a fixed number of VGPR slots in the scoring tables for
153155
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
154156
enum RegisterMapping {
@@ -163,6 +165,9 @@ enum RegisterMapping {
163165
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
164166
NUM_LDS_VGPRS = 9, // One more than the stores we track.
165167
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
168+
NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
169+
// Remaining non-allocatable registers
170+
SCC = NUM_ALL_ALLOCATABLE
166171
};
167172

168173
// Enumerate different types of result-returning VMEM operations. Although
@@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
401406
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
402407
eventMask({VMEM_SAMPLER_READ_ACCESS}),
403408
eventMask({VMEM_BVH_READ_ACCESS}),
404-
eventMask({SMEM_ACCESS, SQ_MESSAGE}),
409+
eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
405410
eventMask({VMEM_GROUP, SMEM_GROUP})};
406411

407412
return WaitEventMaskForInstGFX12Plus;
@@ -586,6 +591,7 @@ class SIInsertWaitcnts {
586591
WaitcntBrackets &ScoreBrackets);
587592
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
588593
WaitcntBrackets &ScoreBrackets);
594+
static bool asynchronouslyWritesSCC(unsigned Opcode);
589595
};
590596

591597
// This objects maintains the current score brackets of each wait counter, and
@@ -626,7 +632,12 @@ class WaitcntBrackets {
626632
unsigned getRegScore(int GprNo, InstCounterType T) const {
627633
if (GprNo < NUM_ALL_VGPRS)
628634
return VgprScores[T][GprNo];
629-
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
635+
636+
if (GprNo < NUM_ALL_ALLOCATABLE)
637+
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
638+
639+
assert(GprNo == SCC);
640+
return SCCScore;
630641
}
631642

632643
bool merge(const WaitcntBrackets &Other);
@@ -646,6 +657,7 @@ class WaitcntBrackets {
646657
AMDGPU::Waitcnt &Wait) const {
647658
determineWait(T, {RegNo, RegNo + 1}, Wait);
648659
}
660+
void tryClearSCCWriteEvent(MachineInstr *Inst);
649661

650662
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
651663
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -785,6 +797,10 @@ class WaitcntBrackets {
785797
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
786798
// X_CNT score.
787799
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
800+
// Reg score for SCC.
801+
unsigned SCCScore = 0;
802+
// The unique instruction that has an SCC write pending, if there is one.
803+
const MachineInstr *PendingSCCWrite = nullptr;
788804
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
789805
// write to each vgpr.
790806
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
820836
const MachineRegisterInfo *MRI,
821837
const SIRegisterInfo *TRI,
822838
const MachineOperand &Op) const {
839+
if (Op.getReg() == AMDGPU::SCC)
840+
return {SCC, SCC + 1};
841+
823842
if (!TRI->isInAllocatableClass(Op.getReg()))
824843
return {-1, -1};
825844

@@ -873,9 +892,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
873892
if (RegNo < NUM_ALL_VGPRS) {
874893
VgprUB = std::max(VgprUB, RegNo);
875894
VgprScores[CntTy][RegNo] = Score;
876-
} else {
895+
} else if (RegNo < NUM_ALL_ALLOCATABLE) {
877896
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
878897
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
898+
} else {
899+
assert(RegNo == SCC);
900+
SCCScore = Score;
879901
}
880902
}
881903
}
@@ -1086,6 +1108,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
10861108
if (Slot)
10871109
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
10881110
}
1111+
1112+
if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
1113+
setRegScore(SCC, T, CurrScore);
1114+
PendingSCCWrite = &Inst;
1115+
}
10891116
}
10901117
}
10911118

@@ -1154,6 +1181,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11541181
OS << RelScore << ":s" << J << " ";
11551182
}
11561183
}
1184+
if (T == KM_CNT && SCCScore > 0)
1185+
OS << SCCScore << ":scc ";
11571186
}
11581187
OS << '\n';
11591188
}
@@ -1228,6 +1257,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
12281257
}
12291258
}
12301259

1260+
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1261+
// S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1262+
// SCC has landed
1263+
if (PendingSCCWrite &&
1264+
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1265+
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1266+
unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1267+
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1268+
if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1269+
SCC_WRITE_PendingEvent) {
1270+
setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1271+
}
1272+
1273+
PendingEvents &= ~SCC_WRITE_PendingEvent;
1274+
PendingSCCWrite = nullptr;
1275+
}
1276+
}
1277+
12311278
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
12321279
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
12331280
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
@@ -1917,6 +1964,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
19171964
Wait);
19181965
}
19191966
}
1967+
} else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
1968+
ScoreBrackets.tryClearSCCWriteEvent(&MI);
19201969
} else {
19211970
// FIXME: Should not be relying on memoperands.
19221971
// Look at the source operands of every instruction to see if
@@ -2006,6 +2055,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20062055
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
20072056
}
20082057
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
2058+
} else if (Op.getReg() == AMDGPU::SCC) {
2059+
ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
20092060
} else {
20102061
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
20112062
}
@@ -2343,6 +2394,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
23432394
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
23442395
else
23452396
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2397+
} else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
2398+
ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
23462399
} else {
23472400
switch (Inst.getOpcode()) {
23482401
case AMDGPU::S_SENDMSG:
@@ -2353,9 +2406,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
23532406
break;
23542407
case AMDGPU::S_MEMTIME:
23552408
case AMDGPU::S_MEMREALTIME:
2356-
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2357-
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2358-
case AMDGPU::S_BARRIER_LEAVE:
23592409
case AMDGPU::S_GET_BARRIER_STATE_M0:
23602410
case AMDGPU::S_GET_BARRIER_STATE_IMM:
23612411
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
@@ -2422,6 +2472,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
24222472
if (T == DS_CNT)
24232473
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
24242474

2475+
if (T == KM_CNT) {
2476+
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2477+
if (Other.hasPendingEvent(SCC_WRITE)) {
2478+
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2479+
if (!OldEventsHasSCCWrite) {
2480+
PendingSCCWrite = Other.PendingSCCWrite;
2481+
} else {
2482+
if (PendingSCCWrite != Other.PendingSCCWrite)
2483+
PendingSCCWrite = nullptr;
2484+
}
2485+
}
2486+
}
2487+
24252488
for (int J = 0; J <= VgprUB; J++)
24262489
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
24272490

@@ -2453,6 +2516,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
24532516
counterTypeForInstr(Opcode).has_value();
24542517
}
24552518

2519+
bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
2520+
return Opcode == AMDGPU::S_BARRIER_LEAVE ||
2521+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
2522+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
2523+
}
2524+
24562525
// Generate s_waitcnt instructions where needed.
24572526
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24582527
MachineBasicBlock &Block,

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ define i1 @func1() {
1212
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
1313
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
1414
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
15+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
1516
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
1617
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
1718
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
18-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
1919
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
2020
;
2121
; GFX12-GISEL-LABEL: func1:
@@ -27,13 +27,86 @@ define i1 @func1() {
2727
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
2828
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
2929
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
30+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
31+
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
32+
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
33+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
34+
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
35+
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
36+
ret i1 %r
37+
}
38+
39+
define i1 @signal_isfirst_same_barrier_wait() {
40+
; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
41+
; GFX12-SDAG: ; %bb.0:
42+
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
43+
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
44+
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
45+
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
46+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
47+
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
48+
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
49+
; GFX12-SDAG-NEXT: s_barrier_wait -1
50+
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
51+
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
52+
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
53+
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
54+
;
55+
; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
56+
; GFX12-GISEL: ; %bb.0:
57+
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
58+
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
59+
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
60+
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
61+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
62+
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
63+
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
64+
; GFX12-GISEL-NEXT: s_barrier_wait -1
3065
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
3166
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
3267
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
68+
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
69+
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
70+
call void @llvm.amdgcn.s.barrier.wait(i16 -1)
71+
ret i1 %r
72+
}
73+
74+
define i1 @signal_isfirst_different_barrier_wait() {
75+
; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
76+
; GFX12-SDAG: ; %bb.0:
77+
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
78+
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
79+
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
80+
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
81+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
82+
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
83+
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
84+
; GFX12-SDAG-NEXT: s_barrier_wait 0
85+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
86+
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
87+
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
88+
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
89+
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
90+
;
91+
; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
92+
; GFX12-GISEL: ; %bb.0:
93+
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
94+
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
95+
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
96+
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
97+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
98+
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
99+
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
100+
; GFX12-GISEL-NEXT: s_barrier_wait 0
33101
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
102+
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
103+
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
104+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
34105
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
35106
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
107+
call void @llvm.amdgcn.s.barrier.wait(i16 0)
36108
ret i1 %r
37109
}
38110

111+
declare void @llvm.amdgcn.s.barrier.wait(i16)
39112
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)

0 commit comments

Comments
 (0)