@@ -121,6 +121,7 @@ struct HardwareLimits {
   DECL(LDS_ACCESS)    /* lds read & write */ \
   DECL(GDS_ACCESS)    /* gds read & write */ \
   DECL(SQ_MESSAGE)    /* send message */ \
+  DECL(SCC_WRITE)     /* write to SCC from barrier */ \
   DECL(SMEM_ACCESS)   /* scalar-memory read & write */ \
   DECL(SMEM_GROUP)    /* scalar-memory group */ \
   DECL(EXP_GPR_LOCK)  /* export holding on its data src */ \
@@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
 // We reserve a fixed number of VGPR slots in the scoring tables for
 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
 enum RegisterMapping {
@@ -163,6 +165,9 @@ enum RegisterMapping {
   FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
   NUM_LDS_VGPRS = 9,                 // One more than the stores we track.
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
+  NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
+  // Remaining non-allocatable registers
+  SCC = NUM_ALL_ALLOCATABLE
 };
 
 // Enumerate different types of result-returning VMEM operations. Although
@@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         eventMask({VMEM_SAMPLER_READ_ACCESS}),
         eventMask({VMEM_BVH_READ_ACCESS}),
-        eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+        eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
         eventMask({VMEM_GROUP, SMEM_GROUP})};
 
     return WaitEventMaskForInstGFX12Plus;
@@ -586,6 +591,7 @@ class SIInsertWaitcnts {
                             WaitcntBrackets &ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
+  static bool asynchronouslyWritesSCC(unsigned Opcode);
 };
 
 // This objects maintains the current score brackets of each wait counter, and
@@ -626,7 +632,12 @@ class WaitcntBrackets {
   unsigned getRegScore(int GprNo, InstCounterType T) const {
     if (GprNo < NUM_ALL_VGPRS)
       return VgprScores[T][GprNo];
-    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+    if (GprNo < NUM_ALL_ALLOCATABLE)
+      return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+    assert(GprNo == SCC);
+    return SCCScore;
   }
 
   bool merge(const WaitcntBrackets &Other);
@@ -646,6 +657,7 @@ class WaitcntBrackets {
                      AMDGPU::Waitcnt &Wait) const {
     determineWait(T, {RegNo, RegNo + 1}, Wait);
   }
+  void tryClearSCCWriteEvent(MachineInstr *Inst);
 
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -785,6 +797,10 @@ class WaitcntBrackets {
   // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
   // X_CNT score.
   unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+  // Reg score for SCC.
+  unsigned SCCScore = 0;
+  // The unique instruction that has an SCC write pending, if there is one.
+  const MachineInstr *PendingSCCWrite = nullptr;
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                             const MachineRegisterInfo *MRI,
                                             const SIRegisterInfo *TRI,
                                             const MachineOperand &Op) const {
+  if (Op.getReg() == AMDGPU::SCC)
+    return {SCC, SCC + 1};
+
   if (!TRI->isInAllocatableClass(Op.getReg()))
     return {-1, -1};
 
@@ -873,9 +892,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
     if (RegNo < NUM_ALL_VGPRS) {
       VgprUB = std::max(VgprUB, RegNo);
       VgprScores[CntTy][RegNo] = Score;
-    } else {
+    } else if (RegNo < NUM_ALL_ALLOCATABLE) {
       SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
       SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
+    } else {
+      assert(RegNo == SCC);
+      SCCScore = Score;
     }
   }
 }
@@ -1086,6 +1108,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       if (Slot)
         setRegScore(FIRST_LDS_VGPR, T, CurrScore);
     }
+
+    if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+      setRegScore(SCC, T, CurrScore);
+      PendingSCCWrite = &Inst;
+    }
   }
 }
 
@@ -1154,6 +1181,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
           OS << RelScore << ":s" << J << " ";
         }
       }
+      if (T == KM_CNT && SCCScore > 0)
+        OS << SCCScore << ":scc ";
     }
     OS << '\n';
   }
@@ -1228,6 +1257,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
   }
 }
 
+void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
+  // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
+  // SCC has landed
+  if (PendingSCCWrite &&
+      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
+      PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
+    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+    // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
+    if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
+        SCC_WRITE_PendingEvent) {
+      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
+    }
+
+    PendingEvents &= ~SCC_WRITE_PendingEvent;
+    PendingSCCWrite = nullptr;
+  }
+}
+
 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
   applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
@@ -1917,6 +1964,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                    Wait);
       }
     }
+  } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+    ScoreBrackets.tryClearSCCWriteEvent(&MI);
   } else {
     // FIXME: Should not be relying on memoperands.
     // Look at the source operands of every instruction to see if
@@ -2006,6 +2055,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
           }
           ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
+        } else if (Op.getReg() == AMDGPU::SCC) {
+          ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
         } else {
           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
         }
@@ -2343,6 +2394,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
     else
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+  } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:
@@ -2353,9 +2406,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       break;
     case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
-    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
-    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
-    case AMDGPU::S_BARRIER_LEAVE:
     case AMDGPU::S_GET_BARRIER_STATE_M0:
     case AMDGPU::S_GET_BARRIER_STATE_IMM:
       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
@@ -2422,6 +2472,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     if (T == DS_CNT)
       StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
 
+    if (T == KM_CNT) {
+      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
+      if (Other.hasPendingEvent(SCC_WRITE)) {
+        unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
+        if (!OldEventsHasSCCWrite) {
+          PendingSCCWrite = Other.PendingSCCWrite;
+        } else {
+          if (PendingSCCWrite != Other.PendingSCCWrite)
+            PendingSCCWrite = nullptr;
+        }
+      }
+    }
+
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
@@ -2453,6 +2516,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
          counterTypeForInstr(Opcode).has_value();
 }
 
+bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
+  return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+}
+
 // Generate s_waitcnt instructions where needed.
 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                             MachineBasicBlock &Block,
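
As a rough mental model of what the diff above does, here is a minimal standalone C++ sketch, assuming a simplified scoreboard that tracks only a KM_CNT-style counter and a single SCC slot past the allocatable registers. The names (`Brackets`, `recordSccWrite`, `tryClearSccWrite`, `neededKmCntForSccRead`) are illustrative stand-ins, not the pass's real API; the real pass scores per counter type and per register interval.

```cpp
// Sketch only: an asynchronous SCC writer (e.g. a barrier-signal-isfirst)
// raises the pending-event upper bound; a read of SCC would then need the
// counter drained, unless a wait on the same barrier already guarantees the
// write has landed, in which case the pending event can simply be cleared.
#include <cassert>
#include <cstdio>
#include <optional>

namespace sketch {

struct Brackets {
  unsigned ScoreLB = 0, ScoreUB = 0;      // KM_CNT brackets only, for brevity
  unsigned SccScore = 0;                  // score of the last SCC write
  std::optional<int> PendingSccBarrierId; // barrier id of the pending write

  // Record an asynchronous SCC write belonging to a given barrier.
  void recordSccWrite(int BarrierId) {
    ScoreUB += 1;
    SccScore = ScoreUB;
    PendingSccBarrierId = BarrierId;
  }

  // A wait on the same barrier means the SCC write has landed, so drop the
  // pending event without requiring an explicit counter wait.
  void tryClearSccWrite(int BarrierId) {
    if (PendingSccBarrierId && *PendingSccBarrierId == BarrierId) {
      ScoreLB = ScoreUB;
      PendingSccBarrierId.reset();
    }
  }

  // How far KM_CNT must drain before SCC may be read; nullopt means no wait.
  std::optional<unsigned> neededKmCntForSccRead() const {
    if (SccScore <= ScoreLB)
      return std::nullopt;
    return ScoreUB - SccScore;
  }
};

} // namespace sketch

int main() {
  sketch::Brackets B;
  B.recordSccWrite(/*BarrierId=*/3);

  // Reading SCC now would require waiting the counter down to zero.
  assert(B.neededKmCntForSccRead().value() == 0);

  // After a wait on the same barrier, no counter wait is needed any more.
  B.tryClearSccWrite(/*BarrierId=*/3);
  assert(!B.neededKmCntForSccRead().has_value());

  std::puts("scc scoreboard sketch ok");
  return 0;
}
```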