Skip to content

Commit ef68d15

Browse files
authored
[AMDGPU] upstream barrier count reporting part1 (#154409)
1 parent f94290c commit ef68d15

16 files changed

+108
-23
lines changed

llvm/include/llvm/Support/AMDHSAKernelDescriptor.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,15 +186,21 @@ enum : int32_t {
186186
// [GFX10-GFX11].
187187
#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \
188188
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
189+
// [GFX10-GFX120].
190+
#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
191+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_##NAME, SHIFT, WIDTH)
189192
// GFX11+.
190-
#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
191-
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
193+
#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
194+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_##NAME, SHIFT, WIDTH)
192195
// [GFX11].
193196
#define COMPUTE_PGM_RSRC3_GFX11(NAME, SHIFT, WIDTH) \
194197
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_##NAME, SHIFT, WIDTH)
195198
// GFX12+.
196199
#define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH) \
197200
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
201+
// [GFX125].
202+
#define COMPUTE_PGM_RSRC3_GFX125(NAME, SHIFT, WIDTH) \
203+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX125_##NAME, SHIFT, WIDTH)
198204
enum : int32_t {
199205
COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
200206
COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
@@ -206,7 +212,9 @@ enum : int32_t {
206212
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
207213
COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
208214
COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
209-
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
215+
COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 3),
216+
COMPUTE_PGM_RSRC3_GFX125(NAMED_BAR_CNT, 14, 3),
217+
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 17, 14),
210218
COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
211219
COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
212220
};

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
720720
IsLocal),
721721
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
722722
IsLocal),
723+
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
724+
OutContext, IsLocal),
723725
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
724726
OutContext, IsLocal),
725727
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
@@ -807,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
807809
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
808810
}
809811

812+
if (AMDGPU::isGFX1250(STM)) {
813+
const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
814+
const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
815+
CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx);
816+
const MCExpr *BarBlks =
817+
MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
818+
OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks),
819+
false);
820+
}
821+
810822
OutStreamer->emitRawComment(
811823
" Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
812824

@@ -1011,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
10111023
ProgInfo.DynamicCallStack =
10121024
MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
10131025
GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1026+
ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier);
10141027

10151028
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10161029

@@ -1253,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12531266
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
12541267
}
12551268

1269+
if (AMDGPU::isGFX1250(STM))
1270+
ProgInfo.ComputePGMRSrc3 =
1271+
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1272+
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1273+
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1274+
12561275
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
12571276
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
12581277
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
15121512
const GlobalValue *GV = G->getGlobal();
15131513

15141514
if (!MFI->isModuleEntryFunction()) {
1515+
auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
15151516
if (std::optional<uint32_t> Address =
15161517
AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1518+
if (IsNamedBarrier) {
1519+
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1520+
MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1521+
}
15171522
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1523+
} else if (IsNamedBarrier) {
1524+
llvm_unreachable("named barrier should have an assigned address");
15181525
}
15191526
}
15201527

llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
3939
return GOCS(".num_agpr");
4040
case RIK_NumSGPR:
4141
return GOCS(".numbered_sgpr");
42+
case RIK_NumNamedBarrier:
43+
return GOCS(".num_named_barrier");
4244
case RIK_PrivateSegSize:
4345
return GOCS(".private_seg_size");
4446
case RIK_UsesVCC:
@@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
6668
MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
6769
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
6870
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
71+
MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
6972

7073
auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) {
7174
const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
@@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
7578
assignMaxRegSym(MaxVGPRSym, MaxVGPR);
7679
assignMaxRegSym(MaxAGPRSym, MaxAGPR);
7780
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
81+
assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier);
7882
}
7983

8084
void MCResourceInfo::reset() { *this = MCResourceInfo(); }
@@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
97101
return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
98102
}
99103

104+
// Returns the symbol named "amdgpu.max_num_named_barrier" from OutContext,
// obtained through getOrCreateSymbol so repeated calls resolve the same name.
MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) {
105+
return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier");
106+
}
107+
100108
// Tries to flatten recursive call register resource gathering. Simple cycle
101109
// avoiding dfs to find the constants in the propagated symbols.
102110
// Assumes:
@@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr(
227235
case RIK_NumAGPR:
228236
ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
229237
break;
238+
case RIK_NumNamedBarrier:
239+
ArgExprs.push_back(MCSymbolRefExpr::create(
240+
getMaxNamedBarrierSymbol(OutContext), OutContext));
241+
break;
230242
}
231243
}
232244
}
@@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo(
245257
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
246258
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
247259
bool IsLocal = MF.getFunction().hasLocalLinkage();
260+
MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
248261

249262
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
250263
addMaxVGPRCandidate(FRI.NumVGPR);
251264
addMaxAGPRCandidate(FRI.NumAGPR);
252265
addMaxSGPRCandidate(FRI.NumExplicitSGPR);
266+
addMaxNamedBarrierCandidate(FRI.NumNamedBarrier);
253267
}
254268

255269
const TargetMachine &TM = MF.getTarget();
@@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo(
288302
SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
289303
SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
290304
SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
305+
SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier);
291306

292307
{
293308
// The expression for private segment size should be: FRI.PrivateSegmentSize

llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class MCResourceInfo {
3131
RIK_NumVGPR,
3232
RIK_NumAGPR,
3333
RIK_NumSGPR,
34+
RIK_NumNamedBarrier,
3435
RIK_PrivateSegSize,
3536
RIK_UsesVCC,
3637
RIK_UsesFlatScratch,
@@ -43,6 +44,7 @@ class MCResourceInfo {
4344
int32_t MaxVGPR = 0;
4445
int32_t MaxAGPR = 0;
4546
int32_t MaxSGPR = 0;
47+
int32_t MaxNamedBarrier = 0;
4648

4749
// Whether the MCResourceInfo has been finalized through finalize(MCContext
4850
// &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR
@@ -75,6 +77,9 @@ class MCResourceInfo {
7577
void addMaxSGPRCandidate(int32_t candidate) {
7678
MaxSGPR = std::max(MaxSGPR, candidate);
7779
}
80+
// Raises MaxNamedBarrier to `candidate` when `candidate` is larger; otherwise
// leaves the running maximum unchanged.
void addMaxNamedBarrierCandidate(int32_t candidate) {
81+
MaxNamedBarrier = std::max(MaxNamedBarrier, candidate);
82+
}
7883

7984
MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
8085
MCContext &OutContext, bool IsLocal);
@@ -90,6 +95,7 @@ class MCResourceInfo {
9095
MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
9196
MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
9297
MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
98+
MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext);
9399

94100
/// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
95101
/// granularity. However, some resource info has to be assigned the call

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
107107
if (!BarAddr)
108108
llvm_unreachable("named barrier should have an assigned address");
109109
Entry.first->second = BarAddr.value();
110+
unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
111+
recordNumNamedBarriers(BarAddr.value(), BarCnt);
110112
return BarAddr.value();
111113
}
112114

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
4949
// Flag to check dynamic LDS usage by kernel.
5050
bool UsesDynamicLDS = false;
5151

52+
uint32_t NumNamedBarriers = 0;
53+
5254
// Kernels + shaders. i.e. functions called by the hardware and not called
5355
// by other functions.
5456
bool IsEntryFunction = false;
@@ -86,6 +88,12 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
8688
return GDSSize;
8789
}
8890

91+
// Folds one named-barrier global into the running NumNamedBarriers maximum.
// The candidate value is bits [8:4] of GVAddr plus BarCnt - 1 (presumably the
// highest barrier ID the global occupies -- TODO confirm against the address
// encoding).
// NOTE(review): the arithmetic is unsigned, so BarCnt == 0 combined with a zero
// ID field would wrap around; verify callers always pass BarCnt >= 1.
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
92+
NumNamedBarriers =
93+
std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
94+
}
95+
// Returns the largest value recorded so far via recordNumNamedBarriers (0 if
// nothing has been recorded).
uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
96+
8997
bool isEntryFunction() const {
9098
return IsEntryFunction;
9199
}

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
142142
MRI.isLiveIn(MFI->getPreloadedReg(
143143
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
144144

145+
Info.NumNamedBarrier = MFI->getNumNamedBarriers();
146+
145147
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
146148
// instructions aren't used to access the scratch buffer. Inline assembly may
147149
// need it though.

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ struct AMDGPUResourceUsageAnalysisImpl {
3535
int32_t NumVGPR = 0;
3636
int32_t NumAGPR = 0;
3737
int32_t NumExplicitSGPR = 0;
38+
int32_t NumNamedBarrier = 0;
3839
uint64_t CalleeSegmentSize = 0;
3940
uint64_t PrivateSegmentSize = 0;
4041
bool UsesVCC = false;

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
24222422
"must be zero on gfx10 or gfx11");
24232423
}
24242424

2425-
// Bits [14-30].
2426-
CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
2425+
// Bits [14-16].
2426+
if (isGFX1250()) {
2427+
PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
2428+
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
2429+
} else {
2430+
CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4,
2431+
"COMPUTE_PGM_RSRC3",
2432+
"must be zero on gfx10+");
2433+
}
2434+
2435+
// Bits [17-30].
2436+
CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5,
24272437
"COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
24282438

24292439
// Bits [31].

0 commit comments

Comments
 (0)