Skip to content

Conversation

cmc-rep
Copy link
Contributor

@cmc-rep cmc-rep commented Aug 19, 2025

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Aug 19, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Gang Chen (cmc-rep)

Changes

Patch is 20.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154409.diff

15 Files Affected:

  • (modified) llvm/include/llvm/Support/AMDHSAKernelDescriptor.h (+9-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+19)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+7)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp (+15)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h (+6)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (+8)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+12-2)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp (+5-4)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h (+10-12)
  • (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.cpp (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.h (+3)
  • (modified) llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll (+5)
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 8f367390c531c..4fb6fa656b6e7 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -186,6 +186,9 @@ enum : int32_t {
 // [GFX10-GFX11].
 #define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH)                      \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
+// [GFX10-GFX120].
+#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_ ## NAME, SHIFT, WIDTH)
 // GFX11+.
 #define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
@@ -195,6 +198,9 @@ enum : int32_t {
 // GFX12+.
 #define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH)                       \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
+// [GFX125].
+#define COMPUTE_PGM_RSRC3_GFX125(NAME, SHIFT, WIDTH)                           \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX125_##NAME, SHIFT, WIDTH)
 enum : int32_t {
   COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
   COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
@@ -206,7 +212,9 @@ enum : int32_t {
   COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
   COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
   COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
-  COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
+  COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 3),
+  COMPUTE_PGM_RSRC3_GFX125(NAMED_BAR_CNT, 14, 3),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 17, 14),
   COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
   COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 66c3fad6de1a1..69722bdf0a49b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -720,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                      IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
                      IsLocal),
+        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
+                     OutContext, IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                      OutContext, IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
@@ -807,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
           " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
     }
 
+    if (AMDGPU::isGFX1250(STM)) {
+      const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
+      const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
+          CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx);
+      const MCExpr *BarBlks =
+          MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
+      OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks),
+                                  false);
+    }
+
     OutStreamer->emitRawComment(
         " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
 
@@ -1011,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DynamicCallStack =
       MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                              GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
+  ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier);
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1253,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
   }
 
+  if (AMDGPU::isGFX1250(STM))
+    ProgInfo.ComputePGMRSrc3 =
+        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
+
   ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
       STM.computeOccupancy(F, ProgInfo.LDSSize).second,
       ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..6c0192d68690e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   const GlobalValue *GV = G->getGlobal();
 
   if (!MFI->isModuleEntryFunction()) {
+    auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
     if (std::optional<uint32_t> Address =
             AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+      if (IsNamedBarrier) {
+        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+        MFI->recordNumNamedBarriers(Address.value(), BarCnt);
+      }
       return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+    } else if (IsNamedBarrier) {
+      llvm_unreachable("named barrier should have an assigned address");
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 6390853d53b56..6b3cdf57f3ad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
     return GOCS(".num_agpr");
   case RIK_NumSGPR:
     return GOCS(".numbered_sgpr");
+  case RIK_NumNamedBarrier:
+    return GOCS(".num_named_barrier");
   case RIK_PrivateSegSize:
     return GOCS(".private_seg_size");
   case RIK_UsesVCC:
@@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
   MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
   MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
   MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+  MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
 
   auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) {
     const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
@@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
   assignMaxRegSym(MaxVGPRSym, MaxVGPR);
   assignMaxRegSym(MaxAGPRSym, MaxAGPR);
   assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+  assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier);
 }
 
 void MCResourceInfo::reset() { *this = MCResourceInfo(); }
@@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) {
+  return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier");
+}
+
 // Tries to flatten recursive call register resource gathering. Simple cycle
 // avoiding dfs to find the constants in the propagated symbols.
 // Assumes:
@@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr(
         case RIK_NumAGPR:
           ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
           break;
+        case RIK_NumNamedBarrier:
+          ArgExprs.push_back(MCSymbolRefExpr::create(
+              getMaxNamedBarrierSymbol(OutContext), OutContext));
+          break;
         }
       }
     }
@@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo(
   MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
   MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
   bool IsLocal = MF.getFunction().hasLocalLinkage();
+  MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
 
   if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
     addMaxVGPRCandidate(FRI.NumVGPR);
     addMaxAGPRCandidate(FRI.NumAGPR);
     addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+    addMaxNamedBarrierCandidate(FRI.NumNamedBarrier);
   }
 
   const TargetMachine &TM = MF.getTarget();
@@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo(
   SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
   SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
   SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+  SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier);
 
   {
     // The expression for private segment size should be: FRI.PrivateSegmentSize
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 297e93bb54e29..b6055166698b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -31,6 +31,7 @@ class MCResourceInfo {
     RIK_NumVGPR,
     RIK_NumAGPR,
     RIK_NumSGPR,
+    RIK_NumNamedBarrier,
     RIK_PrivateSegSize,
     RIK_UsesVCC,
     RIK_UsesFlatScratch,
@@ -43,6 +44,7 @@ class MCResourceInfo {
   int32_t MaxVGPR = 0;
   int32_t MaxAGPR = 0;
   int32_t MaxSGPR = 0;
+  int32_t MaxNamedBarrier = 0;
 
   // Whether the MCResourceInfo has been finalized through finalize(MCContext
   // &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR
@@ -75,6 +77,9 @@ class MCResourceInfo {
   void addMaxSGPRCandidate(int32_t candidate) {
     MaxSGPR = std::max(MaxSGPR, candidate);
   }
+  void addMaxNamedBarrierCandidate(int32_t candidate) {
+    MaxNamedBarrier = std::max(MaxNamedBarrier, candidate);
+  }
 
   MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
                       MCContext &OutContext, bool IsLocal);
@@ -90,6 +95,7 @@ class MCResourceInfo {
   MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
   MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
   MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
+  MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext);
 
   /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
   /// granularity. However, some resource info has to be assigned the call
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c82caced3c9b..664a15ca55f53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
       if (!BarAddr)
         llvm_unreachable("named barrier should have an assigned address");
       Entry.first->second = BarAddr.value();
+      unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
+      recordNumNamedBarriers(BarAddr.value(), BarCnt);
       return BarAddr.value();
     }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34..fc64e16ffbeb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -49,6 +49,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Flag to check dynamic LDS usage by kernel.
   bool UsesDynamicLDS = false;
 
+  uint32_t NumNamedBarriers = 0;
+
   // Kernels + shaders. i.e. functions called by the hardware and not called
   // by other functions.
   bool IsEntryFunction = false;
@@ -86,6 +88,12 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return GDSSize;
   }
 
+  void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
+    NumNamedBarriers =
+        std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+  }
+  uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
+
   bool isEntryFunction() const {
     return IsEntryFunction;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ccd2de18979d1..0ea9add891111 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
                          MRI.isLiveIn(MFI->getPreloadedReg(
                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
 
+  Info.NumNamedBarrier = MFI->getNumNamedBarriers();
+
   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
   // instructions aren't used to access the scratch buffer. Inline assembly may
   // need it though.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index acfff960d3f58..9ae3bb3ca048e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -35,6 +35,7 @@ struct AMDGPUResourceUsageAnalysisImpl {
     int32_t NumVGPR = 0;
     int32_t NumAGPR = 0;
     int32_t NumExplicitSGPR = 0;
+    int32_t NumNamedBarrier = 0;
     uint64_t CalleeSegmentSize = 0;
     uint64_t PrivateSegmentSize = 0;
     bool UsesVCC = false;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634e62272..070de008d4f59 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
                                    "must be zero on gfx10 or gfx11");
     }
 
-    // Bits [14-30].
-    CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
+    // Bits [14-16]
+    if (isGFX1250()) {
+      PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
+                      COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
+    } else {
+      CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4,
+                                   "COMPUTE_PGM_RSRC3",
+                                   "must be zero on gfx10+");
+    }
+
+    // Bits [17-30].
+    CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5,
                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
 
     // Bits [31].
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e20581d76fcde..197de1228a29e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -276,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
 
 void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
     const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-    const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
-    const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
-    const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
-    const MCSymbol *HasIndirectCall) {
+    const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+    const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+    const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+    const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {
 #define PRINT_RES_INFO(ARG)                                                    \
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
@@ -290,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
   PRINT_RES_INFO(NumVGPR);
   PRINT_RES_INFO(NumAGPR);
   PRINT_RES_INFO(NumExplicitSGPR);
+  PRINT_RES_INFO(NumNamedBarrier);
   PRINT_RES_INFO(PrivateSegmentSize);
   PRINT_RES_INFO(UsesVCC);
   PRINT_RES_INFO(UsesFlatScratch);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9c49020850584..22afcdebcdf09 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -62,10 +62,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
 
   virtual void EmitMCResourceInfo(
       const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-      const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
-      const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
-      const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
-      const MCSymbol *HasIndirectCall) {};
+      const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+      const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+      const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+      const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {};
 
   virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
                                       const MCSymbol *MaxAGPR,
@@ -141,14 +141,12 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
 
   void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
 
-  void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-                          const MCSymbol *NumExplicitSGPR,
-                          const MCSymbol *PrivateSegmentSize,
-                          const MCSymbol *UsesVCC,
-                          const MCSymbol *UsesFlatScratch,
-                          const MCSymbol *HasDynamicallySizedStack,
-                          const MCSymbol *HasRecursion,
-                          const MCSymbol *HasIndirectCall) override;
+  void EmitMCResourceInfo(
+      const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+      const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+      const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+      const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+      const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
 
   void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
                               const MCSymbol *MaxSGPR) override;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 5940f45e74bf2..93ba0a337d7dd 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
 
   NumSGPRsForWavesPerEU = ZeroExpr;
   NumVGPRsForWavesPerEU = ZeroExpr;
+  NamedBarCnt = ZeroExpr;
   Occupancy = ZeroExpr;
   DynamicCallStack = ZeroExpr;
   VCCUsed = ZeroExpr;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 79099d2182cc8..171c4a313a53b 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
   // Number of VGPRs that meets number of waves per execution unit request.
   const MCExpr *NumVGPRsForWavesPerEU = nullptr;
 
+  // Number of named barriers used by the kernel.
+  const MCExpr *NamedBarCnt = nullptr;
+
   // Final occupancy.
   const MCExpr *Occupancy = nullptr;
 
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 7cf8883082458..07189ee46e3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s
 
 @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
 @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@@ -9,6 +10,7 @@
 ; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
 ; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
 
+; SOUT:        .set func1.num_named_barrier, 3
 define void @func1() {
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
     call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
@@ -16,6 +18,7 @@ define void @func1() {
     ret void
 ...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Aug 19, 2025

@llvm/pr-subscribers-llvm-support

Author: Gang Chen (cmc-rep)

Changes

Patch is 20.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154409.diff

15 Files Affected:

  • (modified) llvm/include/llvm/Support/AMDHSAKernelDescriptor.h (+9-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+19)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+7)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp (+15)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h (+6)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (+8)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+12-2)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp (+5-4)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h (+10-12)
  • (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.cpp (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.h (+3)
  • (modified) llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll (+5)
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 8f367390c531c..4fb6fa656b6e7 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -186,6 +186,9 @@ enum : int32_t {
 // [GFX10-GFX11].
 #define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH)                      \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
+// [GFX10-GFX120].
+#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_ ## NAME, SHIFT, WIDTH)
 // GFX11+.
 #define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
@@ -195,6 +198,9 @@ enum : int32_t {
 // GFX12+.
 #define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH)                       \
   AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
+// [GFX125].
+#define COMPUTE_PGM_RSRC3_GFX125(NAME, SHIFT, WIDTH)                           \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX125_##NAME, SHIFT, WIDTH)
 enum : int32_t {
   COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
   COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
@@ -206,7 +212,9 @@ enum : int32_t {
   COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
   COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
   COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
-  COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
+  COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 3),
+  COMPUTE_PGM_RSRC3_GFX125(NAMED_BAR_CNT, 14, 3),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 17, 14),
   COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
   COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 66c3fad6de1a1..69722bdf0a49b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -720,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                      IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
                      IsLocal),
+        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
+                     OutContext, IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                      OutContext, IsLocal),
         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
@@ -807,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
           " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
     }
 
+    if (AMDGPU::isGFX1250(STM)) {
+      const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
+      const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
+          CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx);
+      const MCExpr *BarBlks =
+          MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
+      OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks),
+                                  false);
+    }
+
     OutStreamer->emitRawComment(
         " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
 
@@ -1011,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DynamicCallStack =
       MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                              GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
+  ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier);
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1253,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
   }
 
+  if (AMDGPU::isGFX1250(STM))
+    ProgInfo.ComputePGMRSrc3 =
+        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
+
   ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
       STM.computeOccupancy(F, ProgInfo.LDSSize).second,
       ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..6c0192d68690e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   const GlobalValue *GV = G->getGlobal();
 
   if (!MFI->isModuleEntryFunction()) {
+    auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
     if (std::optional<uint32_t> Address =
             AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+      if (IsNamedBarrier) {
+        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+        MFI->recordNumNamedBarriers(Address.value(), BarCnt);
+      }
       return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+    } else if (IsNamedBarrier) {
+      llvm_unreachable("named barrier should have an assigned address");
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 6390853d53b56..6b3cdf57f3ad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
     return GOCS(".num_agpr");
   case RIK_NumSGPR:
     return GOCS(".numbered_sgpr");
+  case RIK_NumNamedBarrier:
+    return GOCS(".num_named_barrier");
   case RIK_PrivateSegSize:
     return GOCS(".private_seg_size");
   case RIK_UsesVCC:
@@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
   MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
   MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
   MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+  MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
 
   auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) {
     const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
@@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
   assignMaxRegSym(MaxVGPRSym, MaxVGPR);
   assignMaxRegSym(MaxAGPRSym, MaxAGPR);
   assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+  assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier);
 }
 
 void MCResourceInfo::reset() { *this = MCResourceInfo(); }
@@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) {
+  return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier");
+}
+
 // Tries to flatten recursive call register resource gathering. Simple cycle
 // avoiding dfs to find the constants in the propagated symbols.
 // Assumes:
@@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr(
         case RIK_NumAGPR:
           ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
           break;
+        case RIK_NumNamedBarrier:
+          ArgExprs.push_back(MCSymbolRefExpr::create(
+              getMaxNamedBarrierSymbol(OutContext), OutContext));
+          break;
         }
       }
     }
@@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo(
   MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
   MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
   bool IsLocal = MF.getFunction().hasLocalLinkage();
+  MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
 
   if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
     addMaxVGPRCandidate(FRI.NumVGPR);
     addMaxAGPRCandidate(FRI.NumAGPR);
     addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+    addMaxNamedBarrierCandidate(FRI.NumNamedBarrier);
   }
 
   const TargetMachine &TM = MF.getTarget();
@@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo(
   SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
   SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
   SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+  SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier);
 
   {
     // The expression for private segment size should be: FRI.PrivateSegmentSize
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 297e93bb54e29..b6055166698b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -31,6 +31,7 @@ class MCResourceInfo {
     RIK_NumVGPR,
     RIK_NumAGPR,
     RIK_NumSGPR,
+    RIK_NumNamedBarrier,
     RIK_PrivateSegSize,
     RIK_UsesVCC,
     RIK_UsesFlatScratch,
@@ -43,6 +44,7 @@ class MCResourceInfo {
   int32_t MaxVGPR = 0;
   int32_t MaxAGPR = 0;
   int32_t MaxSGPR = 0;
+  int32_t MaxNamedBarrier = 0;
 
   // Whether the MCResourceInfo has been finalized through finalize(MCContext
   // &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR
@@ -75,6 +77,9 @@ class MCResourceInfo {
   void addMaxSGPRCandidate(int32_t candidate) {
     MaxSGPR = std::max(MaxSGPR, candidate);
   }
+  void addMaxNamedBarrierCandidate(int32_t candidate) {
+    MaxNamedBarrier = std::max(MaxNamedBarrier, candidate);
+  }
 
   MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
                       MCContext &OutContext, bool IsLocal);
@@ -90,6 +95,7 @@ class MCResourceInfo {
   MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
   MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
   MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
+  MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext);
 
   /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
   /// granularity. However, some resource info has to be assigned the call
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c82caced3c9b..664a15ca55f53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
       if (!BarAddr)
         llvm_unreachable("named barrier should have an assigned address");
       Entry.first->second = BarAddr.value();
+      unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
+      recordNumNamedBarriers(BarAddr.value(), BarCnt);
       return BarAddr.value();
     }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34..fc64e16ffbeb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -49,6 +49,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Flag to check dynamic LDS usage by kernel.
   bool UsesDynamicLDS = false;
 
+  uint32_t NumNamedBarriers = 0;
+
   // Kernels + shaders. i.e. functions called by the hardware and not called
   // by other functions.
   bool IsEntryFunction = false;
@@ -86,6 +88,12 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return GDSSize;
   }
 
+  void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
+    NumNamedBarriers =
+        std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+  }
+  uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
+
   bool isEntryFunction() const {
     return IsEntryFunction;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ccd2de18979d1..0ea9add891111 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
                          MRI.isLiveIn(MFI->getPreloadedReg(
                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
 
+  Info.NumNamedBarrier = MFI->getNumNamedBarriers();
+
   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
   // instructions aren't used to access the scratch buffer. Inline assembly may
   // need it though.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index acfff960d3f58..9ae3bb3ca048e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -35,6 +35,7 @@ struct AMDGPUResourceUsageAnalysisImpl {
     int32_t NumVGPR = 0;
     int32_t NumAGPR = 0;
     int32_t NumExplicitSGPR = 0;
+    int32_t NumNamedBarrier = 0;
     uint64_t CalleeSegmentSize = 0;
     uint64_t PrivateSegmentSize = 0;
     bool UsesVCC = false;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634e62272..070de008d4f59 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
                                    "must be zero on gfx10 or gfx11");
     }
 
-    // Bits [14-30].
-    CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
+    // Bits [14-16]
+    if (isGFX1250()) {
+      PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
+                      COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
+    } else {
+      CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4,
+                                   "COMPUTE_PGM_RSRC3",
+                                   "must be zero on gfx10+");
+    }
+
+    // Bits [17-30].
+    CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5,
                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
 
     // Bits [31].
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e20581d76fcde..197de1228a29e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -276,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
 
 void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
     const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-    const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
-    const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
-    const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
-    const MCSymbol *HasIndirectCall) {
+    const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+    const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+    const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+    const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {
 #define PRINT_RES_INFO(ARG)                                                    \
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
@@ -290,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
   PRINT_RES_INFO(NumVGPR);
   PRINT_RES_INFO(NumAGPR);
   PRINT_RES_INFO(NumExplicitSGPR);
+  PRINT_RES_INFO(NumNamedBarrier);
   PRINT_RES_INFO(PrivateSegmentSize);
   PRINT_RES_INFO(UsesVCC);
   PRINT_RES_INFO(UsesFlatScratch);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9c49020850584..22afcdebcdf09 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -62,10 +62,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
 
   virtual void EmitMCResourceInfo(
       const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-      const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
-      const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
-      const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
-      const MCSymbol *HasIndirectCall) {};
+      const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+      const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+      const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+      const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {};
 
   virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
                                       const MCSymbol *MaxAGPR,
@@ -141,14 +141,12 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
 
   void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
 
-  void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
-                          const MCSymbol *NumExplicitSGPR,
-                          const MCSymbol *PrivateSegmentSize,
-                          const MCSymbol *UsesVCC,
-                          const MCSymbol *UsesFlatScratch,
-                          const MCSymbol *HasDynamicallySizedStack,
-                          const MCSymbol *HasRecursion,
-                          const MCSymbol *HasIndirectCall) override;
+  void EmitMCResourceInfo(
+      const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+      const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+      const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+      const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+      const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
 
   void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
                               const MCSymbol *MaxSGPR) override;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 5940f45e74bf2..93ba0a337d7dd 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
 
   NumSGPRsForWavesPerEU = ZeroExpr;
   NumVGPRsForWavesPerEU = ZeroExpr;
+  NamedBarCnt = ZeroExpr;
   Occupancy = ZeroExpr;
   DynamicCallStack = ZeroExpr;
   VCCUsed = ZeroExpr;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 79099d2182cc8..171c4a313a53b 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
   // Number of VGPRs that meets number of waves per execution unit request.
   const MCExpr *NumVGPRsForWavesPerEU = nullptr;
 
+  // Number of named barriers used by the kernel.
+  const MCExpr *NamedBarCnt = nullptr;
+
   // Final occupancy.
   const MCExpr *Occupancy = nullptr;
 
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 7cf8883082458..07189ee46e3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s
 
 @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
 @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@@ -9,6 +10,7 @@
 ; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
 ; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
 
+; SOUT:        .set func1.num_named_barrier, 3
 define void @func1() {
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
     call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
@@ -16,6 +18,7 @@ define void @func1() {
     ret void
 ...
[truncated]

@cmc-rep
Copy link
Contributor Author

cmc-rep commented Aug 19, 2025

I am upstreaming these commits in their original order, so this is the first one.

@cmc-rep cmc-rep requested a review from rampitec August 19, 2025 19:17
Copy link

github-actions bot commented Aug 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix the formatting downstream first, though, to avoid a merge conflict.

@cmc-rep cmc-rep merged commit ef68d15 into llvm:main Aug 19, 2025
9 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants