Skip to content

[AMDGPU] Split vgpr regalloc pipeline #93526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ class MachineRegisterInfo {
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
}

/// Read-only access to the machine function: dereferences the stored MF
/// pointer and returns the result by const reference.
const MachineFunction &getMF() const {
  return *MF;
}

//===--------------------------------------------------------------------===//
// Function State
//===--------------------------------------------------------------------===//
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass();
FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
Expand Down Expand Up @@ -154,6 +155,9 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};

void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsID;

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;

Expand Down
96 changes: 96 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass should be invoked at the end of the wwm-regalloc pipeline.
/// It identifies the WWM regs allocated during this pipeline and adds
/// them to the list of reserved registers so that they won't be available for
/// per-thread VGPR allocation in the subsequent regalloc pipeline.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"

namespace {

class AMDGPUReserveWWMRegs : public MachineFunctionPass {
public:
static char ID;

AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "AMDGPU Reserve WWM Registers";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // End anonymous namespace.

// Register the pass with the legacy pass registry. DEBUG_TYPE
// ("amdgpu-reserve-wwm-regs") is passed as the pass argument string and
// "AMDGPU Reserve WWM Registers" as its descriptive name.
// NOTE(review): the trailing false/false are presumably the cfg-only and
// is-analysis flags of INITIALIZE_PASS -- confirm against PassSupport.h.
INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
"AMDGPU Reserve WWM Registers", false, false)

char AMDGPUReserveWWMRegs::ID = 0;

char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;

/// Scans every instruction of \p MF for SI_SPILL_S32_TO_VGPR and
/// SI_RESTORE_S32_FROM_VGPR, reserves the register found in their operand 0
/// (spill) or operand 1 (restore) as a WWM register, clears the renamable flag
/// on all operands of the WWM reserved registers, and finally clears the
/// non-WWM regalloc mask.
///
/// \returns true if at least one spill/restore instruction was found and its
/// register reserved.
/// NOTE(review): the return value does not account for the renamable-flag
/// clearing or the mask reset below -- confirm that is intended.
bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  bool ReservedAny = false;

  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      // Pick the operand holding the register of interest; anything that is
      // not one of the two SGPR<->VGPR lane pseudos is skipped.
      unsigned RegOpIdx = 0;
      switch (Inst.getOpcode()) {
      case AMDGPU::SI_SPILL_S32_TO_VGPR:
        RegOpIdx = 0;
        break;
      case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
        RegOpIdx = 1;
        break;
      default:
        continue;
      }

      Register WWMReg = Inst.getOperand(RegOpIdx).getReg();
      assert(WWMReg.isPhysical() &&
             "All WWM registers should have been allocated by now.");

      FuncInfo->reserveWWMRegister(WWMReg);
      ReservedAny = true;
    }
  }

  // Reserved registers must not carry the renamable flag. The wwm-regs become
  // reserved for the vgpr-regalloc pipeline, so drop the flag from every
  // operand that refers to one of them.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register WWMReg : FuncInfo->getWWMReservedRegs())
    for (MachineOperand &Operand : MRI.reg_operands(WWMReg))
      Operand.setIsRenamable(false);

  // The NonWWMRegMask set earlier during wwm-regalloc is no longer needed.
  FuncInfo->clearNonWWMRegAllocMask();

  return ReservedAny;
}
93 changes: 89 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
: RegisterRegAllocBase(N, D, C) {}
};

/// Registry node type for allocators that can be selected for WWM registers.
/// The constructor forwards a name, a description and a pass factory to
/// RegisterRegAllocBase.
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  /// \param N allocator name (e.g. "basic").
  /// \param D human-readable description.
  /// \param C factory that creates the allocator pass.
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase<WWMRegisterRegAlloc>(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
Expand All @@ -122,13 +128,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const SIMachineFunctionInfo *MFI =
MRI.getMF().getInfo<SIMachineFunctionInfo>();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// Placeholder factory for the -{sgpr|wwm|vgpr}-regalloc=... command line
/// options. It never builds a pass; it serves as the cl::init value of the
/// options and as the sentinel callers compare against to detect that no
/// allocator was explicitly selected.
static FunctionPass *useDefaultRegisterAllocator() {
  return nullptr;
}

/// One-time initialization guards, one per register allocation stage (SGPR,
/// VGPR, WWM). The WWM flag is handed to llvm::call_once in
/// createWWMRegAllocPass; the SGPR and VGPR flags are presumably used the same
/// way by their counterparts (not visible here -- confirm).
/// Related note kept from the original comment: a dummy default pass factory
/// (useDefaultRegisterAllocator) indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
Expand All @@ -145,6 +162,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));

/// Hidden -wwm-regalloc command line option holding the allocator factory to
/// use for WWM registers. Its initial value is useDefaultRegisterAllocator,
/// the sentinel that createWWMRegAllocPass checks for before falling back to
/// the greedy or fast allocator.
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<WWMRegisterRegAlloc>>
WWMRegAlloc("wwm-regalloc", cl::Hidden,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
Expand All @@ -164,6 +186,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
}
}

/// Installs the value of the -wwm-regalloc option as the WWMRegisterRegAlloc
/// default, unless a default is already set. Invoked through llvm::call_once
/// from createWWMRegAllocPass.
static void initializeDefaultWWMRegisterAllocatorOnce() {
  // A non-null default is left untouched.
  if (WWMRegisterRegAlloc::getDefault())
    return;

  // No default yet: take the constructor selected by -wwm-regalloc. (The
  // former local copy of this value was never read and has been dropped.)
  WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
}

/// Factory: basic register allocator filtered by onlyAllocateSGPRs.
static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
Expand All @@ -188,6 +219,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

/// Factory: basic register allocator filtered by onlyAllocateWWMRegs.
static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

/// Factory: greedy register allocator filtered by onlyAllocateWWMRegs.
static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

/// Factory: fast register allocator filtered by onlyAllocateWWMRegs. Unlike
/// the VGPR variant, the second argument passed here is false.
static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs,
                                     /*ClearVirtRegs=*/false);
}

// Registry entry named "basic" whose factory is
// createBasicSGPRRegisterAllocator; presumably selectable via -sgpr-regalloc.
static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
Expand All @@ -204,6 +247,14 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(

// Registry entry named "fast" whose factory is
// createFastVGPRRegisterAllocator; presumably selectable via -vgpr-regalloc.
static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
// Registry entries for the WWM stage: "basic", "greedy" and "fast", each bound
// to the matching create*WWMRegisterAllocator factory. These are the names the
// -wwm-regalloc option (parsed with RegisterPassParser<WWMRegisterRegAlloc>)
// is presumably able to select.
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
"basic register allocator",
createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
greedyRegAllocWWMReg("greedy", "greedy register allocator",
createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
createFastWWMRegisterAllocator);
} // anonymous namespace

static cl::opt<bool>
Expand Down Expand Up @@ -440,6 +491,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
Expand Down Expand Up @@ -989,6 +1041,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {

FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;

bool addRegAssignAndRewriteFast() override;
Expand Down Expand Up @@ -1382,7 +1435,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}

bool GCNPassConfig::addPreRewrite() {
addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
Expand Down Expand Up @@ -1418,12 +1470,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
return createFastVGPRRegisterAllocator();
}

/// Builds the register allocator pass for the WWM stage.
///
/// If the WWMRegisterRegAlloc default is anything other than the
/// useDefaultRegisterAllocator sentinel, that constructor is invoked.
/// Otherwise the greedy allocator is used when \p Optimized is true and the
/// fast allocator when it is false.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Make sure the global default has been seeded exactly once.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  const RegisterRegAlloc::FunctionPassCtor Selected =
      WWMRegisterRegAlloc::getDefault();

  // Sentinel means nothing was picked: choose by optimization level.
  if (Selected == useDefaultRegisterAllocator)
    return Optimized ? createGreedyWWMRegisterAllocator()
                     : createFastWWMRegisterAllocator();

  return Selected();
}

/// Override that must never be reached: it unconditionally hits
/// llvm_unreachable. \p Optimized is ignored.
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
"and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
Expand All @@ -1435,11 +1503,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);

// To allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

addPass(createVGPRAllocPass(false));
// For allocating other wwm register operands.
addPass(createWWMRegAllocPass(false));

addPass(&SILowerWWMCopiesID);
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(false));

return true;
}

Expand All @@ -1459,8 +1535,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);

// To allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

// For allocating other whole wave mode registers.
addPass(createWWMRegAllocPass(true));
addPass(&SILowerWWMCopiesID);
addPass(createVirtRegRewriter(false));
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));

addPreRewrite();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPURemoveIncompatibleFunctions.cpp
AMDGPUReserveWWMRegs.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
Expand Down
Loading
Loading