[AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA #67759

Merged
merged 2 commits on Oct 7, 2023
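In outline, the patch hoists the DSW counters (DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount) out of applyIGLPStrategy into static storage, computes them only when IsPostRA is false, and reuses the cached values when the mutation runs again during post-RA scheduling. A minimal sketch of that caching pattern follows; countDSWrites, Opcodes, and the opcode check are invented stand-ins for illustration, not the real analysis:

```cpp
#include <vector>

// Sketch only (simplified names, not the actual LLVM code): the counter
// lives at file scope, so a value computed during the pre-RA scheduler run
// is still available when the post-RA run fires.
static unsigned DSWCount = 0;

// Stand-in for the expensive DAG walk that counts DS_WRITE instructions.
static unsigned countDSWrites(const std::vector<int> &Opcodes) {
  unsigned N = 0;
  for (int Op : Opcodes)
    if (Op == 42) // pretend opcode 42 is a DS_WRITE
      ++N;
  return N;
}

void applyStrategy(const std::vector<int> &Opcodes, bool IsPostRA) {
  if (!IsPostRA)
    DSWCount = countDSWrites(Opcodes); // pre-RA: compute once and cache
  // Post-RA: DSWCount is reused here without repeating the analysis.
}

int main() {
  applyStrategy({42, 7, 42}, /*IsPostRA=*/false);
  applyStrategy({42, 7, 42}, /*IsPostRA=*/true);
}
```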
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp: 132 changes (76 additions, 56 deletions)
@@ -850,7 +850,8 @@ class IGLPStrategy {
// Add SchedGroups to \p Pipeline to implement this Strategy.
virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) = 0;

// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -868,7 +869,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

@@ -880,7 +882,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {

void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -1076,9 +1079,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
Cache->push_back(Pred.getSUnit());
}
}

// If the other group has no PERM preds, then this group won't share any
if (!Cache->size())
return false;
}

assert(Cache->size());
auto DAG = SyncPipe[0].DAG;
// Does the previous DS_WRITE share a V_PERM predecessor with this
// VMEM_READ
@@ -1095,7 +1101,8 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

@@ -1105,14 +1112,20 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
};

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) {
unsigned MFMACount = 0;
unsigned DSWCount = 0;
unsigned DSWWithPermCount = 0;
unsigned DSWWithSharedVMEMCount = 0;
unsigned DSRCount = 0;

assert((IsPostRA ||
DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) &&

Collaborator:
Hi @jrbyrnes
I don't know this code at all, but gcc warned on this assert and suggested some parentheses, so it caught my eye.
Does it really check what the assert string says?
I think e.g. the assert passes if the counts are 1, 1, 0, but the string says all should be 0?

Contributor Author:
Yes, you are right. Thanks for pointing that out.
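
To make the pitfall concrete (a standalone sketch, not part of the patch): `==` associates left-to-right in C++, so the chained comparison checks a bool result against the next counter rather than checking each counter against zero.

```cpp
#include <cassert>

int main() {
  unsigned DSWCount = 1, DSWWithPermCount = 1, DSWWithSharedVMEMCount = 0;

  // Parses as ((DSWCount == DSWWithPermCount) == DSWWithSharedVMEMCount) == 0:
  //   DSWCount == DSWWithPermCount         -> true
  //   true == DSWWithSharedVMEMCount (0)   -> false
  //   false == 0                           -> true
  // The assert therefore passes even though two counters are nonzero.
  assert(DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0);

  // The check the assert string describes would instead be:
  //   assert(DSWCount == 0 && DSWWithPermCount == 0 &&
  //          DSWWithSharedVMEMCount == 0);
  return 0;
}
```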

"DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto I = SU.getInstr();
@@ -1121,7 +1134,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
else if (I->mayStore()) {
else if (I->mayStore() && !IsPostRA) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1133,56 +1146,59 @@
}
}
}
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (; I != E; I++) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : (*I)->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;

if (Cand && llvm::is_contained(Counted, Cand))
break;

for (auto &Succ : Pred.getSUnit()->Succs) {
auto MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
if (!IsPostRA) {
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (; I != E; I++) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : (*I)->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;

if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}
if (Cand && llvm::is_contained(Counted, Cand))
break;

if (!VMEMLookup.contains(MI)) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}
for (auto &Succ : Pred.getSUnit()->Succs) {
auto MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
continue;

Cand = VMEMLookup[MI];
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}

if (!VMEMLookup.contains(MI)) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}

Cand = VMEMLookup[MI];
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
}
}
}
}
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(*I);
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(*I);
}
}
}

@@ -1398,7 +1414,11 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
// first created SchedGroup first.
bool IsBottomUp = 1;

// Whether the mutation is being applied to post RA scheduling
bool IsPostRA = false;

IGroupLPDAGMutation() = default;
IGroupLPDAGMutation(bool IsPostRA) : IsPostRA(IsPostRA) {}
};

unsigned SchedGroup::NumSchedGroups = 0;
@@ -1686,16 +1706,16 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
auto S = createIGLPStrategy(StrategyID, DAG, TII);
if (S->shouldApplyStrategy(DAG)) {
IsBottomUp = S->IsBottomUp;
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsPostRA);
}
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
return std::make_unique<IGroupLPDAGMutation>();
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA) {
return std::make_unique<IGroupLPDAGMutation>(IsPostRA);
}

} // end namespace llvm
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h: 2 changes (1 addition, 1 deletion)
@@ -14,7 +14,7 @@

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA);

} // namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp: 6 changes (3 additions, 3 deletions)
@@ -440,7 +440,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -450,7 +450,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
return DAG;
}

@@ -905,7 +905,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp: 6 changes (3 additions, 3 deletions)
@@ -706,7 +706,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;

SavedMutations.swap(DAG.Mutations);
DAG.addMutation(createIGroupLPDAGMutation());
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));

InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -843,7 +843,7 @@ bool GCNSchedStage::initGCNRegion() {
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
SavedMutations.clear();
SavedMutations.swap(DAG.Mutations);
DAG.addMutation(createIGroupLPDAGMutation());
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
}

return true;
@@ -1557,7 +1557,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
addMutation(createIGroupLPDAGMutation());
addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
}

ScheduleDAGMI::schedule();