[AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA #67759

Merged
merged 2 commits on Oct 7, 2023
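In outline, the patch hoists the DSW counters (DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount) out of applyIGLPStrategy into static storage, computes them only when IsPostRA is false, and reuses the cached values when the mutation runs again during post-RA scheduling. A minimal sketch of that caching pattern follows; countDSWrites, Opcodes, and the opcode check are invented stand-ins for illustration, not the real analysis:

```cpp
#include <vector>

// Sketch only (simplified names, not the actual LLVM code): the counter
// lives at file scope, so a value computed during the pre-RA scheduler run
// is still available when the post-RA run fires.
static unsigned DSWCount = 0;

// Stand-in for the expensive DAG walk that counts DS_WRITE instructions.
static unsigned countDSWrites(const std::vector<int> &Opcodes) {
  unsigned N = 0;
  for (int Op : Opcodes)
    if (Op == 42) // pretend opcode 42 is a DS_WRITE
      ++N;
  return N;
}

void applyStrategy(const std::vector<int> &Opcodes, bool IsPostRA) {
  if (!IsPostRA)
    DSWCount = countDSWrites(Opcodes); // pre-RA: compute once and cache
  // Post-RA: DSWCount is reused here without repeating the analysis.
}

int main() {
  applyStrategy({42, 7, 42}, /*IsPostRA=*/false);
  applyStrategy({42, 7, 42}, /*IsPostRA=*/true);
}
```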
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp: 132 changes (76 additions, 56 deletions)
@@ -850,7 +850,8 @@ class IGLPStrategy {
// Add SchedGroups to \p Pipeline to implement this Strategy.
virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) = 0;

// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -868,7 +869,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

@@ -880,7 +882,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {

void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -1076,9 +1079,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
Cache->push_back(Pred.getSUnit());
}
}

// If the other group has no PERM preds, then this group won't share any
if (!Cache->size())
return false;
}

assert(Cache->size());
auto DAG = SyncPipe[0].DAG;
// Does the previous DS_WRITE share a V_PERM predecessor with this
// VMEM_READ
@@ -1095,7 +1101,8 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

@@ -1105,14 +1112,20 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
};

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
bool IsPostRA) {
unsigned MFMACount = 0;
unsigned DSWCount = 0;
unsigned DSWWithPermCount = 0;
unsigned DSWWithSharedVMEMCount = 0;
unsigned DSRCount = 0;

assert((IsPostRA ||
DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) &&

Collaborator:
Hi @jrbyrnes
I don't know this code at all, but gcc warned on this assert and suggested some parentheses, so it caught my eye.
Does it really check what the assert string says?
I think e.g. the assert passes if the counts are 1, 1, 0, but the string says all should be 0?

Contributor Author:
Yes, you are right. Thanks for pointing that out.
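
To make the pitfall concrete (a standalone sketch, not part of the patch): `==` associates left-to-right in C++, so the chained comparison checks a bool result against the next counter rather than checking each counter against zero.

```cpp
#include <cassert>

int main() {
  unsigned DSWCount = 1, DSWWithPermCount = 1, DSWWithSharedVMEMCount = 0;

  // Parses as ((DSWCount == DSWWithPermCount) == DSWWithSharedVMEMCount) == 0:
  //   DSWCount == DSWWithPermCount         -> true
  //   true == DSWWithSharedVMEMCount (0)   -> false
  //   false == 0                           -> true
  // The assert therefore passes even though two counters are nonzero.
  assert(DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0);

  // The check the assert string describes would instead be:
  //   assert(DSWCount == 0 && DSWWithPermCount == 0 &&
  //          DSWWithSharedVMEMCount == 0);
  return 0;
}
```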

"DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto I = SU.getInstr();
@@ -1121,7 +1134,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
else if (I->mayStore()) {
else if (I->mayStore() && !IsPostRA) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1133,56 +1146,59 @@
}
}
}
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (; I != E; I++) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : (*I)->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;

if (Cand && llvm::is_contained(Counted, Cand))
break;

for (auto &Succ : Pred.getSUnit()->Succs) {
auto MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
if (!IsPostRA) {
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (; I != E; I++) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : (*I)->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;

if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}
if (Cand && llvm::is_contained(Counted, Cand))
break;

if (!VMEMLookup.contains(MI)) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}
for (auto &Succ : Pred.getSUnit()->Succs) {
auto MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
continue;

Cand = VMEMLookup[MI];
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}

if (!VMEMLookup.contains(MI)) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}

Cand = VMEMLookup[MI];
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
}
}
}
}
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(*I);
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(*I);
}
}
}

@@ -1398,7 +1414,11 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
// first created SchedGroup first.
bool IsBottomUp = 1;

// Whether the mutation is being applied to post RA scheduling
bool IsPostRA = false;

IGroupLPDAGMutation() = default;
IGroupLPDAGMutation(bool IsPostRA) : IsPostRA(IsPostRA) {}
};

unsigned SchedGroup::NumSchedGroups = 0;
@@ -1686,16 +1706,16 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
auto S = createIGLPStrategy(StrategyID, DAG, TII);
if (S->shouldApplyStrategy(DAG)) {
IsBottomUp = S->IsBottomUp;
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsPostRA);
}
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
return std::make_unique<IGroupLPDAGMutation>();
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA) {
return std::make_unique<IGroupLPDAGMutation>(IsPostRA);
}

} // end namespace llvm
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h: 2 changes (1 addition, 1 deletion)
@@ -14,7 +14,7 @@

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA);

} // namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp: 6 changes (3 additions, 3 deletions)
@@ -440,7 +440,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -450,7 +450,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
return DAG;
}

@@ -905,7 +905,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp: 6 changes (3 additions, 3 deletions)
@@ -706,7 +706,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;

SavedMutations.swap(DAG.Mutations);
DAG.addMutation(createIGroupLPDAGMutation());
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));

InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -843,7 +843,7 @@ bool GCNSchedStage::initGCNRegion() {
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
SavedMutations.clear();
SavedMutations.swap(DAG.Mutations);
DAG.addMutation(createIGroupLPDAGMutation());
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
}

return true;
@@ -1557,7 +1557,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
addMutation(createIGroupLPDAGMutation());
addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
}

ScheduleDAGMI::schedule();