Skip to content

pr/amdgpu closed world #66488

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions llvm/include/llvm/Transforms/IPO/Attributor.h
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,7 @@ struct AttributorConfig {
/// Callback function to determine if indirect call targets should be made
/// direct call targets (with an if-cascade).
std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB,
Function &AssummedCallee)>
Function &AssumedCallee, unsigned NumCallees)>
IndirectCalleeSpecializationCallback = nullptr;

/// Helper to update an underlying call graph and to delete functions.
Expand Down Expand Up @@ -1718,10 +1718,11 @@ struct Attributor {
/// Return true if we should specialize the call site \p CB for the potential
/// callee \p Callee.
bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &AA,
CallBase &CB, Function &Callee) {
CallBase &CB, Function &Callee,
unsigned NumCallees) {
return Configuration.IndirectCalleeSpecializationCallback
? Configuration.IndirectCalleeSpecializationCallback(*this, AA,
CB, Callee)
? Configuration.IndirectCalleeSpecializationCallback(
*this, AA, CB, Callee, NumCallees)
: true;
}

Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ extern char &AMDGPUMachineCFGStructurizerID;
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);

Pass *createAMDGPUAnnotateKernelFeaturesPass();
Pass *createAMDGPUAttributorLegacyPass();
Pass *createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility = false);
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
Expand Down Expand Up @@ -287,8 +287,13 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
private:
TargetMachine &TM;

/// Whether we can assume whole program visibility during codegen.
bool HasWholeProgramVisibility = false;

public:
AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){};
AMDGPUAttributorPass(TargetMachine &TM,
bool HasWholeProgramVisibility = false)
: TM(TM), HasWholeProgramVisibility(HasWholeProgramVisibility){};
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

Expand Down
48 changes: 38 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/Casting.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-attributor"

Expand Down Expand Up @@ -1023,7 +1027,8 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
}
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
bool HasWholeProgramVisibility) {
SetVector<Function *> Functions;
for (Function &F : M) {
if (!F.isIntrinsic())
Expand All @@ -1036,14 +1041,32 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID});
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID,
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
&AAIndirectCallInfo::ID});

/// Helper to decide if we should specialize the indirect \p CB for \p Callee,
/// which is one of the \p NumCallees potential callees.
auto IndirectCalleeSpecializationCallback =
[&](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
Function &Callee, unsigned NumCallees) {
if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
return false;
// A call site with a single potential callee should always be specialized.
if (NumCallees == 1)
return true;
// Otherwise only specialize if the called operand is always uniform.
const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's unfortunate that this non-subtarget-dependent property requires you to query the function's subtarget.

return TTI.isAlwaysUniform(CB.getCalledOperand());
};

AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
AC.IsModulePass = true;
AC.DefaultInitializeLiveInternals = false;
AC.IsClosedWorldModule = HasWholeProgramVisibility;
AC.IndirectCalleeSpecializationCallback =
IndirectCalleeSpecializationCallback;
AC.IPOAmendableCB = [](const Function &F) {
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
};
Expand All @@ -1070,8 +1093,12 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
}

class AMDGPUAttributorLegacy : public ModulePass {
/// Whether we can assume whole program visibility during codegen.
bool HasWholeProgramVisibility = false;

public:
AMDGPUAttributorLegacy() : ModulePass(ID) {}
AMDGPUAttributorLegacy(bool HasWholeProgramVisibility = false)
: ModulePass(ID), HasWholeProgramVisibility(HasWholeProgramVisibility) {}

/// doInitialization - Virtual method overridden by subclasses to do
/// any necessary initialization before any pass is run.
Expand All @@ -1086,7 +1113,7 @@ class AMDGPUAttributorLegacy : public ModulePass {

bool runOnModule(Module &M) override {
AnalysisGetter AG(this);
return runImpl(M, AG, *TM);
return runImpl(M, AG, *TM, HasWholeProgramVisibility);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
Expand All @@ -1107,14 +1134,15 @@ PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
AnalysisGetter AG(FAM);

// TODO: Probably preserves CFG
return runImpl(M, AG, TM) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
return runImpl(M, AG, TM, HasWholeProgramVisibility)
? PreservedAnalyses::none()
: PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
return new AMDGPUAttributorLegacy();
Pass *llvm::createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility) {
return new AMDGPUAttributorLegacy(HasWholeProgramVisibility);
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
false, false)
Expand Down
11 changes: 3 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,16 +734,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});

// FIXME: Why is AMDGPUAttributor not in CGSCC?
PB.registerOptimizerLastEPCallback(
[this](ModulePassManager &MPM, OptimizationLevel Level) {
if (Level != OptimizationLevel::O0) {
MPM.addPass(AMDGPUAttributorPass(*this));
}
});

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
if (Level != OptimizationLevel::O0)
PM.addPass(
AMDGPUAttributorPass(*this, /*HasWholeProgramVisibility=*/true));
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/IPO/Attributor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3836,7 +3836,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
if (MaxSpecializationPerCB.getNumOccurrences()) {
AC.IndirectCalleeSpecializationCallback =
[&](Attributor &, const AbstractAttribute &AA, CallBase &CB,
Function &Callee) {
Function &Callee, unsigned NumCallees) {
if (MaxSpecializationPerCB == 0)
return false;
auto &Set = IndirectCalleeTrackingMap[&CB];
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/IPO/AttributorAttributes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12347,7 +12347,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
SmallVector<Function *, 8> SkippedAssumedCallees;
SmallVector<std::pair<CallInst *, Instruction *>> NewCalls;
for (Function *NewCallee : AssumedCallees) {
if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) {
if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee,
AssumedCallees.size())) {
SkippedAssumedCallees.push_back(NewCallee);
SpecializedForAllCallees = false;
continue;
Expand Down
Loading
Loading