Skip to content

[LV] Cherry-picked changes for loop interleaving algorithm #8320

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

89 changes: 53 additions & 36 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Trip-count cutoff for interleaving (hidden option, initial value 128).
/// selectInterleaveCount compares the best known trip count of the loop
/// against this value and, when the trip count is smaller, returns an
/// interleave count of 1 unless the small-loop scalar-reduction exception
/// applies.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
"tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
cl::desc("We don't interleave loops with a estimated constant trip count "
"below this number"));

/// Hidden override for the target's number of scalar registers. The initial
/// value is 0.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
Expand Down Expand Up @@ -316,12 +311,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
/// Hidden option, off by default. When enabled, selectInterleaveCount does not
/// bail out on a small trip count for loops that have reductions and a scalar
/// VF, so that interleaving can expose ILP.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
"interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
cl::desc("Enable interleaving for loops with small iteration counts that "
"contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
Expand Down Expand Up @@ -5823,14 +5812,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,

auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
// Do not interleave loops with a relatively small known or estimated trip
// count. However, we still interleave when InterleaveSmallLoopScalarReduction
// is enabled and the code has scalar reductions (HasReductions && VF == 1),
// because under those conditions interleaving can expose ILP and break
// cross-iteration dependences for reductions.
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
return 1;

// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
Expand Down Expand Up @@ -5903,21 +5884,58 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

// If trip count is known or estimated compile time constant, limit the
// interleave count to be less than the trip count divided by VF, provided it
// is at least 1.
//
// For scalable vectors we can't know if interleaving is beneficial. It may
// not be beneficial for small loops if none of the lanes in the second vector
// iterations is enabled. However, for larger loops, there is likely to be a
// similar benefit as for fixed-width vectors. For now, we choose to leave
// the InterleaveCount as if vscale is '1', although if some information about
// the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {
MaxInterleaveCount =
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
// Make sure MaxInterleaveCount is greater than 0.
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable()) {
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
EstimatedVF *= *VScale;
}
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");

unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (KnownTC > 0) {
// When a scalar epilogue is required, at least one iteration must be scalar,
// so the maximum number of iterations available for interleaving is one less.
unsigned AvailableTC =
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;

// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
// The final IC is selected in a way that the epilogue loop trip count is
// minimized while maximizing the IC itself, so that we either run the
// vector loop at least once if it generates a small epilogue loop, or else
// we run the vector loop at least twice.

unsigned InterleaveCountUB = bit_floor(
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
MaxInterleaveCount = InterleaveCountLB;

if (InterleaveCountUB != InterleaveCountLB) {
unsigned TailTripCountUB =
(AvailableTC % (EstimatedVF * InterleaveCountUB));
unsigned TailTripCountLB =
(AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce the same scalar tail, maximize the IC to do the same work
// in fewer vector loop iterations.
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
} else if (BestKnownTC && *BestKnownTC > 0) {
// When a scalar epilogue is required, at least one iteration must be scalar,
// so the maximum number of iterations available for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
? (*BestKnownTC) - 1
: *BestKnownTC;

// If the trip count is an estimated compile-time constant, cap the IC at
// the trip count divided by (VF * 2), so that the vector loop runs at least
// twice and interleaving still looks profitable when an epilogue loop is
// present. Since the exact trip count is not known, we choose to be
// conservative in our IC estimate.
MaxInterleaveCount = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
}

assert(MaxInterleaveCount > 0 &&
Expand Down Expand Up @@ -6021,8 +6039,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,

// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
AggressivelyInterleaveReductions) {
if (VF.isScalar() && AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC but not as aggressive as the normal IC
// to satisfy the rare situation when resources are too limited.
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; REQUIRES: x86-registered-target
; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN: < %s | FileCheck %s

Expand Down Expand Up @@ -79,6 +80,8 @@ entry:


; VECTORIZE: mul <4 x i32>
; VECTORIZE: mul <4 x i32>
; VECTORIZE-NOT: mul <4 x i32>

for.body: ; preds = %for.body, %entry
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
Expand Down
Loading