@@ -1663,17 +1663,17 @@ class LoopVectorizationCostModel {
1663
1663
/// disabled or unsupported, then the scalable part will be equal to
1664
1664
/// ElementCount::getScalable(0).
1665
1665
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1666
+ unsigned MaxTripCount,
1666
1667
ElementCount UserVF,
1667
1668
bool FoldTailByMasking);
1668
1669
1669
1670
/// \return the maximized element count based on the target's vector
1670
1671
/// registers and the loop trip-count, but limited to a maximum safe VF.
1671
1672
/// This is a helper function of computeFeasibleMaxVF.
1672
- ElementCount getMaximizedVFForTarget (unsigned ConstTripCount,
1673
- unsigned SmallestType,
1674
- unsigned WidestType,
1675
- ElementCount MaxSafeVF,
1676
- bool FoldTailByMasking);
1673
+ ElementCount
1674
+ getMaximizedVFForTarget(unsigned ConstTripCount, unsigned MaxTripCount,
1675
+ unsigned SmallestType, unsigned WidestType,
1676
+ ElementCount MaxSafeVF, bool FoldTailByMasking);
1677
1677
1678
1678
/// \return the maximum legal scalable VF, based on the safe max number
1679
1679
/// of elements.
@@ -4811,7 +4811,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4811
4811
}
4812
4812
4813
4813
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4814
- unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4814
+ unsigned ConstTripCount, unsigned MaxTripCount, ElementCount UserVF,
4815
+ bool FoldTailByMasking) {
4815
4816
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4816
4817
unsigned SmallestType, WidestType;
4817
4818
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -4898,14 +4899,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4898
4899
4899
4900
FixedScalableVFPair Result(ElementCount::getFixed(1),
4900
4901
ElementCount::getScalable(0));
4901
- if (auto MaxVF =
4902
- getMaximizedVFForTarget (ConstTripCount, SmallestType, WidestType,
4903
- MaxSafeFixedVF, FoldTailByMasking))
4902
+ if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, MaxTripCount,
4903
+ SmallestType, WidestType,
4904
+ MaxSafeFixedVF, FoldTailByMasking))
4904
4905
Result.FixedVF = MaxVF;
4905
4906
4906
- if (auto MaxVF =
4907
- getMaximizedVFForTarget ( ConstTripCount, SmallestType, WidestType,
4908
- MaxSafeScalableVF, FoldTailByMasking))
4907
+ if (auto MaxVF = getMaximizedVFForTarget(
4908
+ ConstTripCount, MaxTripCount , SmallestType, WidestType,
4909
+ MaxSafeScalableVF, FoldTailByMasking))
4909
4910
if (MaxVF.isScalable()) {
4910
4911
Result.ScalableVF = MaxVF;
4911
4912
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
@@ -4928,6 +4929,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4928
4929
}
4929
4930
4930
4931
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4932
+ unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4931
4933
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4932
4934
if (TC == 1) {
4933
4935
reportVectorizationFailure("Single iteration (non) loop",
@@ -4938,7 +4940,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4938
4940
4939
4941
switch (ScalarEpilogueStatus) {
4940
4942
case CM_ScalarEpilogueAllowed:
4941
- return computeFeasibleMaxVF (TC, UserVF, false );
4943
+ return computeFeasibleMaxVF(TC, MaxTC, UserVF, false);
4942
4944
case CM_ScalarEpilogueNotAllowedUsePredicate:
4943
4945
[[fallthrough]];
4944
4946
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -4976,7 +4978,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4976
4978
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4977
4979
"scalar epilogue instead.\n");
4978
4980
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4979
- return computeFeasibleMaxVF (TC, UserVF, false );
4981
+ return computeFeasibleMaxVF(TC, MaxTC, UserVF, false);
4980
4982
}
4981
4983
return FixedScalableVFPair::getNone();
4982
4984
}
@@ -4993,7 +4995,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4993
4995
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4994
4996
}
4995
4997
4996
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF (TC, UserVF, true );
4998
+ FixedScalableVFPair MaxFactors =
4999
+ computeFeasibleMaxVF(TC, MaxTC, UserVF, true);
4997
5000
4998
5001
// Avoid tail folding if the trip count is known to be a multiple of any VF
4999
5002
// we choose.
@@ -5069,8 +5072,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5069
5072
}
5070
5073
5071
5074
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5072
- unsigned ConstTripCount, unsigned SmallestType , unsigned WidestType ,
5073
- ElementCount MaxSafeVF, bool FoldTailByMasking) {
5075
+ unsigned ConstTripCount, unsigned MaxTripCount , unsigned SmallestType ,
5076
+ unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) {
5074
5077
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5075
5078
const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5076
5079
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5108,24 +5111,24 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5108
5111
}
5109
5112
5110
5113
// When a scalar epilogue is required, at least one iteration of the scalar
5111
- // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
5114
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
5112
5115
// max VF that results in a dead vector loop.
5113
- if (ConstTripCount > 0 && requiresScalarEpilogue (true ))
5114
- ConstTripCount -= 1 ;
5115
-
5116
- if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5117
- (!FoldTailByMasking || isPowerOf2_32 (ConstTripCount ))) {
5118
- // If loop trip count (TC) is known at compile time there is no point in
5119
- // choosing VF greater than TC (as done in the loop below). Select maximum
5120
- // power of two which doesn't exceed TC.
5121
- // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5122
- // when the TC is less than or equal to the known number of lanes.
5123
- auto ClampedConstTripCount = llvm::bit_floor (ConstTripCount );
5116
+ if (MaxTripCount > 0 && requiresScalarEpilogue(true))
5117
+ MaxTripCount -= 1;
5118
+
5119
+ if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
5120
+ (!FoldTailByMasking || isPowerOf2_32(MaxTripCount ))) {
5121
+ // If the trip count upper bound (TC) is known at compile time there is no
5122
+ // point in choosing VF greater than TC (as done in the loop below). Select
5123
+ // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
5124
+ // scalable, we only fall back on a fixed VF when the TC is less than or
5125
+ // equal to the known number of lanes.
5126
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount );
5124
5127
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5125
5128
"exceeding the constant trip count: "
5126
- << ClampedConstTripCount << " \n " );
5129
+ << ClampedUpperTripCount << "\n");
5127
5130
return ElementCount::get(
5128
- ClampedConstTripCount ,
5131
+ ClampedUpperTripCount ,
5129
5132
FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
5130
5133
}
5131
5134
0 commit comments