@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5436
5436
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5437
5437
5438
5438
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5439
- if (KnownTC) {
5439
+ if (KnownTC > 0) {
5440
+ // When a scalar epilogue is required, at least one iteration must remain
5441
+ // scalar, so one fewer iteration is available for interleaving.
5442
+ unsigned AvailableTC =
5443
+ requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5444
+
5440
5445
// If trip count is known we select between two prospective ICs, where
5441
5446
// 1) the aggressive IC is capped by the trip count divided by VF
5442
5447
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5446
5451
// we run the vector loop at least twice.
5447
5452
5448
5453
unsigned InterleaveCountUB = bit_floor(
5449
- std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5454
+ std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5450
5455
unsigned InterleaveCountLB = bit_floor(std::max(
5451
- 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5456
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5452
5457
MaxInterleaveCount = InterleaveCountLB;
5453
5458
5454
5459
if (InterleaveCountUB != InterleaveCountLB) {
5455
- unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5456
- unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5460
+ unsigned TailTripCountUB =
5461
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5462
+ unsigned TailTripCountLB =
5463
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5457
5464
// If both produce same scalar tail, maximize the IC to do the same work
5458
5465
// in fewer vector loop iterations
5459
5466
if (TailTripCountUB == TailTripCountLB)
5460
5467
MaxInterleaveCount = InterleaveCountUB;
5461
5468
}
5462
- } else if (BestKnownTC) {
5469
+ } else if (BestKnownTC && *BestKnownTC > 0) {
5470
+ // When a scalar epilogue is required, at least one iteration must remain
5471
+ // scalar, so one fewer iteration is available for interleaving.
5472
+ unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5473
+ ? (*BestKnownTC) - 1
5474
+ : *BestKnownTC;
5475
+
5463
5476
// If trip count is an estimated compile time constant, limit the
5464
5477
// IC to be capped by the trip count divided by VF * 2, such that the vector
5465
5478
// loop runs at least twice to make interleaving seem profitable when there
5466
5479
// is an epilogue loop present. Since the exact trip count is not known, we
5467
5480
// choose to be conservative in our IC estimate.
5468
5481
MaxInterleaveCount = bit_floor(std::max(
5469
- 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5482
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5470
5483
}
5471
5484
5472
5485
assert(MaxInterleaveCount > 0 &&
0 commit comments