Skip to content

Commit c04687f

Browse files
[LoopVectorize][AArch64][SVE] Generate wide active lane masks
This patch makes the LoopVectorize pass generate lane masks longer than the VF to allow the target to better utilise the instruction set. The vectorizer emits one or more wide `llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*` calls to yield the required number of VF-wide masks. The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with a predicate-pair result, or an SVE `whilelo` instruction with a smaller element size plus `punpklo`/`punpkhi`. How wide a lane mask the vectoriser emits is controlled by the TargetTransformInfo hook `getMaxPredicateLength`. The default implementation (return the same length as the VF) keeps the change non-functional for targets that can't handle, or are not prepared to handle, wider lane masks.
1 parent c4dbea5 commit c04687f

23 files changed

+3045
-1146
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,8 @@ class TargetTransformInfo {
12411241
/// and the number of execution units in the CPU.
12421242
unsigned getMaxInterleaveFactor(ElementCount VF) const;
12431243

1244+
/// \return The maximum predicate (active lane mask) length the loop
/// vectorizer may generate when vectorizing with \p VF. A result equal to
/// \p VF means that no lane masks wider than the VF are requested.
ElementCount getMaxPredicateLength(ElementCount VF) const;
1245+
12441246
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
12451247
static OperandValueInfo getOperandInfo(const Value *V);
12461248

@@ -1999,6 +2001,9 @@ class TargetTransformInfo::Concept {
19992001
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
20002002

20012003
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
2004+
2005+
/// Type-erased interface for the maximum predicate length query for \p VF;
/// the Model subclass implements it by forwarding to its wrapped
/// implementation.
virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
2006+
20022007
virtual InstructionCost getArithmeticInstrCost(
20032008
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
20042009
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2622,6 +2627,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26222627
unsigned getMaxInterleaveFactor(ElementCount VF) override {
26232628
return Impl.getMaxInterleaveFactor(VF);
26242629
}
2630+
2631+
/// Forwards the maximum predicate length query for \p VF to the wrapped
/// implementation object.
ElementCount getMaxPredicateLength(ElementCount VF) const override {
2632+
return Impl.getMaxPredicateLength(VF);
2633+
}
2634+
26252635
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
26262636
unsigned &JTSize,
26272637
ProfileSummaryInfo *PSI,

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,8 @@ class TargetTransformInfoImplBase {
531531

532532
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
533533

534+
/// Default implementation: the maximum predicate length is \p VF itself, i.e.
/// no lane masks wider than the vectorization factor are requested.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
535+
534536
InstructionCost getArithmeticInstrCost(
535537
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
536538
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
890890

891891
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
892892

893+
/// Default for targets built on BasicTTIImplBase: the maximum predicate
/// length is \p VF itself, so no wider-than-VF lane masks are requested.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
894+
893895
InstructionCost getArithmeticInstrCost(
894896
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
895897
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
816816
return TTIImpl->getMaxInterleaveFactor(VF);
817817
}
818818

819+
/// Delegates the maximum predicate length query for \p VF to the underlying
/// TTI implementation.
ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
820+
return TTIImpl->getMaxPredicateLength(VF);
821+
}
822+
819823
TargetTransformInfo::OperandValueInfo
820824
TargetTransformInfo::getOperandInfo(const Value *V) {
821825
OperandValueKind OpInfo = OK_AnyValue;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3362,6 +3362,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
33623362
return ST->getMaxInterleaveFactor();
33633363
}
33643364

3365+
/// Returns the widest predicate (active lane mask) AArch64 wants the loop
/// vectorizer to generate for a loop vectorized with \p VF.
///
/// With SVE the result has the same scalability as \p VF and a known-minimum
/// length of min(16, 2 * VF). Without SVE the result is \p VF itself, i.e. no
/// widening is requested.
ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
  // Wider masks are only requested for SVE. Report the "no widening" answer
  // used by the default hook (VF) instead of a zero-length ElementCount,
  // which is not a meaningful predicate length.
  if (!ST->hasSVE())
    return VF;

  // Do not create masks bigger than `<vscale x 16 x i1>`.
  unsigned N = 16;
  // Do not create masks that are more than twice the VF.
  N = std::min(N, 2 * VF.getKnownMinValue());
  return VF.isScalable() ? ElementCount::getScalable(N)
                         : ElementCount::getFixed(N);
}
3373+
33653374
// For Falkor, we want to avoid having too many strided loads in a loop since
33663375
// that can exhaust the HW prefetcher resources. We adjust the unroller
33673376
// MaxCount preference below to attempt to ensure unrolling doesn't create too

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
157157

158158
unsigned getMaxInterleaveFactor(ElementCount VF);
159159

160+
/// AArch64 override of the TTI hook reporting the maximum predicate (active
/// lane mask) length to generate for a loop vectorized with \p VF.
ElementCount getMaxPredicateLength(ElementCount VF) const;
161+
160162
bool prefersVectorizedAddressing() const;
161163

162164
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ class VPBuilder {
196196
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
197197
DebugLoc DL = {}, const Twine &Name = "");
198198

199+
/// Create a VPActiveLaneMaskRecipe with operands \p IV and \p TC, debug
/// location \p DL and name \p Name. If BB is non-null the new recipe is
/// inserted into BB at InsertPt; otherwise it is returned without being
/// inserted anywhere.
VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
200+
const Twine &Name = "") {
201+
auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
202+
if (BB)
203+
BB->insert(ALM, InsertPt);
204+
return ALM;
205+
}
206+
199207
//===--------------------------------------------------------------------===//
200208
// RAII helpers.
201209
//===--------------------------------------------------------------------===//

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,10 @@ class InnerLoopVectorizer {
593593
/// count of the original loop for both main loop and epilogue vectorization.
594594
void setTripCount(Value *TC) { TripCount = TC; }
595595

596+
/// Returns the maximum predicate length for \p VF as reported by the
/// TargetTransformInfo this vectorizer was given.
ElementCount getMaxPredicateLength(ElementCount VF) const {
597+
return TTI->getMaxPredicateLength(VF);
598+
}
599+
596600
protected:
597601
friend class LoopVectorizationPlanner;
598602

@@ -7516,7 +7520,8 @@ LoopVectorizationPlanner::executePlan(
75167520
LLVM_DEBUG(BestVPlan.dump());
75177521

75187522
// Perform the actual loop transformation.
7519-
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7523+
VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
7524+
DT, ILV.Builder, &ILV, &BestVPlan,
75207525
OrigLoop->getHeader()->getContext());
75217526

75227527
// 0. Generate SCEV-dependent code into the preheader, including TripCount,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,12 +214,13 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
214214
return It;
215215
}
216216

217-
VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
217+
VPTransformState::VPTransformState(ElementCount VF, unsigned UF,
218+
ElementCount MaxPred, LoopInfo *LI,
218219
DominatorTree *DT, IRBuilderBase &Builder,
219220
InnerLoopVectorizer *ILV, VPlan *Plan,
220221
LLVMContext &Ctx)
221-
: VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
222-
LVer(nullptr),
222+
: VF(VF), UF(UF), MaxPred(MaxPred), LI(LI), DT(DT), Builder(Builder),
223+
ILV(ILV), Plan(Plan), LVer(nullptr),
223224
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
224225

225226
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,13 +234,14 @@ struct VPIteration {
234234
/// VPTransformState holds information passed down when "executing" a VPlan,
235235
/// needed for generating the output IR.
236236
struct VPTransformState {
237-
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
238-
DominatorTree *DT, IRBuilderBase &Builder,
237+
VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
238+
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
239239
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);
240240

241241
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
242242
ElementCount VF;
243243
unsigned UF;
244+
ElementCount MaxPred;
244245

245246
/// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
246247
/// value set during plan transformation, possibly a default value = whole
@@ -1167,7 +1168,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
11671168
Not,
11681169
SLPLoad,
11691170
SLPStore,
1170-
ActiveLaneMask,
11711171
ExplicitVectorLength,
11721172
CalculateTripCountMinusVF,
11731173
// Increment the canonical IV separately for each unrolled part.
@@ -1321,6 +1321,50 @@ class VPInstruction : public VPRecipeWithIRFlags {
13211321
}
13221322
};
13231323

1324+
/// A recipe that produces the active lane mask. It has exactly two operands,
/// passed to the constructor as \p IV and \p TC, and carries an optional name
/// that is reused when the recipe is cloned.
class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
1325+
// Name given at construction; stored by value so it outlives the Twine.
const std::string Name;
1326+
1327+
public:
1328+
/// Construct the recipe with operands {\p IV, \p TC}, debug location \p DL
/// and name \p Name.
VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
1329+
const Twine &Name = "")
1330+
: VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
1331+
std::initializer_list<VPValue *>{IV, TC}, DL),
1332+
Name(Name.str()) {}
1333+
1334+
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
1335+
1336+
/// Create a copy of this recipe with the same two operands, debug location,
/// name and IR flags.
VPRecipeBase *clone() override {
1337+
SmallVector<VPValue *, 2> Operands(operands());
1338+
assert(Operands.size() == 2 && "by construction");
1339+
auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
1340+
getDebugLoc(), Name);
1341+
New->transferFlags(*this);
1342+
return New;
1343+
}
1344+
1345+
/// Generate the IR for this recipe; defined out of line.
void execute(VPTransformState &State) override;
1346+
1347+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1348+
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
1349+
VPSlotTracker &SlotTracker) const override;
1350+
#endif
1351+
1352+
/// Reports that only the first lane of operand \p Op is used by this recipe.
/// \p Op must be one of the recipe's operands.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1353+
assert(is_contained(operands(), Op) &&
1354+
"Op must be an operand of the recipe");
1355+
1356+
return true;
1357+
}
1358+
1359+
/// Reports that this recipe does not restrict itself to the first unrolled
/// part of operand \p Op. \p Op must be one of the recipe's operands.
bool onlyFirstPartUsed(const VPValue *Op) const override {
1360+
assert(is_contained(operands(), Op) &&
1361+
"Op must be an operand of the recipe");
1362+
1363+
return false;
1364+
}
1365+
};
1367+
13241368
/// VPWidenRecipe is a recipe for producing a copy of vector type its
13251369
/// ingredient. This recipe covers most of the traditional vectorization cases
13261370
/// where each ingredient transforms into a vectorized version of itself.

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,6 @@ m_BranchOnCond(const Op0_t &Op0) {
209209
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
210210
}
211211

212-
template <typename Op0_t, typename Op1_t>
213-
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
214-
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
215-
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
216-
}
217212

218213
template <typename Op0_t, typename Op1_t>
219214
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
@@ -266,6 +261,35 @@ inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
266261
m_Or(const Op0_t &Op0, const Op1_t &Op1) {
267262
return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
268263
}
264+
265+
/// Matcher for a VPActiveLaneMaskRecipe whose first and second operands are
/// matched by the sub-matchers Op0 and Op1 respectively.
template <typename Op0_t, typename Op1_t>
266+
struct VPActiveLaneMask_match {
267+
Op0_t Op0;
268+
Op1_t Op1;
269+
270+
VPActiveLaneMask_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
271+
272+
/// Match against the recipe defining \p V; fails if \p V has no defining
/// recipe.
bool match(const VPValue *V) {
273+
auto *DefR = V->getDefiningRecipe();
274+
return DefR && match(DefR);
275+
}
276+
277+
/// Match \p R: it must be a VPActiveLaneMaskRecipe and both of its operands
/// must be accepted by the corresponding sub-matcher.
bool match(const VPRecipeBase *R) {
278+
auto *DefR = dyn_cast<VPActiveLaneMaskRecipe>(R);
279+
if (!DefR)
280+
return false;
281+
assert(DefR->getNumOperands() == 2 &&
282+
"recipe with matched opcode does not have 2 operands");
283+
return Op0.match(DefR->getOperand(0)) && Op1.match(DefR->getOperand(1));
284+
}
285+
};
286+
287+
/// Build a matcher for a VPActiveLaneMaskRecipe whose two operands are
/// matched by \p Op0 and \p Op1, in that order.
template <typename Op0_t, typename Op1_t>
288+
inline VPActiveLaneMask_match<Op0_t, Op1_t>
289+
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
290+
return {Op0, Op1};
291+
}
292+
269293
} // namespace VPlanPatternMatch
270294
} // namespace llvm
271295

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 86 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -336,24 +336,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
336336
Value *Op2 = State.get(getOperand(2), Part);
337337
return Builder.CreateSelect(Cond, Op1, Op2, Name);
338338
}
339-
case VPInstruction::ActiveLaneMask: {
340-
// Get first lane of vector induction variable.
341-
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
342-
// Get the original loop tripcount.
343-
Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
344339

345-
// If this part of the active lane mask is scalar, generate the CMP directly
346-
// to avoid unnecessary extracts.
347-
if (State.VF.isScalar())
348-
return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
349-
Name);
350-
351-
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
352-
auto *PredTy = VectorType::get(Int1Ty, State.VF);
353-
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
354-
{PredTy, ScalarTC->getType()},
355-
{VIVElem0, ScalarTC}, nullptr, Name);
356-
}
357340
case VPInstruction::FirstOrderRecurrenceSplice: {
358341
// Generate code to combine the previous and current values in vector v3.
359342
//
@@ -618,7 +601,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
618601
case VPInstruction::PtrAdd:
619602
// TODO: Cover additional opcodes.
620603
return vputils::onlyFirstLaneUsed(this);
621-
case VPInstruction::ActiveLaneMask:
622604
case VPInstruction::ExplicitVectorLength:
623605
case VPInstruction::CalculateTripCountMinusVF:
624606
case VPInstruction::CanonicalIVIncrementForPart:
@@ -653,9 +635,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
653635
case VPInstruction::SLPStore:
654636
O << "combined store";
655637
break;
656-
case VPInstruction::ActiveLaneMask:
657-
O << "active lane mask";
658-
break;
659638
case VPInstruction::ExplicitVectorLength:
660639
O << "EXPLICIT-VECTOR-LENGTH";
661640
break;
@@ -692,8 +671,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
692671
DL.print(O);
693672
}
694673
}
674+
675+
/// Print the recipe to \p O as
/// "<Indent>EMIT <result> = active lane mask<flags><operands>", followed by
/// ", !dbg <loc>" when the recipe has a debug location.
void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
676+
VPSlotTracker &SlotTracker) const {
677+
O << Indent << "EMIT ";
678+
679+
printAsOperand(O, SlotTracker);
680+
O << " = active lane mask";
681+
printFlags(O);
682+
printOperands(O, SlotTracker);
683+
684+
// Only append the debug location when one is attached to the recipe.
if (auto DL = getDebugLoc()) {
685+
O << ", !dbg ";
686+
DL.print(O);
687+
}
688+
}
689+
695690
#endif
696691

692+
/// Generate the active lane mask for every unrolled part and record it in
/// \p State.
///
/// Three strategies are used:
///  - Scalar VF: one `icmp ult` of the part's IV against the trip count per
///    part.
///  - Vector VF without usable wide masks: one `llvm.get.active.lane.mask`
///    call of VF lanes per part.
///  - Vector VF with wide masks: one `llvm.get.active.lane.mask` call of
///    MaxPred lanes per group of PartsPerMask consecutive parts, from which
///    each part's VF-wide mask is taken with `llvm.vector.extract`.
///
/// Wide masks are only used when UF > 1, the effective maximum predicate
/// length is a multiple of the VF and strictly greater than it, and UF is a
/// multiple of the number of parts covered by one wide mask.
void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPActiveLaneMaskRecipe executing an Instance");

  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  // If the active lane mask is scalar, generate the CMP directly to avoid
  // unnecessary extracts.
  if (State.VF.isScalar()) {
    for (int Part = State.UF - 1; Part >= 0; --Part) {
      // Get first lane of vector induction variable.
      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
      // Get the original loop tripcount.
      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));

      Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                                   ScalarTC, Name);
      State.set(this, V, Part);
    }
    return;
  }

  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
  auto *PredTy = VectorType::get(Int1Ty, State.VF);

  const unsigned MinVF = State.VF.getKnownMinValue();
  // A wide mask never needs to cover more lanes than all unrolled parts.
  const unsigned MaxPred =
      std::min(State.MaxPred.getKnownMinValue(), State.UF * MinVF);
  // Number of unrolled parts served by one wide mask. This is only meaningful
  // (and non-zero) when MaxPred > MinVF, which the condition below checks
  // before PartsPerMask is used as a divisor.
  const unsigned PartsPerMask = MaxPred / MinVF;

  // Fall back to one VF-wide mask per part when a wide mask would not help or
  // cannot be split evenly. The final check covers UF values that are not a
  // multiple of PartsPerMask: the grouping below would otherwise leave some
  // parts without a wide mask and index LongMask out of bounds.
  if (State.UF <= 1 || MaxPred <= MinVF || MaxPred % MinVF != 0 ||
      State.UF % PartsPerMask != 0) {
    for (int Part = State.UF - 1; Part >= 0; --Part) {
      // Get first lane of vector induction variable.
      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
      // Get the original loop tripcount.
      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
      Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                         {PredTy, ScalarTC->getType()},
                                         {VIVElem0, ScalarTC}, nullptr, Name);
      State.set(this, V, Part);
    }
    return;
  }

  // Generate long active lane masks covering all the unrolled iterations.
  auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
  SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
  // Each wide mask starts at the IV of the first part of its group; UF is a
  // multiple of PartsPerMask here, so the last iteration is Part == 0.
  for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
    Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                       {LongPredTy, ScalarTC->getType()},
                                       {VIVElem0, ScalarTC}, nullptr, Name);
    LongMask[Part / PartsPerMask] = V;
  }

  // Slice each part's VF-wide mask out of the wide mask of its group.
  for (int Part = State.UF - 1; Part >= 0; --Part) {
    Value *ALM = LongMask[Part / PartsPerMask];
    const unsigned I = Part % PartsPerMask;
    Value *V = Builder.CreateIntrinsic(
        Intrinsic::vector_extract, {PredTy, ALM->getType()},
        {ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
                               I * MinVF)},
        nullptr, Name);

    State.set(this, V, Part);
  }
}
761+
697762
void VPWidenCallRecipe::execute(VPTransformState &State) {
698763
assert(State.VF.isVector() && "not widening");
699764
auto &CI = *cast<CallInst>(getUnderlyingInstr());

0 commit comments

Comments
 (0)