Skip to content

Commit 790bbf6

Browse files
committed
[VP][EVL] Support select instruction with EVL-vectorization
1 parent f41f6ea commit 790bbf6

File tree

7 files changed

+227
-7
lines changed

7 files changed

+227
-7
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

+65-3
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
922922
case VPRecipeBase::VPWidenSC:
923923
case VPRecipeBase::VPWidenEVLSC:
924924
case VPRecipeBase::VPWidenSelectSC:
925+
case VPRecipeBase::VPWidenSelectEVLSC:
925926
case VPRecipeBase::VPBlendSC:
926927
case VPRecipeBase::VPPredInstPHISC:
927928
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1689,10 +1690,17 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
16891690

16901691
/// A recipe for widening select instructions.
16911692
struct VPWidenSelectRecipe : public VPSingleDefRecipe {
1693+
1694+
protected:
1695+
template <typename IterT>
1696+
VPWidenSelectRecipe(unsigned VPDefOpcode, SelectInst &I,
1697+
iterator_range<IterT> Operands)
1698+
: VPSingleDefRecipe(VPDefOpcode, Operands, &I, I.getDebugLoc()) {}
1699+
1700+
public:
16921701
template <typename IterT>
16931702
VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
1694-
: VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
1695-
I.getDebugLoc()) {}
1703+
: VPWidenSelectRecipe(VPDef::VPWidenSelectSC, I, Operands) {}
16961704

16971705
~VPWidenSelectRecipe() override = default;
16981706

@@ -1701,7 +1709,15 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
17011709
operands());
17021710
}
17031711

1704-
VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
1712+
static inline bool classof(const VPRecipeBase *R) {
1713+
return R->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
1714+
R->getVPDefID() == VPRecipeBase::VPWidenSelectEVLSC;
1715+
}
1716+
1717+
static inline bool classof(const VPUser *U) {
1718+
auto *R = dyn_cast<VPRecipeBase>(U);
1719+
return R && classof(R);
1720+
}
17051721

17061722
/// Produce a widened version of the select instruction.
17071723
void execute(VPTransformState &State) override;
@@ -1721,6 +1737,52 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
17211737
}
17221738
};
17231739

1740+
// A recipe for widening select instruction with vector-predication intrinsics
1741+
// with explicit vector length (EVL).
1742+
struct VPWidenSelectEVLRecipe : public VPWidenSelectRecipe {
1743+
1744+
template <typename IterT>
1745+
VPWidenSelectEVLRecipe(SelectInst &I, iterator_range<IterT> Operands,
1746+
VPValue &EVL)
1747+
: VPWidenSelectRecipe(VPDef::VPWidenSelectEVLSC, I, Operands) {
1748+
addOperand(&EVL);
1749+
}
1750+
1751+
VPWidenSelectEVLRecipe(VPWidenSelectRecipe &W, VPValue &EVL)
1752+
: VPWidenSelectEVLRecipe(*cast<SelectInst>(W.getUnderlyingInstr()),
1753+
W.operands(), EVL) {}
1754+
1755+
~VPWidenSelectEVLRecipe() override = default;
1756+
1757+
VPWidenSelectEVLRecipe *clone() final {
1758+
llvm_unreachable("VPWidenSelectEVLRecipe cannot be cloned");
1759+
return nullptr;
1760+
}
1761+
1762+
VP_CLASSOF_IMPL(VPDef::VPWidenSelectEVLSC)
1763+
1764+
VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
1765+
const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
1766+
1767+
/// Produce a vp-intrinsic version of the select instruction.
1768+
void execute(VPTransformState &State) final;
1769+
1770+
/// Returns true if the recipe only uses the first lane of operand \p Op.
1771+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1772+
assert(is_contained(operands(), Op) &&
1773+
"Op must be an operand of the recipe");
1774+
// EVL in that recipe is always the last operand, thus any use before means
1775+
// the VPValue should be vectorized.
1776+
return getEVL() == Op;
1777+
}
1778+
1779+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1780+
/// Print the recipe.
1781+
void print(raw_ostream &O, const Twine &Indent,
1782+
VPSlotTracker &SlotTracker) const final;
1783+
#endif
1784+
};
1785+
17241786
/// A recipe for handling GEP instructions.
17251787
class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
17261788
bool isPointerLoopInvariant() const {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+51-3
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
7676
case VPWidenPHISC:
7777
case VPWidenSC:
7878
case VPWidenEVLSC:
79-
case VPWidenSelectSC: {
79+
case VPWidenSelectSC:
80+
case VPWidenSelectEVLSC: {
8081
const Instruction *I =
8182
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
8283
(void)I;
@@ -117,7 +118,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
117118
case VPWidenPHISC:
118119
case VPWidenSC:
119120
case VPWidenEVLSC:
120-
case VPWidenSelectSC: {
121+
case VPWidenSelectSC:
122+
case VPWidenSelectEVLSC: {
121123
const Instruction *I =
122124
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
123125
(void)I;
@@ -168,7 +170,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
168170
case VPWidenPointerInductionSC:
169171
case VPWidenSC:
170172
case VPWidenEVLSC:
171-
case VPWidenSelectSC: {
173+
case VPWidenSelectSC:
174+
case VPWidenSelectEVLSC: {
172175
const Instruction *I =
173176
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
174177
(void)I;
@@ -1060,6 +1063,21 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
10601063
getOperand(2)->printAsOperand(O, SlotTracker);
10611064
O << (isInvariantCond() ? " (condition is loop invariant)" : "");
10621065
}
1066+
1067+
/// Print the recipe as
///   WIDEN-SELECT <def> = vp.select <op0>, <op1>, <op2>, <op3>
/// followed by a note when the condition is loop invariant.
void VPWidenSelectEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = vp.select ";

  // Operands 0 through 3 are printed in order, separated by ", ".
  for (unsigned Idx = 0; Idx != 4; ++Idx) {
    if (Idx != 0)
      O << ", ";
    getOperand(Idx)->printAsOperand(O, SlotTracker);
  }

  if (isInvariantCond())
    O << " (condition is loop invariant)";
}
10631081
#endif
10641082

10651083
void VPWidenSelectRecipe::execute(VPTransformState &State) {
@@ -1082,6 +1100,36 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
10821100
}
10831101
}
10841102

1103+
/// Generate the EVL-based select for this recipe. The scalar EVL, the
/// condition and the two value operands are fetched for part 0, a
/// VectorBuilder configured with the EVL creates the vector form of
/// Instruction::Select, and the result is recorded for this recipe together
/// with metadata taken from the underlying instruction (if any).
void VPWidenSelectEVLRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  Value *ExplicitVL = State.get(getEVL(), 0, /*NeedsScalar=*/true);
  IRBuilderBase &IRB = State.Builder;
  VectorBuilder VPBuilder(IRB);
  VPBuilder.setEVL(ExplicitVL);

  // A loop-invariant condition may still be defined inside the loop, so the
  // original 'cond' value cannot be used directly. Take lane 0 of the
  // vectorized value instead; Instcombine will make this a no-op.
  Value *CondV = nullptr;
  if (isInvariantCond())
    CondV = State.get(getCond(), VPIteration(0, 0));
  if (!CondV)
    CondV = State.get(getCond(), 0);

  // A scalar condition is broadcast to a vector before it is passed on.
  if (!isa<VectorType>(CondV->getType()))
    CondV = IRB.CreateVectorSplat(State.VF, CondV, "splat.cond");

  Value *TrueVal = State.get(getOperand(1), 0);
  Value *FalseVal = State.get(getOperand(2), 0);
  Value *VPSelect = VPBuilder.createVectorInstruction(
      Instruction::Select, TrueVal->getType(), {CondV, TrueVal, FalseVal},
      "vp.select");
  State.set(this, VPSelect, 0);

  auto *Underlying = dyn_cast_or_null<Instruction>(getUnderlyingValue());
  State.addMetadata(VPSelect, Underlying);
}
1132+
10851133
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
10861134
const FastMathFlags &FMF) {
10871135
AllowReassoc = FMF.allowReassoc();

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
13441344
return nullptr;
13451345
return new VPWidenEVLRecipe(*W, EVL);
13461346
})
1347+
.Case<VPWidenSelectRecipe>(
1348+
[&](VPWidenSelectRecipe *W) -> VPRecipeBase * {
1349+
return new VPWidenSelectEVLRecipe(*W, EVL);
1350+
})
13471351
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
13481352
VPValue *NewMask = GetNewMask(Red->getCondOp());
13491353
return new VPReductionEVLRecipe(*Red, EVL, NewMask);

llvm/lib/Transforms/Vectorize/VPlanValue.h

+1
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ class VPDef {
359359
VPWidenSC,
360360
VPWidenEVLSC,
361361
VPWidenSelectSC,
362+
VPWidenSelectEVLSC,
362363
VPBlendSC,
363364
// START: Phi-like recipes. Need to be kept together.
364365
VPWidenPHISC,

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
148148
return VerifyEVLUse(
149149
*W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2);
150150
})
151+
.Case<VPWidenSelectEVLRecipe>(
152+
[&](const VPWidenSelectEVLRecipe *S) {
153+
return VerifyEVLUse(*S, 3);
154+
})
151155
.Case<VPReductionEVLRecipe>([&](const VPReductionEVLRecipe *R) {
152156
return VerifyEVLUse(*R, 2);
153157
})

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
7070
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
7171
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
7272
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
73-
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
73+
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
7474
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
7575
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
7676
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
; REQUIRES: asserts
2+
3+
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
4+
; RUN: -force-tail-folding-style=data-with-evl \
5+
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
6+
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
7+
8+
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
9+
; RUN: -force-tail-folding-style=none \
10+
; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
11+
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
12+
13+
define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
14+
; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
15+
; IF-EVL-NEXT: Live-in vp<%0> = VF * UF
16+
; IF-EVL-NEXT: Live-in vp<%1> = vector-trip-count
17+
; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
18+
19+
; IF-EVL: vector.ph:
20+
; IF-EVL-NEXT: Successor(s): vector loop
21+
22+
; IF-EVL: <x1> vector loop: {
23+
; IF-EVL-NEXT: vector.body:
24+
; IF-EVL-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%12>
25+
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<%4> = phi ir<0>, vp<%11>
26+
; IF-EVL-NEXT: EMIT vp<%5> = EXPLICIT-VECTOR-LENGTH vp<%4>, ir<%N>
27+
; IF-EVL-NEXT: vp<%6> = SCALAR-STEPS vp<%4>, ir<1>
28+
; IF-EVL-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%6>
29+
; IF-EVL-NEXT: vp<%7> = vector-pointer ir<%arrayidx>
30+
; IF-EVL-NEXT: WIDEN ir<%0> = vp.load vp<%7>, vp<%5>
31+
; IF-EVL-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%c>, vp<%6>
32+
; IF-EVL-NEXT: vp<%8> = vector-pointer ir<%arrayidx3>
33+
; IF-EVL-NEXT: WIDEN ir<%1> = vp.load vp<%8>, vp<%5>
34+
; IF-EVL-NEXT: WIDEN ir<%cmp4> = icmp sgt ir<%0>, ir<%1>
35+
; IF-EVL-NEXT: WIDEN ir<%2> = vp.sub ir<0>, ir<%1>, vp<%5>
36+
; IF-EVL-NEXT: WIDEN-SELECT ir<%cond.p> = vp.select ir<%cmp4>, ir<%1>, ir<%2>, vp<%5>
37+
; IF-EVL-NEXT: WIDEN ir<%cond> = vp.add ir<%cond.p>, ir<%0>, vp<%5>
38+
; IF-EVL-NEXT: CLONE ir<%arrayidx15> = getelementptr inbounds ir<%a>, vp<%6>
39+
; IF-EVL-NEXT: vp<%9> = vector-pointer ir<%arrayidx15>
40+
; IF-EVL-NEXT: WIDEN vp.store vp<%9>, ir<%cond>, vp<%5>
41+
; IF-EVL-NEXT: SCALAR-CAST vp<%10> = zext vp<%5> to i64
42+
; IF-EVL-NEXT: EMIT vp<%11> = add vp<%10>, vp<%4>
43+
; IF-EVL-NEXT: EMIT vp<%12> = add vp<%3>, vp<%0>
44+
; IF-EVL-NEXT: EMIT branch-on-count vp<%12>, vp<%1>
45+
; IF-EVL-NEXT: No successors
46+
; IF-EVL-NEXT: }
47+
48+
; NO-VP: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
49+
; NO-VP-NEXT: Live-in vp<%0> = VF * UF
50+
; NO-VP-NEXT: Live-in vp<%1> = vector-trip-count
51+
; NO-VP-NEXT: Live-in ir<%N> = original trip-count
52+
53+
; NO-VP: vector.ph:
54+
; NO-VP-NEXT: Successor(s): vector loop
55+
56+
; NO-VP: <x1> vector loop: {
57+
; NO-VP-NEXT: vector.body:
58+
; NO-VP-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%7>
59+
; NO-VP-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
60+
; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%3>
61+
; NO-VP-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
62+
; NO-VP-NEXT: WIDEN ir<%0> = load vp<%4>
63+
; NO-VP-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%c>, vp<%3>
64+
; NO-VP-NEXT: vp<%5> = vector-pointer ir<%arrayidx3>
65+
; NO-VP-NEXT: WIDEN ir<%1> = load vp<%5>
66+
; NO-VP-NEXT: WIDEN ir<%cmp4> = icmp sgt ir<%0>, ir<%1>
67+
; NO-VP-NEXT: WIDEN ir<%2> = sub ir<0>, ir<%1>
68+
; NO-VP-NEXT: WIDEN-SELECT ir<%cond.p> = select ir<%cmp4>, ir<%1>, ir<%2>
69+
; NO-VP-NEXT: WIDEN ir<%cond> = add ir<%cond.p>, ir<%0>
70+
; NO-VP-NEXT: CLONE ir<%arrayidx15> = getelementptr inbounds ir<%a>, vp<%3>
71+
; NO-VP-NEXT: vp<%6> = vector-pointer ir<%arrayidx15>
72+
; NO-VP-NEXT: WIDEN store vp<%6>, ir<%cond>
73+
; NO-VP-NEXT: EMIT vp<%7> = add nuw vp<%2>, vp<%0>
74+
; NO-VP-NEXT: EMIT branch-on-count vp<%7>, vp<%1>
75+
; NO-VP-NEXT: No successors
76+
; NO-VP-NEXT: }
77+
78+
79+
entry:
80+
%cmp30 = icmp sgt i64 %N, 0
81+
br i1 %cmp30, label %for.body, label %for.cond.cleanup
82+
83+
for.cond.cleanup:
84+
ret void
85+
86+
for.body:
87+
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
88+
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
89+
%0 = load i32, ptr %arrayidx, align 4
90+
%arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
91+
%1 = load i32, ptr %arrayidx3, align 4
92+
%cmp4 = icmp sgt i32 %0, %1
93+
%2 = sub i32 0, %1
94+
%cond.p = select i1 %cmp4, i32 %1, i32 %2
95+
%cond = add i32 %cond.p, %0
96+
%arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
97+
store i32 %cond, ptr %arrayidx15, align 4
98+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
99+
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
100+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
101+
}

0 commit comments

Comments
 (0)