
Commit 541f273

[LV][EVL] Support call instruction with EVL-vectorization
Only smax/smin/umax/umin are supported for now.
1 parent c1621ed commit 541f273
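For context, the change hinges on the existing mapping from plain intrinsics to their llvm.vp.* counterparts; the four supported calls are exactly those with a direct mapping. A minimal sketch of that check (hasVPCounterpart is a hypothetical helper, not part of this commit):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Hypothetical helper: true if a plain intrinsic has a vector-predication
// (llvm.vp.*) counterpart, e.g. smax -> vp.smax, umin -> vp.umin.
static bool hasVPCounterpart(Intrinsic::ID ID) {
  return VPIntrinsic::getForIntrinsic(ID) != Intrinsic::not_intrinsic;
}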

13 files changed: +220 -22 lines

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 6 additions & 0 deletions
@@ -160,6 +160,12 @@ bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx);
 Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
                                           const TargetLibraryInfo *TLI);

+/// Returns the VP intrinsic ID for a call.
+/// For the input call instruction, finds the matching intrinsic and returns
+/// its VP intrinsic ID; if none is found, returns not_intrinsic.
+Intrinsic::ID getVPIntrinsicIDForCall(const CallInst *CI,
+                                      const TargetLibraryInfo *TLI);
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.

llvm/include/llvm/IR/VectorBuilder.h

Lines changed: 2 additions & 2 deletions
@@ -99,11 +99,11 @@ class VectorBuilder {
                                   const Twine &Name = Twine());

   /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param RdxID The intrinsic ID of llvm.vector.reduce.*
+  /// \param ID The intrinsic ID of the call intrinsic.
   /// \param ValTy The type of operand which the reduction operation is
   /// performed.
   /// \param VecOpArray The operand list.
-  Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
+  Value *createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
                                ArrayRef<Value *> VecOpArray,
                                const Twine &Name = Twine());
 };

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 7 additions & 0 deletions
@@ -169,6 +169,13 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
   return Intrinsic::not_intrinsic;
 }

+Intrinsic::ID llvm::getVPIntrinsicIDForCall(const CallInst *CI,
+                                            const TargetLibraryInfo *TLI) {
+  Intrinsic::ID ID = getIntrinsicForCallSite(*CI, TLI);
+
+  return VPIntrinsic::getForIntrinsic(ID);
+}
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
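A hedged usage sketch of the new helper; canWidenCallWithEVL is illustrative only and not part of the patch:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative only: a call such as `%r = call i32 @llvm.smax.i32(i32 %a, i32 %b)`
// yields Intrinsic::vp_smax; a call without a VP counterpart yields
// Intrinsic::not_intrinsic.
static bool canWidenCallWithEVL(const CallInst *CI,
                                const TargetLibraryInfo *TLI) {
  return getVPIntrinsicIDForCall(CI, TLI) != Intrinsic::not_intrinsic;
}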

llvm/lib/IR/VectorBuilder.cpp

Lines changed: 4 additions & 5 deletions
@@ -60,13 +60,12 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
   return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
 }

-Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
-                                            Type *ValTy,
+Value *VectorBuilder::createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
                                             ArrayRef<Value *> InstOpArray,
                                             const Twine &Name) {
-  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
-  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
-         "No VPIntrinsic for this reduction");
+  auto VPID = VPIntrinsic::getForIntrinsic(ID);
+  assert(VPIntrinsic::isVPIntrinsic(VPID) &&
+         "No VPIntrinsic for this Intrinsic");
   return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
 }
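To show what the generalized builder entry point accepts, a minimal sketch assuming the caller supplies the IR builder, the operands, and a scalar EVL value; emitVPSMax is a hypothetical function, not LLVM API:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/VectorBuilder.h"

using namespace llvm;

// Hypothetical sketch: emit llvm.vp.smax through the renamed
// createSimpleIntrinsic, with an all-true mask and an explicit vector length.
static Value *emitVPSMax(IRBuilderBase &Builder, VectorType *VecTy, Value *A,
                         Value *B, Value *EVL) {
  VectorBuilder VB(Builder);
  Value *Mask =
      Builder.CreateVectorSplat(VecTy->getElementCount(), Builder.getTrue());
  VB.setMask(Mask).setEVL(EVL);
  Value *Ops[] = {A, B};
  // Previously createSimpleReduction, restricted to llvm.vector.reduce.*;
  // now any intrinsic with a VP counterpart is accepted.
  return VB.createSimpleIntrinsic(Intrinsic::smax, VecTy, Ops, "vp.smax");
}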

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 2 additions & 2 deletions
@@ -1299,7 +1299,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
   Type *SrcEltTy = SrcTy->getElementType();
   Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
   Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
 }

 Value *llvm::createReduction(IRBuilderBase &B,
@@ -1342,7 +1342,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
   Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
 }

 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 1 deletion
@@ -8616,7 +8616,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     // TODO: try to put it close to addActiveLaneMask().
     // Discard the plan if it is not EVL-compatible
     if (CM.foldTailWithEVL() &&
-        !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+        !VPlanTransforms::tryAddExplicitVectorLength(*Plan, *TLI))
       break;
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 96 additions & 4 deletions
@@ -883,6 +883,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPScalarIVStepsSC:
     case VPRecipeBase::VPVectorPointerSC:
     case VPRecipeBase::VPWidenCallSC:
+    case VPRecipeBase::VPWidenCallEVLSC:
     case VPRecipeBase::VPWidenCanonicalIVSC:
     case VPRecipeBase::VPWidenCastSC:
     case VPRecipeBase::VPWidenGEPSC:
@@ -1610,6 +1611,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {

 /// A recipe for widening Call instructions.
 class VPWidenCallRecipe : public VPSingleDefRecipe {
+public:
   /// ID of the vector intrinsic to call when widening the call. If set the
   /// Intrinsic::not_intrinsic, a library call will be used instead.
   Intrinsic::ID VectorIntrinsicID;
@@ -1619,26 +1621,48 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
   /// VF with a valid variant.
   Function *Variant;

-public:
+protected:
   template <typename IterT>
-  VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
+  VPWidenCallRecipe(unsigned VPDefOpcode, Value *UV,
+                    iterator_range<IterT> CallArguments,
                     Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
                     Function *Variant = nullptr)
-      : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
+      : VPSingleDefRecipe(VPDefOpcode, CallArguments, UV, DL),
         VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
     assert(
         isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
         "last operand must be the called function");
   }

+public:
+  template <typename IterT>
+  VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
+                    Intrinsic::ID VectorIntrinsicID, DebugLoc DL)
+      : VPWidenCallRecipe(VPDef::VPWidenCallSC, UV, CallArguments,
+                          VectorIntrinsicID, DL) {}
+
+  template <typename IterT>
+  VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
+                    Intrinsic::ID VectorIntrinsicID, DebugLoc DL,
+                    Function *Variant)
+      : VPWidenCallRecipe(VPDef::VPWidenCallSC, UV, CallArguments,
+                          VectorIntrinsicID, DL, Variant) {}
+
   ~VPWidenCallRecipe() override = default;

   VPWidenCallRecipe *clone() override {
     return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
                                  VectorIntrinsicID, getDebugLoc(), Variant);
   }
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPWidenCallSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenCallEVLSC;
+  }

-  VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }

   /// Produce a widened version of the call instruction.
   void execute(VPTransformState &State) override;
@@ -1665,6 +1689,74 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
 #endif
 };

+/// A recipe for widening Call instructions with vector-predication intrinsics
+/// with explicit vector length (EVL).
+class VPWidenCallEVLRecipe : public VPWidenCallRecipe {
+  // using VPRecipeWithIRFlags::transferFlags;
+  // Intrinsic::ID VectorIntrinsicID;
+
+public:
+  template <typename IterT>
+  VPWidenCallEVLRecipe(Value *UV, iterator_range<IterT> CallArguments,
+                       Intrinsic::ID VectorIntrinsicID, DebugLoc DL,
+                       VPValue &EVL)
+      : VPWidenCallRecipe(VPDef::VPWidenCallEVLSC, UV, CallArguments,
+                          VectorIntrinsicID, DL) {
+    addOperand(&EVL);
+  }
+
+  VPWidenCallEVLRecipe(VPWidenCallRecipe &W, Intrinsic::ID VectorIntrinsicID,
+                       DebugLoc DL, VPValue &EVL)
+      : VPWidenCallEVLRecipe(W.getUnderlyingValue(), W.operands(),
+                             VectorIntrinsicID, DL, EVL) {}
+
+  ~VPWidenCallEVLRecipe() override = default;
+
+  VPWidenCallEVLRecipe *clone() override {
+    llvm_unreachable("VPWidenCallEVLRecipe cannot be cloned");
+    return nullptr;
+  }
+
+  VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
+  const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
+
+  // Intrinsic::ID getVectorIntrinsicID() {
+  //   return VectorIntrinsicID;
+  // }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenCallEVLSC)
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const final;
+
+  Function *getCalledScalarFunction() const {
+    return cast<Function>(getOperand(getNumOperands() - 2)->getLiveInIRValue());
+  }
+
+  operand_range arg_operands() {
+    return make_range(op_begin(), op_begin() + getNumOperands() - 2);
+  }
+  const_operand_range arg_operands() const {
+    return make_range(op_begin(), op_begin() + getNumOperands() - 2);
+  }
+
+  /// Produce a widened version of the call instruction.
+  void execute(VPTransformState &State) final;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // EVL in this recipe is always the last operand; any use before it means
+    // the VPValue should be vectorized.
+    return getEVL() == Op;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const final;
+#endif
+};
+
 /// A recipe representing a sequence of load -> update -> store as part of
 /// a histogram operation. This means there may be aliasing between vector
 /// lanes, which is handled by the llvm.experimental.vector.histogram family
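Because VPWidenCallEVLRecipe derives from VPWidenCallRecipe and the base classof accepts both sub-class IDs, isa<> behaves as sketched below; the helper names are made up for illustration:

#include "llvm/Support/Casting.h"
#include "VPlan.h" // vectorizer-internal header, included here only for context

using namespace llvm;

// With the classof definitions above, the plain recipe check also matches the
// EVL variant, while the EVL check stays exact.
static bool isAnyWidenedCall(const VPRecipeBase *R) {
  return isa<VPWidenCallRecipe>(R); // VPWidenCallSC or VPWidenCallEVLSC
}
static bool isEVLWidenedCall(const VPRecipeBase *R) {
  return isa<VPWidenCallEVLRecipe>(R); // VPWidenCallEVLSC only
}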

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 81 additions & 0 deletions
@@ -76,6 +76,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayWriteToMemory();
   case VPWidenCallSC:
+  // case VPWidenCallEVLSC:
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyReadsMemory();
@@ -117,6 +118,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayReadFromMemory();
   case VPWidenCallSC:
+  // case VPWidenCallEVLSC:
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyWritesMemory();
@@ -158,6 +160,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
+  // case VPWidenCallEVLSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
@@ -951,6 +954,52 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
   State.addMetadata(V, CI);
 }

+void VPWidenCallEVLRecipe::execute(VPTransformState &State) {
+  Function *CalledScalarFn = getCalledScalarFunction();
+  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
+         "DbgInfoIntrinsic should have been dropped during VPlan construction");
+  State.setDebugLocFrom(getDebugLoc());
+
+  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
+
+  // TODO: support more intrinsics; currently only
+  // llvm.smax/llvm.smin/llvm.umax/llvm.umin are handled.
+  auto *TysForDecl = VectorType::get(
+      CalledScalarFn->getReturnType()->getScalarType(), State.VF);
+
+  SmallVector<Value *, 4> Args;
+  for (const auto &I : enumerate(arg_operands())) {
+    Value *Arg;
+    if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(
+                            CalledScalarFn->getIntrinsicID(), I.index()))
+      Arg = State.get(I.value(), VPLane(0));
+    else
+      Arg = State.get(I.value());
+    Args.push_back(Arg);
+  }
+
+  IRBuilderBase &BuilderIR = State.Builder;
+  VectorBuilder VBuilder(BuilderIR);
+  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+  VBuilder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true));
+
+  auto VPInst = VBuilder.createSimpleIntrinsic(VectorIntrinsicID, TysForDecl,
+                                               Args, "vp.call");
+  // FIXME: IR/Recipe/EVLRecipe carry the same flags. Can they be copied from IR?
+  if (VPInst) {
+    if (auto *VecOp = dyn_cast<CallInst>(VPInst))
+      VecOp->copyIRFlags(getUnderlyingInstr());
+  }
+
+  auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  if (CI)
+    CI->getOperandBundlesAsDefs(OpBundles);
+
+  State.set(this, VPInst);
+  State.addMetadata(VPInst, CI);
+}
+
 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -998,6 +1047,12 @@ InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }

+// TODO: Reimplement computeCost.
+InstructionCost VPWidenCallEVLRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  return VPRecipeBase::computeCost(VF, Ctx);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
@@ -1115,6 +1170,32 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
   }
 }

+void VPWidenCallEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                 VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN-CALL ";
+
+  Function *CalledFn = getCalledScalarFunction();
+  if (CalledFn->getReturnType()->isVoidTy())
+    O << "void ";
+  else {
+    printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+
+  O << "vp.call @" << CalledFn->getName() << "(";
+  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
+    Op->printAsOperand(O, SlotTracker);
+  });
+  O << ")";
+
+  if (VectorIntrinsicID)
+    O << " (using vector intrinsic)";
+  else
+    O << " (using library function)";
+}
+
 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-SELECT ";

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 11 additions & 3 deletions
@@ -1350,7 +1350,8 @@ void VPlanTransforms::addActiveLaneMask(
 }

 /// Replace recipes with their EVL variants.
-static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL,
+                                         const TargetLibraryInfo &TLI) {
   SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
@@ -1379,6 +1380,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                   return nullptr;
                 return new VPWidenEVLRecipe(*W, EVL);
               })
+              .Case<VPWidenCallRecipe>([&](VPWidenCallRecipe *W) {
+                auto *CI = cast<CallInst>(W->getUnderlyingInstr());
+                Intrinsic::ID VPID = getVPIntrinsicIDForCall(CI, &TLI);
+                return new VPWidenCallEVLRecipe(*W, VPID, CI->getDebugLoc(),
+                                                EVL);
+              })
               .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
                 VPValue *NewMask = GetNewMask(Red->getCondOp());
                 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -1429,7 +1436,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 /// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan,
+                                                 const TargetLibraryInfo &TLI) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   // The transform updates all users of inductions to work based on EVL, instead
   // of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1481,7 +1489,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
   NextEVLIV->insertBefore(CanonicalIVIncrement);
   EVLPhi->addOperand(NextEVLIV);

-  transformRecipestoEVLRecipes(Plan, *VPEVL);
+  transformRecipestoEVLRecipes(Plan, *VPEVL, TLI);

   // Replace all uses of VPCanonicalIVPHIRecipe by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
