
Commit 71e24f0

[VPlan] Rewrite optimizeMaskToEVL in terms of pattern matching. NFC
Stacked on llvm#155383

Currently in optimizeMaskToEVL we convert every widened load, store or reduction to a VP predicated recipe with EVL, regardless of whether or not it uses the header mask. So currently we have to be careful when working on other parts of VPlan to make sure that the EVL transform doesn't break or transform something incorrectly, because it's not a semantics-preserving transform. Forgetting to do so has caused miscompiles before, like the case that was fixed in llvm#113667.

This PR rewrites it to work in terms of pattern matching, so it now only converts a recipe to a VP predicated recipe if it uses the header mask. It also splits out the load/store transforms into separate patterns for reversed and non-reversed, which should make llvm#146525 easier to implement and reason about.

After this the transform should be a true optimisation and not change any semantics, so it shouldn't miscompile things if other parts of VPlan change.

This fixes llvm#152541, and allows us to move addExplicitVectorLength into tryToBuildVPlanWithVPRecipes in llvm#153144.
1 parent 4c91627 commit 71e24f0
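For a quick sense of the new shape of the code before reading the diff, the non-reversed load case now reads roughly as follows. This is a sketch distilled from the change to VPlanTransforms.cpp below, not the full implementation; m_RemoveMask(HeaderMask, Mask) matches a mask that either is the header mask or logical-ands it in, capturing any remainder in Mask.

// Only recipes actually masked by the header mask are rewritten; everything
// else is left untouched, so the transform no longer changes semantics.
VPValue *Addr = nullptr, *Mask = nullptr;
if (match(&CurRecipe, m_Load(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
  return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
                                  EVL, Mask);
return nullptr; // No header-mask use found: keep the original recipe.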

4 files changed (+152, -70 lines)

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 75 additions & 1 deletion
@@ -255,7 +255,8 @@ struct Recipe_match {
     if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
                   std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
                   std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
-                  std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
+                  std::is_same<RecipeTy, VPWidenGEPRecipe>::value ||
+                  std::is_same<RecipeTy, VPVectorEndPointerRecipe>::value)
       return DefR;
     else
       return DefR && DefR->getOpcode() == Opcode;
@@ -587,6 +588,79 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
   return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
 }
 
+template <typename Addr_t, typename Mask_t, bool Reverse> struct Load_match {
+  Addr_t Addr;
+  Mask_t Mask;
+
+  Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {}
+
+  template <typename OpTy> bool match(const OpTy *V) const {
+    auto *Load = dyn_cast<VPWidenLoadRecipe>(V);
+    if (!Load || Load->isReverse() != Reverse || !Addr.match(Load->getAddr()) ||
+        !Load->isMasked() || !Mask.match(Load->getMask()))
+      return false;
+    return true;
+  }
+};
+
+/// Match a non-reversed masked load.
+template <typename Addr_t, typename Mask_t>
+inline Load_match<Addr_t, Mask_t, false> m_Load(const Addr_t &Addr,
+                                                const Mask_t &Mask) {
+  return Load_match<Addr_t, Mask_t, false>(Addr, Mask);
+}
+
+/// Match a reversed masked load.
+template <typename Addr_t, typename Mask_t>
+inline Load_match<Addr_t, Mask_t, true> m_ReverseLoad(const Addr_t &Addr,
+                                                      const Mask_t &Mask) {
+  return Load_match<Addr_t, Mask_t, true>(Addr, Mask);
+}
+
+template <typename Addr_t, typename Val_t, typename Mask_t, bool Reverse>
+struct Store_match {
+  Addr_t Addr;
+  Val_t Val;
+  Mask_t Mask;
+
+  Store_match(Addr_t Addr, Val_t Val, Mask_t Mask)
+      : Addr(Addr), Val(Val), Mask(Mask) {}
+
+  template <typename OpTy> bool match(const OpTy *V) const {
+    auto *Store = dyn_cast<VPWidenStoreRecipe>(V);
+    if (!Store || Store->isReverse() != Reverse ||
+        !Addr.match(Store->getAddr()) || !Val.match(Store->getStoredValue()) ||
+        !Store->isMasked() || !Mask.match(Store->getMask()))
+      return false;
+    return true;
+  }
+};
+
+/// Match a non-reversed masked store.
+template <typename Addr_t, typename Val_t, typename Mask_t>
+inline Store_match<Addr_t, Val_t, Mask_t, false>
+m_Store(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) {
+  return Store_match<Addr_t, Val_t, Mask_t, false>(Addr, Val, Mask);
+}
+
+/// Match a reversed masked store.
+template <typename Addr_t, typename Val_t, typename Mask_t>
+inline Store_match<Addr_t, Val_t, Mask_t, true>
+m_ReverseStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) {
+  return Store_match<Addr_t, Val_t, Mask_t, true>(Addr, Val, Mask);
+}
+
+template <typename Op0_t, typename Op1_t>
+using VectorEndPointerRecipe_match =
+    Recipe_match<std::tuple<Op0_t, Op1_t>, 0,
+                 /*Commutative*/ false, VPVectorEndPointerRecipe>;
+
+template <typename Op0_t, typename Op1_t>
+VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0,
+                                                       const Op1_t &Op1) {
+  return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1);
+}
+
 /// Match a call argument at a given argument index.
 template <typename Opnd_t> struct Argument_match {
   /// Call argument index to match.
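For orientation, here is a hedged usage sketch of the new matchers, mirroring how the VPlanTransforms.cpp change below combines them; CurRecipe, Plan and the captured VPValue pointers are assumed to be in scope as they are in that file.

// Sketch only: match a reversed, masked widened load whose address is a
// VPVectorEndPointerRecipe whose second operand is the plan's VF.
VPValue *EndPtr = nullptr, *Addr = nullptr, *Mask = nullptr;
if (match(&CurRecipe, m_ReverseLoad(m_VPValue(EndPtr), m_VPValue(Mask))) &&
    match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
  // Addr is the scalar base pointer and Mask is the load's mask; both were
  // captured by the m_VPValue placeholders above.
}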

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 75 additions & 67 deletions
@@ -2272,90 +2272,98 @@ void VPlanTransforms::addActiveLaneMask(
   HeaderMask->eraseFromParent();
 }
 
+template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
+  Op0_t In;
+  Op1_t &Out;
+
+  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
+
+  template <typename OpTy> bool match(OpTy *V) const {
+    if (m_Specific(In).match(V)) {
+      Out = nullptr;
+      return true;
+    }
+    if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V))
+      return true;
+    return false;
+  }
+};
+
+/// Match a specific mask \p in, or a combination of it (logical-and in, out).
+/// Returns the remaining part \p out if so, or nullptr otherwise.
+template <typename Op0_t, typename Op1_t>
+static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
+                                                          Op1_t &Out) {
+  return RemoveMask_match<Op0_t, Op1_t>(In, Out);
+}
+
 /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
 /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
 /// recipe could be created.
 /// \p HeaderMask Header Mask.
 /// \p CurRecipe Recipe to be transform.
 /// \p TypeInfo VPlan-based type analysis.
-/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
 /// \p EVL The explicit vector length parameter of vector-predication
 /// intrinsics.
 static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                        VPRecipeBase &CurRecipe,
-                                       VPTypeAnalysis &TypeInfo,
-                                       VPValue &AllOneMask, VPValue &EVL) {
-  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
-  // header mask.
-  auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
-    assert(OrigMask && "Unmasked recipe when folding tail");
-    // HeaderMask will be handled using EVL.
-    VPValue *Mask;
-    if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
-      return Mask;
-    return HeaderMask == OrigMask ? nullptr : OrigMask;
-  };
+                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
+  VPlan *Plan = CurRecipe.getParent()->getPlan();
+  VPValue *Addr, *Mask, *EndPtr;
 
   /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
-  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
-    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
-    if (!EndPtr)
-      return Addr;
-    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
-           "VPVectorEndPointerRecipe with non-VF VF operand?");
-    assert(
-        all_of(EndPtr->users(),
-               [](VPUser *U) {
-                 return cast<VPWidenMemoryRecipe>(U)->isReverse();
-               }) &&
-        "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
-    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
-    EVLAddr->insertBefore(&CurRecipe);
-    EVLAddr->setOperand(1, &EVL);
-    return EVLAddr;
+  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
+    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
+    EVLEndPtr->insertBefore(&CurRecipe);
+    EVLEndPtr->setOperand(1, &EVL);
+    return EVLEndPtr;
   };
 
-  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
-      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
-        VPValue *NewMask = GetNewMask(L->getMask());
-        VPValue *NewAddr = GetNewAddr(L->getAddr());
-        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
-      })
-      .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
-        VPValue *NewMask = GetNewMask(S->getMask());
-        VPValue *NewAddr = GetNewAddr(S->getAddr());
-        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
-      })
-      .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
-        VPValue *NewMask = GetNewMask(IR->getMask());
-        return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
-      })
-      .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
-        VPValue *NewMask = GetNewMask(Red->getCondOp());
-        return new VPReductionEVLRecipe(*Red, EVL, NewMask);
-      })
-      .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
-        VPValue *LHS, *RHS;
-        // Transform select with a header mask condition
-        // select(header_mask, LHS, RHS)
-        // into vector predication merge.
-        // vp.merge(all-true, LHS, RHS, EVL)
-        if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
-                                 m_VPValue(RHS))))
-          return nullptr;
-        // Use all true as the condition because this transformation is
-        // limited to selects whose condition is a header mask.
-        return new VPWidenIntrinsicRecipe(
-            Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
-            TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
-      })
-      .Default([&](VPRecipeBase *R) { return nullptr; });
+  if (match(&CurRecipe,
+            m_Load(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
+    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
+                                    EVL, Mask);
+
+  if (match(&CurRecipe,
+            m_ReverseLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
+      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))))
+    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
+                                    AdjustEndPtr(EndPtr), EVL, Mask);
+
+  if (match(&CurRecipe, m_Store(m_VPValue(Addr), m_VPValue(),
+                                m_RemoveMask(HeaderMask, Mask))))
+    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
+                                     EVL, Mask);
+
+  if (match(&CurRecipe, m_ReverseStore(m_VPValue(EndPtr), m_VPValue(),
+                                       m_RemoveMask(HeaderMask, Mask))) &&
+      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))))
+    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
+                                     AdjustEndPtr(EndPtr), EVL, Mask);
+
+  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
+    if (Rdx->isConditional() &&
+        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
+      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
+
+  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
+    if (Interleave->getMask() &&
+        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
+      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
+
+  VPValue *LHS, *RHS;
+  if (match(&CurRecipe,
+            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
+    return new VPWidenIntrinsicRecipe(
+        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
+        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
+
+  return nullptr;
 }
 
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPTypeAnalysis TypeInfo(Plan);
-  VPValue *AllOneMask = Plan.getTrue();
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

@@ -2414,7 +2422,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
           ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
       VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
           Intrinsic::experimental_vp_splice,
-          {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
+          {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
          TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
       VPSplice->insertBefore(&R);
       R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
@@ -2448,7 +2456,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   for (VPUser *U : collectUsersRecursively(EVLMask)) {
     auto *CurRecipe = cast<VPRecipeBase>(U);
     VPRecipeBase *EVLRecipe =
-        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
     if (!EVLRecipe)
       continue;

llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll

Lines changed: 1 addition & 1 deletion
@@ -454,7 +454,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT: [[TMP22:%.*]] = or <vscale x 8 x i1> [[TMP19]], [[TMP21]]
 ; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
 ; CHECK-NEXT: [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> [[TMP24]])
 ; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP27]] to i64
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]

llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 8 x i1> [[TMP21]], i32 0
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]]
-; CHECK-NEXT: call void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16> zeroinitializer, ptr align 2 [[TMP24]], <vscale x 8 x i1> [[TMP22]], i32 [[TMP25]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> zeroinitializer, ptr [[TMP24]], i32 2, <vscale x 8 x i1> [[TMP22]])
 ; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP26]], [[INDEX]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
