[VPlan] Perform optimizeMaskToEVL in terms of pattern matching #155394
base: main
Changes from all commits
71e24f0
f6c13fb
3e19e28
9a6ac60
479951d
@@ -262,7 +262,8 @@ struct Recipe_match {
    // Check for recipes that do not have opcodes.
    if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> ||
                  std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> ||
                  std::is_same_v<RecipeTy, VPDerivedIVRecipe>)
                  std::is_same_v<RecipeTy, VPDerivedIVRecipe> ||
                  std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>)
      return DefR;
    else
      return DefR && DefR->getOpcode() == Opcode;
@@ -603,6 +604,79 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
  return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
}

template <typename Addr_t, typename Mask_t, bool Reverse> struct Load_match {
  Addr_t Addr;
  Mask_t Mask;

  Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {}

  template <typename OpTy> bool match(const OpTy *V) const {
    auto *Load = dyn_cast<VPWidenLoadRecipe>(V);
    if (!Load || Load->isReverse() != Reverse || !Addr.match(Load->getAddr()) ||
        !Load->isMasked() || !Mask.match(Load->getMask()))
      return false;
    return true;
  }
};

/// Match a non-reversed masked load.
template <typename Addr_t, typename Mask_t>
inline Load_match<Addr_t, Mask_t, false> m_Load(const Addr_t &Addr,
                                                const Mask_t &Mask) {
  return Load_match<Addr_t, Mask_t, false>(Addr, Mask);
}

/// Match a reversed masked load.
template <typename Addr_t, typename Mask_t>
inline Load_match<Addr_t, Mask_t, true> m_ReverseLoad(const Addr_t &Addr,
                                                      const Mask_t &Mask) {
  return Load_match<Addr_t, Mask_t, true>(Addr, Mask);
}

template <typename Addr_t, typename Val_t, typename Mask_t, bool Reverse>
struct Store_match {
  Addr_t Addr;
  Val_t Val;
  Mask_t Mask;

  Store_match(Addr_t Addr, Val_t Val, Mask_t Mask)
      : Addr(Addr), Val(Val), Mask(Mask) {}

  template <typename OpTy> bool match(const OpTy *V) const {
    auto *Store = dyn_cast<VPWidenStoreRecipe>(V);
    if (!Store || Store->isReverse() != Reverse ||
        !Addr.match(Store->getAddr()) || !Val.match(Store->getStoredValue()) ||
        !Store->isMasked() || !Mask.match(Store->getMask()))
      return false;
    return true;
  }
};

/// Match a non-reversed masked store.
template <typename Addr_t, typename Val_t, typename Mask_t>
inline Store_match<Addr_t, Val_t, Mask_t, false>
m_Store(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) {
  return Store_match<Addr_t, Val_t, Mask_t, false>(Addr, Val, Mask);
}

/// Match a reversed masked store.
template <typename Addr_t, typename Val_t, typename Mask_t>
inline Store_match<Addr_t, Val_t, Mask_t, true>
m_ReverseStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) {
  return Store_match<Addr_t, Val_t, Mask_t, true>(Addr, Val, Mask);
}

template <typename Op0_t, typename Op1_t>
using VectorEndPointerRecipe_match =
    Recipe_match<std::tuple<Op0_t, Op1_t>, 0,
                 /*Commutative*/ false, VPVectorEndPointerRecipe>;

template <typename Op0_t, typename Op1_t>
VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0,
                                                       const Op1_t &Op1) {
  return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1);
}

/// Match a call argument at a given argument index.
template <typename Opnd_t> struct Argument_match {
  /// Call argument index to match.
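For illustration only (not part of the patch): a minimal sketch of how these matchers compose. R, Plan, and the bound VPValue locals are hypothetical stand-ins for whatever recipe and plan are in scope at the call site; the matcher names themselves come from the diff above.

```cpp
// Sketch only: R (a VPRecipeBase *) and Plan (a VPlan *) are assumed to be
// available; the locals below are bound by the matchers on success.
VPValue *Addr = nullptr, *EndPtr = nullptr, *Mask = nullptr, *Val = nullptr;

// A forward masked store: binds the address, the stored value and the mask.
if (match(R, m_Store(m_VPValue(Addr), m_VPValue(Val), m_VPValue(Mask)))) {
  // ... use Addr, Val and Mask ...
}

// A reversed masked load whose address is a VPVectorEndPointerRecipe built
// from a base pointer and the plan-wide VF, matched in two steps.
if (match(R, m_ReverseLoad(m_VPValue(EndPtr), m_VPValue(Mask))) &&
    match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
  // ... EndPtr is the end pointer, Addr its underlying base pointer ...
}
```

This mirrors how optimizeMaskToEVL in the second file uses the matchers, with m_RemoveMask in place of the plain mask matcher.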
@@ -2430,90 +2430,98 @@ void VPlanTransforms::addActiveLaneMask(
  HeaderMask->eraseFromParent();
}

template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V))
      return true;
    return false;
  }
};
/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
/// Returns the remaining part \p Out if so, or nullptr otherwise.
template <typename Op0_t, typename Op1_t>
static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
                                                          Op1_t &Out) {
  return RemoveMask_match<Op0_t, Op1_t>(In, Out);
}
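To make the contract concrete, a small sketch (editorial illustration, not part of the patch; V, HeaderMask and Rem are hypothetical values in scope):

```cpp
// Sketch only: V and HeaderMask are assumed VPValue pointers in scope.
VPValue *Rem = nullptr;
if (match(V, m_RemoveMask(HeaderMask, Rem))) {
  // Matched either V == HeaderMask (then Rem == nullptr, i.e. nothing remains
  // once the header mask is stripped) or V == logical-and(HeaderMask, Rem)
  // (then Rem is bound to the remaining mask). Any other V fails to match.
}
```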
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transform.
/// \p TypeInfo VPlan-based type analysis.
/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
/// \p EVL The explicit vector length parameter of vector-predication
/// intrinsics.
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                       VPRecipeBase &CurRecipe,
                                       VPTypeAnalysis &TypeInfo,
                                       VPValue &AllOneMask, VPValue &EVL) {
  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
  // header mask.
  auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
    assert(OrigMask && "Unmasked recipe when folding tail");
    // HeaderMask will be handled using EVL.
    VPValue *Mask;
    if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
      return Mask;
    return HeaderMask == OrigMask ? nullptr : OrigMask;
  };
                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
  VPlan *Plan = CurRecipe.getParent()->getPlan();
  VPValue *Addr, *Mask, *EndPtr;

  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
    if (!EndPtr)
      return Addr;
    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
           "VPVectorEndPointerRecipe with non-VF VF operand?");
    assert(
        all_of(EndPtr->users(),
               [](VPUser *U) {
                 return cast<VPWidenMemoryRecipe>(U)->isReverse();
               }) &&
        "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
    EVLAddr->insertBefore(&CurRecipe);
    EVLAddr->setOperand(1, &EVL);
    return EVLAddr;
  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
    EVLEndPtr->insertBefore(&CurRecipe);
    EVLEndPtr->setOperand(1, &EVL);
    return EVLEndPtr;
  };

  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
        VPValue *NewMask = GetNewMask(L->getMask());
        VPValue *NewAddr = GetNewAddr(L->getAddr());
        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
      })
      .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
        VPValue *NewMask = GetNewMask(S->getMask());
        VPValue *NewAddr = GetNewAddr(S->getAddr());
        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
      })
      .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
        VPValue *NewMask = GetNewMask(IR->getMask());
        return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
      })
      .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
        VPValue *NewMask = GetNewMask(Red->getCondOp());
        return new VPReductionEVLRecipe(*Red, EVL, NewMask);
      })
      .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
        VPValue *LHS, *RHS;
        // Transform select with a header mask condition
        //   select(header_mask, LHS, RHS)
        // into vector predication merge.
        //   vp.merge(all-true, LHS, RHS, EVL)
        if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
                                 m_VPValue(RHS))))
          return nullptr;
        // Use all true as the condition because this transformation is
        // limited to selects whose condition is a header mask.
        return new VPWidenIntrinsicRecipe(
            Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
            TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
      })
      .Default([&](VPRecipeBase *R) { return nullptr; });
  if (match(&CurRecipe,
            m_Load(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
                                    EVL, Mask);

  if (match(&CurRecipe,
            m_ReverseLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))))
    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
                                    AdjustEndPtr(EndPtr), EVL, Mask);
Comment on lines +2480 to +2489

Could you merge this into one if-block? After we have the reverse recipe, I can match it by

I tried this but it ends up not being correct to merge the two if blocks, because we can only match the reverse load when the pointer address is a VPVectorEndPointer, e.g.:

    if (match(&CurRecipe,
              m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask)))) {
      if (cast<VPWidenLoadRecipe>(CurRecipe).isReverse() &&
          match(Addr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
        // Transform VPVectorEndPointer(ptr, VF) to VPVectorEndPointer(ptr, EVL)
        return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
                                        AdjustEndPtr(EndPtr), EVL, Mask);
      }
      // Incorrect if reverse recipe falls through w/ non VPVectorEndPointer address
      return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
                                      EVL, Mask);
    }

After #155579 I think the pattern should look like:

    if (match(&CurRecipe,
              m_Reverse(m_Load(m_VPValue(EndPtr),
                               m_Reverse(m_RemoveMask(HeaderMask, Mask))))) &&
        match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
      auto *Load = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
                                            AdjustEndPtr(EndPtr), EVL, Mask);
      return Builder.createVPReverse(Load, EVL);
    }

because we also need to convert the reverse to a VP reverse.

My first question is: why do we need to additionally check whether the address is a VPVectorEndPointer for reverse accesses? Isn't the whole reason for transforming

    if (match(&CurRecipe,
              m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask)))) {
      if (cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) {
        bool IsVecEndPtr =
            match(Addr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())));
        assert(IsVecEndPtr);
        // Transform VPVectorEndPointer(ptr, VF) to VPVectorEndPointer(ptr, EVL)
        return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
                                        AdjustEndPtr(Addr), EVL, Mask);
      }
      return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
                                      EVL, Mask);
    }

My second question is: if we keep the check on both isReverse and m_VPVectorEndPointer, do we actually have any case where a reverse access has a non-VPVectorEndPointer address?

Exactly, that's why it's not correct to transform a reverse VPWidenLoadRecipe w/ a non-VPVectorEndPointer address to a reverse VPWidenLoadEVLRecipe. Consider the case with an arbitrary address that we don't call AdjustEndPtr on; but with a VPVectorEndPointer that we adjust, it's correct.

Hopefully not, I think this is an invariant as you suggest. I'll try out the assert that you wrote above and see if it triggers. Enjoy your vacation by the way, see you in October :)
  if (match(&CurRecipe, m_Store(m_VPValue(Addr), m_VPValue(),
                                m_RemoveMask(HeaderMask, Mask))))
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
                                     EVL, Mask);

  if (match(&CurRecipe, m_ReverseStore(m_VPValue(EndPtr), m_VPValue(),
                                       m_RemoveMask(HeaderMask, Mask))) &&
      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))))
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
                                     AdjustEndPtr(EndPtr), EVL, Mask);

  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
    if (Rdx->isConditional() &&
        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);

  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
    if (Interleave->getMask() &&
        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);

  VPValue *LHS, *RHS;
  if (match(&CurRecipe,
            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());

  return nullptr;
}
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
  VPTypeAnalysis TypeInfo(Plan);
  VPValue *AllOneMask = Plan.getTrue();
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
@@ -2572,7 +2580,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
        ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
    VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_splice,
        {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
        {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
        TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
    VPSplice->insertBefore(&R);
    R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
@@ -2606,7 +2614,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
  for (VPUser *U : collectUsersRecursively(EVLMask)) {
    auto *CurRecipe = cast<VPRecipeBase>(U);
    VPRecipeBase *EVLRecipe =
        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
    if (!EVLRecipe)
      continue;
Don't rely on isReverse to determine reverse access, since we plan to separate the reverse mask from load/store recipes and remove Reverse from VPWidenMemoryRecipe.

My plan was that we would land this first and rebase #146525 on top of it. #146525 would then remove m_ReverseLoad/m_ReverseStore. This way you wouldn't need to separately handle the reverse addresses here: https://github.com/llvm/llvm-project/pull/146525/files#diff-53267225b83e943ceae51c326c9941e323fd7aaf74a08b5e6998d6456f88d1ddR2628-R2659. Instead you would only need to adjust the reverse pattern in optimizeMaskToEVL:

This approach seems to work for now, but it's not a long-term solution. In the future, a reverse recipe might be sunk, meaning the reverse recipe would no longer directly use the loaded result. For example, consider this simplification rule:

    BinOp(reverse(V1), reverse(V2)) --> reverse(BinOp(V1, V2))

But you gave me a really good idea! This won't happen with reverse stores. The reverse of a stored value will only be eliminated, not sunk. So this method should work for reverse stores, but I don't think it will work for reverse loads.

ef2d027

I see what you mean about the use-def chain for loads. I don't think that will be a problem though if we only do the BinOp(reverse(V1), reverse(V2)) --> reverse(BinOp(V1, V2)) transform after optimizeMaskToEVL. So we'd have something like:

Hmm, my plan is a bit different. I think the order of transformations should be:

This order is based on the idea that reverse access might later be converted into strided access. If the reverse can be removed entirely, then we don't need to convert it into strided access.

Oh I see, I think your plan makes sense. But surely convertToStridedAccess has the same issue with the BinOp(reverse(V1), reverse(V2)) --> reverse(BinOp(V1, V2)) simplification? I think it will also have to peek through the binary op to find the loads too? If we do have to peek through the use-def chain then we can probably share the logic between convertToStridedAccess and optimizeMaskToEVL.