59 changes: 49 additions & 10 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6939,6 +6939,29 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
cast<VPRecipeWithIRFlags>(R).getPredicate() !=
cast<CmpInst>(UI)->getPredicate())
return true;

if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
bool IsReverse = CostCtx.CM.getWideningDecision(UI, VF) ==
LoopVectorizationCostModel::CM_Widen_Reverse;
Comment on lines +6944 to +6945

Mel-Chen (Contributor Author):
@fhahn I think this is the most straightforward approach.
Currently we could check whether the operand or user is a reverse operation, but reverse operations might be simplified away in the future, so relying on them is not a long-term solution.
The last option is to use the address for the check, since only reverse operations need to use VPVectorEndPointer.
What do you think?
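
For reference, a minimal sketch of that address-based variant, assuming the m_VectorEndPointer matcher added to VPlanPatternMatch.h in this patch (the placement and surrounding names are illustrative, not the final implementation):

```cpp
// Hypothetical sketch: classify the access as reverse from its address,
// since only reverse accesses compute it via VPVectorEndPointerRecipe.
using namespace VPlanPatternMatch;
bool IsReverse =
    match(MemR->getAddr(), m_VectorEndPointer(m_VPValue(), m_VPValue()));
```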

if (IsReverse) {
// The legacy model has not computed the cost of the reverse mask.
if (CostCtx.CM.Legal->isMaskRequired(UI))
return true;
Comment on lines +6947 to +6949

Mel-Chen (Contributor Author), Aug 27, 2025:
@fhahn This could lead to overestimating the cost.
Multiple VPWidenMemoryRecipes may be able to share a single reverse mask, but if each VPWidenMemoryRecipe accounts for the cost of that reverse mask separately, the total cost is overestimated.
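
A minimal sketch of the sharing concern, assuming a hypothetical costing loop (VPBB, Cost, and ReverseMaskCost are illustrative names, not existing code):

```cpp
// Hypothetical sketch: several VPWidenMemoryRecipes can reuse one reversed
// mask, so charge the reverse-shuffle cost once per distinct mask value.
SmallPtrSet<VPValue *, 4> CostedMasks;
for (VPRecipeBase &R : *VPBB) {
  auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
  if (!MemR || !MemR->getMask())
    continue;
  if (CostedMasks.insert(MemR->getMask()).second)
    Cost += ReverseMaskCost; // paid once, not per memory recipe
}
```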


// If the stored value of a reverse store is invariant, LICM will
// hoist the reverse operation to the preheader. In this case, the
// result of the VPlan-based cost model will diverge from that of
// the legacy model.
if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;

if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;
}
}

SeenInstrs.insert(UI);
}
}
@@ -7608,9 +7631,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");

@@ -7667,14 +7690,30 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
Reverse, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
if (Reverse && Mask)
Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());

if (auto *Load = dyn_cast<LoadInst>(I)) {
auto *LoadR =
new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive,
VPIRMetadata(*Load, LVer), Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, {LoadR},
LoadR->getDebugLoc());
}
return LoadR;
}

auto *Store = cast<StoreInst>(I);
VPValue *StoredVal = Operands[0];
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
VPIRMetadata(*Store, LVer),
Store->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
5 changes: 2 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -96,9 +96,8 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p I should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range);
VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range);

/// Check if an induction recipe should be constructed for \p Phi. If so build
/// and return it. If not, return null.
35 changes: 13 additions & 22 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1021,6 +1021,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// It produces the lane index across all unrolled iterations. Unrolling will
// add all copies of its original operand as additional operands.
FirstActiveLane,
// Returns a vector with the elements of the operand in reverse order.
Reverse,

// The opcodes below are used for VPInstructionWithType.
//
@@ -3012,9 +3014,6 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Whether the accessed addresses are consecutive.
bool Consecutive;

/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;

/// Whether the memory access is masked.
bool IsMasked = false;

@@ -3028,12 +3027,10 @@

VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
bool Consecutive, const VPIRMetadata &Metadata,
DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
Consecutive(Consecutive) {}

public:
VPWidenMemoryRecipe *clone() override {
@@ -3055,10 +3052,6 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Return whether the loaded-from / stored-to addresses are consecutive.
bool isConsecutive() const { return Consecutive; }

/// Return whether the consecutive loaded/stored addresses are in reverse
/// order.
bool isReverse() const { return Reverse; }

/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }

@@ -3089,18 +3082,16 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
bool Consecutive, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
Reverse, Metadata, DL),
Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}

VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
getMask(), Consecutive, Reverse, *this,
getDebugLoc());
getMask(), Consecutive, *this, getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3131,7 +3122,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
{Addr, &EVL}, L.isConsecutive(), L,
L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
@@ -3169,17 +3160,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
VPValue *Mask, bool Consecutive,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
Consecutive, Reverse, Metadata, DL) {
Consecutive, Metadata, DL) {
setMask(Mask);
}

VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
Reverse, *this, getDebugLoc());
*this, getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3214,7 +3205,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
S.isReverse(), S, S.getDebugLoc()) {
S, S.getDebugLoc()) {
setMask(Mask);
}

1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -129,6 +129,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
13 changes: 12 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -245,7 +245,8 @@ struct Recipe_match {
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
std::is_same<RecipeTy, VPWidenGEPRecipe>::value ||
std::is_same<RecipeTy, VPVectorEndPointerRecipe>::value)
return DefR;
else
return DefR && DefR->getOpcode() == Opcode;
@@ -550,6 +551,16 @@ m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
return m_Select(Op0, m_True(), Op1);
}

template <typename Op0_t, typename Op1_t>
using VPVectorEndPointer_match =
Recipe_match<std::tuple<Op0_t, Op1_t>, 0, false, VPVectorEndPointerRecipe>;

template <typename Op0_t, typename Op1_t>
inline VPVectorEndPointer_match<Op0_t, Op1_t>
m_VectorEndPointer(const Op0_t &Op0, const Op1_t &Op1) {
return VPVectorEndPointer_match<Op0_t, Op1_t>({Op0, Op1});
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0,
false, VPScalarIVStepsRecipe>;