Skip to content

[VP][EVL] Support select instruction with EVL-vectorization #109614

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 65 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenEVLSC:
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPWidenSelectEVLSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
Expand Down Expand Up @@ -1712,10 +1713,17 @@ class VPHistogramRecipe : public VPRecipeBase {

/// A recipe for widening select instructions.
struct VPWidenSelectRecipe : public VPSingleDefRecipe {

protected:
template <typename IterT>
VPWidenSelectRecipe(unsigned VPDefOpcode, SelectInst &I,
iterator_range<IterT> Operands)
: VPSingleDefRecipe(VPDefOpcode, Operands, &I, I.getDebugLoc()) {}

public:
template <typename IterT>
VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
: VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
I.getDebugLoc()) {}
: VPWidenSelectRecipe(VPDef::VPWidenSelectSC, I, Operands) {}

~VPWidenSelectRecipe() override = default;

Expand All @@ -1724,7 +1732,15 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
operands());
}

VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSelectEVLSC;
}

static inline bool classof(const VPUser *U) {
auto *R = dyn_cast<VPRecipeBase>(U);
return R && classof(R);
}

/// Produce a widened version of the select instruction.
void execute(VPTransformState &State) override;
Expand All @@ -1744,6 +1760,52 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
}
};

// A recipe for widening select instruction with vector-predication intrinsics
// with explicit vector length (EVL).
struct VPWidenSelectEVLRecipe : public VPWidenSelectRecipe {

template <typename IterT>
VPWidenSelectEVLRecipe(SelectInst &I, iterator_range<IterT> Operands,
VPValue &EVL)
: VPWidenSelectRecipe(VPDef::VPWidenSelectEVLSC, I, Operands) {
addOperand(&EVL);
}

VPWidenSelectEVLRecipe(VPWidenSelectRecipe &W, VPValue &EVL)
: VPWidenSelectEVLRecipe(*cast<SelectInst>(W.getUnderlyingInstr()),
W.operands(), EVL) {}

~VPWidenSelectEVLRecipe() override = default;

VPWidenSelectEVLRecipe *clone() final {
llvm_unreachable("VPWidenSelectEVLRecipe cannot be cloned");
return nullptr;
}

VP_CLASSOF_IMPL(VPDef::VPWidenSelectEVLSC)

VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }

/// Produce a vp-intrinsic version of the select instruction.
void execute(VPTransformState &State) final;

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// EVL in that recipe is always the last operand, thus any use before means
// the VPValue should be vectorized.
return getEVL() == Op;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const final;
#endif
};

/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
bool isPointerLoopInvariant() const {
Expand Down
49 changes: 46 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenPHISC:
case VPWidenSC:
case VPWidenEVLSC:
case VPWidenSelectSC: {
case VPWidenSelectSC:
case VPWidenSelectEVLSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
Expand Down Expand Up @@ -136,7 +137,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenPHISC:
case VPWidenSC:
case VPWidenEVLSC:
case VPWidenSelectSC: {
case VPWidenSelectSC:
case VPWidenSelectEVLSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
Expand Down Expand Up @@ -173,7 +175,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPointerInductionSC:
case VPWidenSC:
case VPWidenEVLSC:
case VPWidenSelectSC: {
case VPWidenSelectSC:
case VPWidenSelectEVLSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
Expand Down Expand Up @@ -1127,6 +1130,21 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(2)->printAsOperand(O, SlotTracker);
O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}

void VPWidenSelectEVLRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
printAsOperand(O, SlotTracker);
O << " = vp.select ";
getOperand(0)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(1)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(3)->printAsOperand(O, SlotTracker);
O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
Expand All @@ -1147,6 +1165,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

void VPWidenSelectEVLRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
Value *EVLArg = State.get(getEVL(), /*NeedsScalar=*/true);
IRBuilderBase &BuilderIR = State.Builder;
VectorBuilder Builder(BuilderIR);
Builder.setEVL(EVLArg);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
auto *InvarCond =
isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;

Value *Cond = InvarCond ? InvarCond : State.get(getCond(), 0);
assert(isa<VectorType>(Cond->getType()) && "CondType must be vector Type.");

Value *Op0 = State.get(getOperand(1), 0);
Value *Op1 = State.get(getOperand(2), 0);
Value *VPInst = Builder.createVectorInstruction(
Instruction::Select, Op0->getType(), {Cond, Op0, Op1}, "vp.select");
State.set(this, VPInst, 0);
State.addMetadata(VPInst,
dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
const FastMathFlags &FMF) {
AllowReassoc = FMF.allowReassoc();
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1379,6 +1379,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return nullptr;
return new VPWidenEVLRecipe(*W, EVL);
})
.Case<VPWidenSelectRecipe>(
[&](VPWidenSelectRecipe *W) -> VPRecipeBase * {
return new VPWidenSelectEVLRecipe(*W, EVL);
})
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ class VPDef {
VPWidenSC,
VPWidenEVLSC,
VPWidenSelectSC,
VPWidenSelectEVLSC,
VPBlendSC,
VPHistogramSC,
// START: Phi-like recipes. Need to be kept together.
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
return VerifyEVLUse(
*W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2);
})
.Case<VPWidenSelectEVLRecipe>(
[&](const VPWidenSelectEVLRecipe *S) {
return VerifyEVLUse(*S, 3);
})
.Case<VPReductionEVLRecipe>([&](const VPReductionEVLRecipe *R) {
return VerifyEVLUse(*R, 2);
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
; REQUIRES: asserts

; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -force-tail-folding-style=data-with-evl \
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s

define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
; IF-EVL-NEXT: Live-in ir<%N> = original trip-count

; IF-EVL: vector.ph:
; IF-EVL-NEXT: Successor(s): vector loop

; IF-EVL: <x1> vector loop: {
; IF-EVL-NEXT: vector.body:
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN-SELECT ir<[[SELECT:%.+]]> = vp.select ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add ir<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
; IF-EVL-NEXT: No successors
; IF-EVL-NEXT: }

entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
%1 = load i32, ptr %arrayidx3, align 4
%cmp4 = icmp sgt i32 %0, %1
%2 = sub i32 0, %1
%cond.p = select i1 %cmp4, i32 %1, i32 %2
%cond = add i32 %cond.p, %0
%arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
store i32 %cond, ptr %arrayidx15, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond.not, label %exit, label %for.body

exit:
ret void
}
Loading