[VPlan] Add support for VPWidenIntOrFpInductionRecipe in predicated D… #115274
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Shih-Po Hung (arcbbb)

Changes: [VPlan] Add support for VPWidenIntOrFpInductionRecipe in predicated DataWithEVL vectorization mode. As an alternative approach to #82021, this patch lowers VPWidenIntOrFpInductionRecipe into a widen phi recipe and step recipes, computed using EVL in the EVL transformation phase.

Patch is 75.85 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/115274.diff

12 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index cd5cf0443541fc..0c4df3a85c0947 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -115,6 +115,10 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
/// Identifies if the vector form of the intrinsic has a scalar operand.
bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
unsigned ScalarOpdIdx) {
+ if (VPIntrinsic::isVPIntrinsic(ID) &&
+ (ScalarOpdIdx == VPIntrinsic::getVectorLengthParamPos(ID)))
+ return true;
+
switch (ID) {
case Intrinsic::abs:
case Intrinsic::ctlz:
@@ -127,6 +131,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
case Intrinsic::umul_fix:
case Intrinsic::umul_fix_sat:
return (ScalarOpdIdx == 2);
+ case Intrinsic::experimental_vp_splat:
+ return (ScalarOpdIdx == 0);
default:
return false;
}
@@ -148,6 +154,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
return OpdIdx == 0;
case Intrinsic::powi:
return OpdIdx == -1 || OpdIdx == 1;
+ case Intrinsic::experimental_vp_splat:
+ return OpdIdx == -1;
default:
return OpdIdx == -1;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 6344bc4664d3b6..3016622cddd226 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1191,6 +1191,16 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
CostKind);
+ case Intrinsic::experimental_vp_splat: {
+ auto LT = getTypeLegalizationCost(RetTy);
+ if (RetTy->getScalarSizeInBits() == 1) {
+ return LT.first *
+ (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
+ LT.second, CostKind));
+ }
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
+ }
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c07af8519049c4..8442479229db3f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2939,8 +2939,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
- if (EnableVPlanNativePath)
- fixNonInductionPHIs(State);
+ fixNonInductionPHIs(State);
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 00ba2f49017899..583925e8d9bbbc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -285,15 +285,15 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
return Shuf;
};
- if (!hasScalarValue(Def, {0})) {
- assert(Def->isLiveIn() && "expected a live-in");
- Value *IRV = Def->getLiveInIRValue();
- Value *B = GetBroadcastInstrs(IRV);
+ Value *ScalarValue = hasScalarValue(Def, {0}) ? get(Def, VPLane(0)) : nullptr;
+ if (!ScalarValue || isa<Constant>(ScalarValue)) {
+ assert((ScalarValue || Def->isLiveIn()) && "expected a live-in");
+ Value *B = ScalarValue ? GetBroadcastInstrs(ScalarValue)
+ : GetBroadcastInstrs(Def->getLiveInIRValue());
set(Def, B);
return B;
}
- Value *ScalarValue = get(Def, VPLane(0));
// If we aren't vectorizing, we can just copy the scalar map values over
// to the vector map.
if (VF.isScalar()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6254ea15191819..0bfb29483282a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -648,7 +648,8 @@ bool VPInstruction::isVectorToScalar() const {
}
bool VPInstruction::isSingleScalar() const {
- return getOpcode() == VPInstruction::ResumePhi;
+ return getOpcode() == VPInstruction::ResumePhi ||
+ getOpcode() == VPInstruction::ExplicitVectorLength;
}
#if !defined(NDEBUG)
@@ -1022,6 +1023,8 @@ bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
// Vector predication intrinsics only demand the the first lane the last
// operand (the EVL operand).
+ if (VectorIntrinsicID == Intrinsic::experimental_vp_splat)
+ return Op == getOperand(0);
return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
Op == getOperand(getNumOperands() - 1);
}
@@ -2309,9 +2312,8 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
Value *VPScalarCastRecipe ::generate(VPTransformState &State) {
- assert(vputils::onlyFirstLaneUsed(this) &&
- "Codegen only implemented for first lane.");
switch (Opcode) {
+ case Instruction::UIToFP:
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::Trunc: {
@@ -3414,9 +3416,6 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenPHIRecipe::execute(VPTransformState &State) {
- assert(EnableVPlanNativePath &&
- "Non-native vplans are not expected to have VPWidenPHIRecipes.");
-
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ea8845eaa75d4d..ecd649b1048991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1523,6 +1523,126 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
+/// This function adds (0 * Step, 1 * Step, 2 * Step, ...) to StartValue of
+/// an induction variable at the preheader.
+static VPSingleDefRecipe *createStepVector(VPValue *StartValue, VPValue *Step,
+ Type *InductionTy,
+ const InductionDescriptor &ID,
+ VPBasicBlock *VectorPHVPBB,
+ DebugLoc DL) {
+ Type *IntTy = InductionTy->isIntegerTy()
+ ? InductionTy
+ : IntegerType::get(InductionTy->getContext(),
+ InductionTy->getScalarSizeInBits());
+ // Create a vector of consecutive numbers from zero to VF.
+ VPSingleDefRecipe *InitVec =
+ new VPWidenIntrinsicRecipe(Intrinsic::stepvector, {}, IntTy, DL);
+ VectorPHVPBB->appendRecipe(InitVec);
+
+ if (InductionTy->isIntegerTy()) {
+ auto *Mul = new VPInstruction(Instruction::Mul, {InitVec, Step}, DL);
+ VectorPHVPBB->appendRecipe(Mul);
+ auto *SteppedStart =
+ new VPInstruction(Instruction::Add, {StartValue, Mul}, {}, "induction");
+ VectorPHVPBB->appendRecipe(SteppedStart);
+ return SteppedStart;
+ } else {
+ FastMathFlags FMF = ID.getInductionBinOp()->getFastMathFlags();
+ InitVec = new VPWidenCastRecipe(Instruction::UIToFP, InitVec, InductionTy);
+ VectorPHVPBB->appendRecipe(InitVec);
+ auto *Mul = new VPInstruction(Instruction::FMul, {InitVec, Step}, FMF, DL);
+ VectorPHVPBB->appendRecipe(Mul);
+ Instruction::BinaryOps BinOp = ID.getInductionOpcode();
+ auto *SteppedStart =
+ new VPInstruction(BinOp, {StartValue, Mul}, FMF, DL, "induction");
+ VectorPHVPBB->appendRecipe(SteppedStart);
+ return SteppedStart;
+ }
+}
+
+/// Lower widen iv recipes into recipes with EVL.
+static void
+transformWidenIVRecipestoEVLRecipes(VPWidenIntOrFpInductionRecipe *WidenIV,
+ VPlan &Plan, VPValue *EVL) {
+ DebugLoc DL = WidenIV->getDebugLoc();
+ const InductionDescriptor &ID = WidenIV->getInductionDescriptor();
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(Plan.getCanonicalIV()->getBackedgeValue());
+ VPBasicBlock *VectorPHVPBB = Plan.getVectorLoopRegion()->getPreheaderVPBB();
+ VPBasicBlock *ExitingVPBB =
+ Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPValue *StartValue = WidenIV->getStartValue();
+ VPValue *Step = WidenIV->getStepValue();
+ if (TruncInst *I = WidenIV->getTruncInst()) {
+ Type *TruncTy = I->getType();
+ auto *R = new VPScalarCastRecipe(Instruction::Trunc, StartValue, TruncTy);
+ VectorPHVPBB->appendRecipe(R);
+ StartValue = R;
+ R = new VPScalarCastRecipe(Instruction::Trunc, Step, TruncTy);
+ VectorPHVPBB->appendRecipe(R);
+ Step = R;
+ }
+ Type *InductionTy = TypeInfo.inferScalarType(StartValue);
+ LLVMContext &Ctx = InductionTy->getContext();
+ VPValue *TrueMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ VPSingleDefRecipe *SteppedStart =
+ createStepVector(StartValue, Step, InductionTy, ID, VectorPHVPBB, DL);
+
+ // Create the vector phi node for both int. and fp. induction variables
+ // and determine the kind of arithmetic we will perform
+ auto *VecInd = new VPWidenPHIRecipe(WidenIV->getPHINode());
+ VecInd->insertBefore(WidenIV);
+ WidenIV->replaceAllUsesWith(VecInd);
+ Intrinsic::ID VPArithOp;
+ Instruction::BinaryOps MulOp;
+ if (InductionTy->isIntegerTy()) {
+ VPArithOp = Intrinsic::vp_add;
+ MulOp = Instruction::Mul;
+ } else {
+ VPArithOp = ID.getInductionOpcode() == Instruction::FAdd
+ ? Intrinsic::vp_fadd
+ : Intrinsic::vp_fsub;
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the runtime VF by the step
+ VPSingleDefRecipe *ScalarMul;
+ if (InductionTy->isFloatingPointTy()) {
+ FastMathFlags FMF = ID.getInductionBinOp()->getFastMathFlags();
+ auto *CastEVL =
+ new VPScalarCastRecipe(Instruction::UIToFP, EVL, InductionTy);
+ CastEVL->insertBefore(CanonicalIVIncrement);
+ ScalarMul = new VPInstruction(MulOp, {Step, CastEVL}, FMF, DL);
+ } else {
+ unsigned InductionSz = InductionTy->getScalarSizeInBits();
+ unsigned EVLSz = TypeInfo.inferScalarType(EVL)->getScalarSizeInBits();
+ VPValue *CastEVL = EVL;
+ if (InductionSz != EVLSz) {
+ auto *R = new VPScalarCastRecipe(EVLSz > InductionSz ? Instruction::Trunc
+ : Instruction::ZExt,
+ EVL, InductionTy);
+ R->insertBefore(CanonicalIVIncrement);
+ CastEVL = R;
+ }
+ ScalarMul = new VPInstruction(MulOp, {Step, CastEVL}, DL);
+ }
+ ScalarMul->insertBefore(CanonicalIVIncrement);
+ // Create a vector splat to use in the induction update.
+ auto *SplatVF =
+ new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splat,
+ {ScalarMul, TrueMask, EVL}, InductionTy, DL);
+ SplatVF->insertBefore(CanonicalIVIncrement);
+ // TODO: We may need to add the step a number of times if UF > 1
+ auto *LastInduction = new VPWidenIntrinsicRecipe(
+ VPArithOp, {VecInd, SplatVF, TrueMask, EVL}, InductionTy, DL);
+ LastInduction->insertBefore(CanonicalIVIncrement);
+ VecInd->addIncoming(SteppedStart, VectorPHVPBB);
+ VecInd->addIncoming(LastInduction, ExitingVPBB);
+}
+
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
@@ -1569,8 +1689,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
// of the VF directly. At the moment, widened inductions cannot be updated, so
// bail out if the plan contains any.
bool ContainsWidenInductions = any_of(Header->phis(), [](VPRecipeBase &Phi) {
- return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
- &Phi);
+ return isa<VPWidenPointerInductionRecipe>(&Phi);
});
if (ContainsWidenInductions)
return false;
@@ -1615,6 +1734,16 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
transformRecipestoEVLRecipes(Plan, *VPEVL);
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ SmallVector<VPRecipeBase *> ToRemove;
+ for (VPRecipeBase &Phi : HeaderVPBB->phis())
+ if (auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi)) {
+ transformWidenIVRecipestoEVLRecipes(WidenIV, Plan, VPEVL);
+ ToRemove.push_back(WidenIV);
+ }
+ for (VPRecipeBase *R : ToRemove)
+ R->eraseFromParent();
+
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 8bdb3133243582..9d64f5c03274e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -156,7 +156,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPScalarCastRecipe>(
[&](const VPScalarCastRecipe *S) { return true; })
.Case<VPInstruction>([&](const VPInstruction *I) {
- if (I->getOpcode() != Instruction::Add) {
+ if ((I->getOpcode() != Instruction::Add) &&
+ (I->getOpcode() != Instruction::Mul)) {
errs()
<< "EVL is used as an operand in non-VPInstruction::Add\n";
return false;
@@ -166,11 +167,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
"users\n";
return false;
}
- if (!isa<VPEVLBasedIVPHIRecipe>(*I->users().begin())) {
- errs() << "Result of VPInstruction::Add with EVL operand is "
- "not used by VPEVLBasedIVPHIRecipe\n";
- return false;
- }
return true;
})
.Default([&](const VPUser *U) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index e40f51fd7bd705..27e8bb618803e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -8,14 +8,55 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
; CHECK-LABEL: define void @test_wide_integer_induction(
; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i64> [[TMP9]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_PHI]], ptr align 8 [[TMP14]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 [[TMP16]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP17]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT: store i64 [[IV1]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
@@ -34,6 +75,86 @@ for.cond.cleanup:
ret void
}
+define void @test_wide_fp_induction(ptr noalias %a, i64 %N) {
+; CHECK-LABEL: define void @test_wide_fp_induction(
+; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP...
[truncated]
case Intrinsic::experimental_vp_splat: {
  auto LT = getTypeLegalizationCost(RetTy);
  if (RetTy->getScalarSizeInBits() == 1) {
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                        LT.second, CostKind));
  }
  return LT.first *
         getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
}
These changes are included to prevent invalid costs in the tests, and will be moved to a separate backend patch.
  // Fix widened non-induction PHIs by setting up the PHI operands.
- if (EnableVPlanNativePath)
-   fixNonInductionPHIs(State);
+ fixNonInductionPHIs(State);
This patch lowers VPWidenIntOrFpInductionRecipe into VPWidenPhiRecipe, so it removes the restriction that VPWidenPhiRecipe is only supported in the VPlan native path.
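As a rough sketch (based on the VPlan dumps quoted further down in this thread; the WIDEN-PHI line and operand names are my paraphrase, not an exact print), the lowered plan looks something like:

vector.ph:
  WIDEN-INTRINSIC vp<%3> = call llvm.stepvector()
  ...                        (multiply by the step, add the start value)
<x1> vector loop: {
  vector.body:
    EMIT vp<%5> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
    WIDEN-PHI ir<%iv> = phi vp<%induction>, vp<%vec.ind.next>
    ...
}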
if (VectorIntrinsicID == Intrinsic::experimental_vp_splat)
  return Op == getOperand(0);
return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
       Op == getOperand(getNumOperands() - 1);
I think the condition Op == getOperand(getNumOperands() - 1) only checks for the EVL operand, but for vp.splat the first operand is also a scalar use.
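For reference, the operand layout of the intrinsic, as it appears in the tests in this patch, is roughly:

declare <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 %scalar, <vscale x 2 x i1> %mask, i32 %evl)

That is, operand 0 (the scalar being splatted) and the trailing EVL operand are both scalar uses; only the mask is a vector.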
So where will you need this? IIUC, if this change is only for VPWidenIntrinsicRecipe::execute, the change in isVectorIntrinsicWithScalarOpAtArg should be good enough:

if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
  Arg = State.get(I.value(), VPLane(0));

Is there anything else I'm missing that needs this change?
llvm/lib/Analysis/VectorUtils.cpp (Outdated)
if (VPIntrinsic::isVPIntrinsic(ID) &&
    (ScalarOpdIdx == VPIntrinsic::getVectorLengthParamPos(ID)))
  return true;
Is this necessary?
Otherwise it falls through to the default case, which returns false, but the vector length is a scalar operand.
@@ -156,7 +156,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
       .Case<VPScalarCastRecipe>(
           [&](const VPScalarCastRecipe *S) { return true; })
       .Case<VPInstruction>([&](const VPInstruction *I) {
-        if (I->getOpcode() != Instruction::Add) {
+        if ((I->getOpcode() != Instruction::Add) &&
+            (I->getOpcode() != Instruction::Mul)) {
           errs()
               << "EVL is used as an operand in non-VPInstruction::Add\n";
Needs an update?
The step increment mul(step, evl) is added to the plan, so we need to relax the constraint.
There is a misunderstanding: I meant updating the error message.
errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
This is split off from llvm#115274. There doesn't seem to be an easy way to share this with getShuffleCost since that requires passing in a real insert_element operand to get it to recognise it's a scalar splat. There are no tests for i1 vectors, since we can't lower vp splats of them yet and currently crash. Co-authored-by: Shih-Po Hung <[email protected]>
This is split off from #115274. There doesn't seem to be an easy way to share this with getShuffleCost since that requires passing in a real insert_element operand to get it to recognise it's a scalar splat. i1 vectors can't currently be lowered, so an invalid cost is returned for them. --------- Co-authored-by: Shih-Po Hung <[email protected]>
[VPlan] Add support for VPWidenIntOrFpInductionRecipe in predicated DataWithEVL vectorization mode. As an alternative approach to llvm#82021, this patch lowers VPWidenIntOrFpInductionRecipe into a widen phi recipe and step recipes, computed using EVL in the EVL transformation phase.
Force-pushed from 798c165 to ec472b9 (compare).
; CHECK-NEXT: Live-in ir<%N> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: WIDEN-INTRINSIC vp<%3> = callllvm.stepvector()
callllvm.stepvector() ---> call llvm.stepvector()
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<%5> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
Please use a pattern to match the numbers here.
; CHECK-NEXT: Live-in ir<%N> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: WIDEN-INTRINSIC vp<%3> = callllvm.stepvector()
ditto
[VPlan] Add support for VPWidenIntOrFpInductionRecipe in predicated DataWithEVL vectorization mode.
As an alternative approach to #82021, this patch lowers VPWidenIntOrFpInductionRecipe into a widen phi recipe and step recipes, computed using EVL in the EVL transformation phase.
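To make the lowering concrete, here is a rough LLVM IR sketch of the per-iteration induction update it produces, distilled from the evl-compatible-loops.ll test above (value names are placeholders, and the step is assumed to be 1):

%evl          = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
%evl.zext     = zext i32 %evl to i64
%inc          = mul i64 1, %evl.zext                 ; step * evl
%inc.splat    = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 %inc, <vscale x 2 x i1> %alltrue, i32 %evl)
%vec.ind.next = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> %vec.ind, <vscale x 2 x i64> %inc.splat, <vscale x 2 x i1> %alltrue, i32 %evl)

Each lane of the widened IV thus advances by exactly the number of elements processed in that iteration, rather than by a fixed VF.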