diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 989090b80e1c8..4521dd0c9efaf 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -132,6 +132,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::umul_fix:
   case Intrinsic::umul_fix_sat:
     return (ScalarOpdIdx == 2);
+  case Intrinsic::experimental_vp_splat:
+    return (ScalarOpdIdx == 0);
   default:
     return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 90312c1a28df3..6c5438521e413 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2932,8 +2932,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
-  if (EnableVPlanNativePath)
-    fixNonInductionPHIs(State);
+  fixNonInductionPHIs(State);

   // Forget the original basic block.
   PSE.getSE()->forgetLoop(OrigLoop);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 529108a5aaa97..a6ab1e02e2e3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -286,15 +286,15 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
     return Shuf;
   };

-  if (!hasScalarValue(Def, {0})) {
-    assert(Def->isLiveIn() && "expected a live-in");
-    Value *IRV = Def->getLiveInIRValue();
-    Value *B = GetBroadcastInstrs(IRV);
+  Value *ScalarValue = hasScalarValue(Def, {0}) ? get(Def, VPLane(0)) : nullptr;
+  if (!ScalarValue || isa<Instruction>(ScalarValue)) {
+    assert((ScalarValue || Def->isLiveIn()) && "expected a live-in");
+    Value *B = ScalarValue ? GetBroadcastInstrs(ScalarValue)
+                           : GetBroadcastInstrs(Def->getLiveInIRValue());
     set(Def, B);
     return B;
   }

-  Value *ScalarValue = get(Def, VPLane(0));
   // If we aren't vectorizing, we can just copy the scalar map values over
   // to the vector map.
   if (VF.isScalar()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8a44b5b176c46..bf1e7ca0bb87d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -648,7 +648,8 @@ bool VPInstruction::isVectorToScalar() const {
 }

 bool VPInstruction::isSingleScalar() const {
-  return getOpcode() == VPInstruction::ResumePhi;
+  return getOpcode() == VPInstruction::ResumePhi ||
+         getOpcode() == VPInstruction::ExplicitVectorLength;
 }

 #if !defined(NDEBUG)
@@ -1034,6 +1035,8 @@ bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   // Vector predication intrinsics only demand the first lane of the last
   // operand (the EVL operand).
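+  // vp.splat is special-cased: it only demands the first lane of its scalar
+  // value operand (0) and of its EVL operand (2); its mask operand (1) is
+  // used as a full vector.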
+ if (VectorIntrinsicID == Intrinsic::experimental_vp_splat) + return Op == getOperand(0) || Op == getOperand(2); return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && Op == getOperand(getNumOperands() - 1); } @@ -2317,9 +2320,8 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, #endif Value *VPScalarCastRecipe ::generate(VPTransformState &State) { - assert(vputils::onlyFirstLaneUsed(this) && - "Codegen only implemented for first lane."); switch (Opcode) { + case Instruction::UIToFP: case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: { @@ -3425,9 +3427,6 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenPHIRecipe::execute(VPTransformState &State) { - assert(EnableVPlanNativePath && - "Non-native vplans are not expected to have VPWidenPHIRecipes."); - Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1b333bdc30ff1..b1cc3c8a7e2b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1547,6 +1547,126 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } } +/// This function adds (0 * Step, 1 * Step, 2 * Step, ...) to StartValue of +/// an induction variable at the preheader. +static VPSingleDefRecipe *createStepVector(VPValue *StartValue, VPValue *Step, + Type *InductionTy, + const InductionDescriptor &ID, + VPBasicBlock *VectorPHVPBB, + DebugLoc DL) { + Type *IntTy = InductionTy->isIntegerTy() + ? InductionTy + : IntegerType::get(InductionTy->getContext(), + InductionTy->getScalarSizeInBits()); + // Create a vector of consecutive numbers from zero to VF. + VPSingleDefRecipe *InitVec = + new VPWidenIntrinsicRecipe(Intrinsic::stepvector, {}, IntTy, DL); + VectorPHVPBB->appendRecipe(InitVec); + + if (InductionTy->isIntegerTy()) { + auto *Mul = new VPInstruction(Instruction::Mul, {InitVec, Step}, DL); + VectorPHVPBB->appendRecipe(Mul); + auto *SteppedStart = + new VPInstruction(Instruction::Add, {StartValue, Mul}, {}, "induction"); + VectorPHVPBB->appendRecipe(SteppedStart); + return SteppedStart; + } else { + FastMathFlags FMF = ID.getInductionBinOp()->getFastMathFlags(); + InitVec = new VPWidenCastRecipe(Instruction::UIToFP, InitVec, InductionTy); + VectorPHVPBB->appendRecipe(InitVec); + auto *Mul = new VPInstruction(Instruction::FMul, {InitVec, Step}, FMF, DL); + VectorPHVPBB->appendRecipe(Mul); + Instruction::BinaryOps BinOp = ID.getInductionOpcode(); + auto *SteppedStart = + new VPInstruction(BinOp, {StartValue, Mul}, FMF, DL, "induction"); + VectorPHVPBB->appendRecipe(SteppedStart); + return SteppedStart; + } +} + +/// Lower widen iv recipes into recipes with EVL. 
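+/// Instead of stepping the vector IV by VF * Step each iteration, the update
+/// is expressed with vp intrinsics: the scalar increment Step * EVL is
+/// broadcast via llvm.experimental.vp.splat and applied with llvm.vp.add
+/// (or llvm.vp.fadd/llvm.vp.fsub for floating-point inductions).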
+static void
+transformWidenIVRecipestoEVLRecipes(VPWidenIntOrFpInductionRecipe *WidenIV,
+                                    VPlan &Plan, VPValue *EVL) {
+  DebugLoc DL = WidenIV->getDebugLoc();
+  const InductionDescriptor &ID = WidenIV->getInductionDescriptor();
+  auto *CanonicalIVIncrement =
+      cast<VPInstruction>(Plan.getCanonicalIV()->getBackedgeValue());
+  VPBasicBlock *VectorPHVPBB = Plan.getVectorLoopRegion()->getPreheaderVPBB();
+  VPBasicBlock *ExitingVPBB =
+      Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+  VPValue *StartValue = WidenIV->getStartValue();
+  VPValue *Step = WidenIV->getStepValue();
+  if (TruncInst *I = WidenIV->getTruncInst()) {
+    Type *TruncTy = I->getType();
+    auto *R = new VPScalarCastRecipe(Instruction::Trunc, StartValue, TruncTy);
+    VectorPHVPBB->appendRecipe(R);
+    StartValue = R;
+    R = new VPScalarCastRecipe(Instruction::Trunc, Step, TruncTy);
+    VectorPHVPBB->appendRecipe(R);
+    Step = R;
+  }
+  Type *InductionTy = TypeInfo.inferScalarType(StartValue);
+  LLVMContext &Ctx = InductionTy->getContext();
+  VPValue *TrueMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+
+  // Construct the initial value of the vector IV in the vector loop preheader.
+  VPSingleDefRecipe *SteppedStart =
+      createStepVector(StartValue, Step, InductionTy, ID, VectorPHVPBB, DL);
+
+  // Create the vector phi node for both int. and fp. induction variables
+  // and determine the kind of arithmetic we will perform.
+  auto *VecInd = new VPWidenPHIRecipe(WidenIV->getPHINode());
+  VecInd->insertBefore(WidenIV);
+  WidenIV->replaceAllUsesWith(VecInd);
+  Intrinsic::ID VPArithOp;
+  Instruction::BinaryOps MulOp;
+  if (InductionTy->isIntegerTy()) {
+    VPArithOp = Intrinsic::vp_add;
+    MulOp = Instruction::Mul;
+  } else {
+    VPArithOp = ID.getInductionOpcode() == Instruction::FAdd
+                    ? Intrinsic::vp_fadd
+                    : Intrinsic::vp_fsub;
+    MulOp = Instruction::FMul;
+  }
+
+  // Multiply the step by the runtime EVL, i.e. the number of elements
+  // processed in this iteration.
+  VPSingleDefRecipe *ScalarMul;
+  if (InductionTy->isFloatingPointTy()) {
+    FastMathFlags FMF = ID.getInductionBinOp()->getFastMathFlags();
+    auto *CastEVL =
+        new VPScalarCastRecipe(Instruction::UIToFP, EVL, InductionTy);
+    CastEVL->insertBefore(CanonicalIVIncrement);
+    ScalarMul = new VPInstruction(MulOp, {Step, CastEVL}, FMF, DL);
+  } else {
+    unsigned InductionSz = InductionTy->getScalarSizeInBits();
+    unsigned EVLSz = TypeInfo.inferScalarType(EVL)->getScalarSizeInBits();
+    VPValue *CastEVL = EVL;
+    if (InductionSz != EVLSz) {
+      auto *R = new VPScalarCastRecipe(EVLSz > InductionSz ? Instruction::Trunc
+                                                           : Instruction::ZExt,
+                                       EVL, InductionTy);
+      R->insertBefore(CanonicalIVIncrement);
+      CastEVL = R;
+    }
+    ScalarMul = new VPInstruction(MulOp, {Step, CastEVL}, DL);
+  }
+  ScalarMul->insertBefore(CanonicalIVIncrement);
+  // Create a vector splat to use in the induction update.
+  auto *SplatVF =
+      new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splat,
+                                 {ScalarMul, TrueMask, EVL}, InductionTy, DL);
+  SplatVF->insertBefore(CanonicalIVIncrement);
+  // TODO: We may need to add the step a number of times if UF > 1.
+  auto *LastInduction = new VPWidenIntrinsicRecipe(
+      VPArithOp, {VecInd, SplatVF, TrueMask, EVL}, InductionTy, DL);
+  LastInduction->insertBefore(CanonicalIVIncrement);
+  VecInd->addIncoming(SteppedStart, VectorPHVPBB);
+  VecInd->addIncoming(LastInduction, ExitingVPBB);
+}
+
 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
 /// replaces all uses except the canonical IV increment of
 /// VPCanonicalIVPHIRecipe with a
VPCanonicalIVPHIRecipe @@ -1592,9 +1712,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength( // The transform updates all users of inductions to work based on EVL, instead // of the VF directly. At the moment, widened inductions cannot be updated, so // bail out if the plan contains any. - bool ContainsWidenInductions = any_of( - Header->phis(), - IsaPred); + bool ContainsWidenInductions = + any_of(Header->phis(), IsaPred); if (ContainsWidenInductions) return false; @@ -1638,6 +1757,16 @@ bool VPlanTransforms::tryAddExplicitVectorLength( transformRecipestoEVLRecipes(Plan, *VPEVL); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + SmallVector ToRemove; + for (VPRecipeBase &Phi : HeaderVPBB->phis()) + if (auto *WidenIV = dyn_cast(&Phi)) { + transformWidenIVRecipestoEVLRecipes(WidenIV, Plan, VPEVL); + ToRemove.push_back(WidenIV); + } + for (VPRecipeBase *R : ToRemove) + R->eraseFromParent(); + // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. CanonicalIVPHI->replaceAllUsesWith(EVLPhi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 71c7d547ac7d9..ef9be79e88a5e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -150,7 +150,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case( [&](const VPScalarCastRecipe *S) { return VerifyEVLUse(*S, 0); }) .Case([&](const VPInstruction *I) { - if (I->getOpcode() != Instruction::Add) { + if ((I->getOpcode() != Instruction::Add) && + (I->getOpcode() != Instruction::Mul)) { errs() << "EVL is used as an operand in non-VPInstruction::Add\n"; return false; } @@ -159,11 +160,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { "users\n"; return false; } - if (!isa(*I->users().begin())) { - errs() << "Result of VPInstruction::Add with EVL operand is " - "not used by VPEVLBasedIVPHIRecipe\n"; - return false; - } return true; }) .Default([&](const VPUser *U) { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index e40f51fd7bd70..27e8bb618803e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -8,14 +8,55 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-LABEL: define void @test_wide_integer_induction( ; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( 
insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ [[INDUCTION]], [[ENTRY]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VEC_PHI]], ptr align 8 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP20]] +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.experimental.vp.splat.nxv2i64(i64 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP17]] = call @llvm.vp.add.nxv2i64( [[VEC_PHI]], [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] +; CHECK-NEXT: store i64 [[IV1]], ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -34,6 +75,86 @@ for.cond.cleanup: ret void } +define void @test_wide_fp_induction(ptr noalias %a, i64 %N) { +; CHECK-LABEL: define void @test_wide_fp_induction( +; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: 
[[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast float -3.000000e+00, [[DOTCAST]] +; CHECK-NEXT: [[IND_END:%.*]] = fsub fast float 0.000000e+00, [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP11:%.*]] = uitofp [[TMP10]] to +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast [[TMP11]], shufflevector ( insertelement ( poison, float -3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = fsub fast zeroinitializer, [[TMP12]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VEC_PHI]], ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = uitofp i32 [[TMP13]] to float +; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float -3.000000e+00, [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.experimental.vp.splat.nxv4f32(float [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP21]] = call @llvm.vp.fsub.nxv4f32( [[VEC_PHI]], [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: 
[[F:%.*]] = phi float [ [[F_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store float [[F]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[F_NEXT]] = fsub fast float [[F]], -3.000000e+00 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %f = phi float [ %f.next, %for.body ], [ 0.000000e+00, %entry ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + store float %f, ptr %arrayidx, align 4 + %f.next = fsub fast float %f, -3.000000e+00 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + ; Make sure we do not vectorize a loop with a widened ptr induction. define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) { ; CHECK-LABEL: define void @test_wide_ptr_induction( @@ -68,3 +189,11 @@ for.body: for.cond.cleanup: ret void } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll index f404b89492886..8573a53e3b351 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll @@ -4,12 +4,12 @@ ; REQUIRES: asserts -; Make sure we do not vectorize a loop with a widened int induction. +; Now we vectorize loops with widened ivs. 
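+; With EVL tail folding, the widened IV becomes a vector phi advanced by
+; vp.splat(step * evl) each iteration, so a vector cost is computed and a
+; vector.body is emitted.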
define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { -; CHECK-NOT: LV: Vector loop of width {{.+}} costs: +; CHECK: LV: Vector loop of width {{.+}} costs: ; ; CHECK: define void @test_wide_integer_induction( -; CHECK-NOT: vector.body +; CHECK: vector.body ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 66140dee01e52..afa5cdf7eb985 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -550,41 +550,135 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add( ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI1:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV1]], 0 +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI1]] +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call 
@llvm.vp.select.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], zeroinitializer, i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP17]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP18]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP19]], [[EVL_BASED_IV1]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call @llvm.experimental.vp.splat.nxv4i32(i32 [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP22]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI1]], [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP18]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 ; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP37]], [[IV_TRUNC]] ; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP37]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] -; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 -; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; 
IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add( ; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { ; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] -; IF-EVL-INLOOP: for.body: -; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI1:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]] +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV1]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI1]] +; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP15]], [[VP_OP_LOAD]], zeroinitializer, i32 [[TMP11]]) +; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-INLOOP-NEXT: [[ADD]] = add i32 [[TMP17]], [[RDX]] +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-INLOOP-NEXT: 
[[INDEX_EVL_NEXT1]] = add i64 [[TMP19]], [[EVL_BASED_IV1]] +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]] +; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call @llvm.experimental.vp.splat.nxv4i32(i32 [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-INLOOP-NEXT: [[TMP22]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI1]], [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] +; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[RDX1:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-INLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 ; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], [[IV_TRUNC]] ; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP28]], i32 0 -; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] -; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 -; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP-NEXT: [[ADD1]] = add nsw i32 [[SELECT]], [[RDX1]] +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL-INLOOP: for.end: -; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD1]], [[FOR_BODY]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] ; ; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add( @@ -736,49 +830,169 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add_pred( ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = 
call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] -; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP37:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI2:%.*]] = phi [ [[TMP13]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI3:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_PHI]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-OUTLOOP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI3]] +; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI2]], [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP19]], 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP16]], [[TMP20]], zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI:%.*]] = select [[TMP21]], [[VEC_PHI2]], [[VP_OP]] +; IF-EVL-OUTLOOP-NEXT: [[TMP22]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[PREDPHI]], [[VEC_PHI2]], i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP14]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = zext i32 [[TMP14]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = mul i64 1, [[TMP24]] +; IF-EVL-OUTLOOP-NEXT: [[TMP26:%.*]] = call @llvm.experimental.vp.splat.nxv4i64(i64 [[TMP25]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP27]] = call @llvm.vp.add.nxv4i64( [[VEC_PHI]], [[TMP26]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP28:%.*]] = mul i32 1, [[TMP14]] +; IF-EVL-OUTLOOP-NEXT: [[TMP29:%.*]] = call @llvm.experimental.vp.splat.nxv4i32(i32 [[TMP28]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP30]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI3]], [[TMP29]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP22]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK1]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK1]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX1]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; IF-EVL-OUTLOOP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32 ; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP38]], [[IV_TRUNC]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] ; IF-EVL-OUTLOOP: if.then: ; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP38]] ; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]] ; IF-EVL-OUTLOOP: for.inc: -; IF-EVL-OUTLOOP-NEXT: [[TMP37]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; 
IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP32]], [[MIDDLE_BLOCK1]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add_pred( ; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { ; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] -; IF-EVL-INLOOP: for.body: -; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP32:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI3:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = call i32 
@llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_PHI]], [[BROADCAST_SPLAT]] ; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-INLOOP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI3]] +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], [[TMP19]], i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI2]] +; IF-EVL-INLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP13]] to i64 +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP13]] to i64 +; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = mul i64 1, [[TMP23]] +; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = call @llvm.experimental.vp.splat.nxv4i64(i64 [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[TMP26]] = call @llvm.vp.add.nxv4i64( [[VEC_PHI]], [[TMP25]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[TMP27:%.*]] = mul i32 1, [[TMP13]] +; IF-EVL-INLOOP-NEXT: [[TMP28:%.*]] = call @llvm.experimental.vp.splat.nxv4i32(i32 [[TMP27]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[TMP29]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI3]], [[TMP28]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP13]]) +; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-INLOOP-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK1]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK1]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX1]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; IF-EVL-INLOOP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32 ; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP35]], [[IV_TRUNC]] ; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] ; IF-EVL-INLOOP: if.then: ; 
IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP35]] ; IF-EVL-INLOOP-NEXT: br label [[MIDDLE_BLOCK]] ; IF-EVL-INLOOP: for.inc: -; IF-EVL-INLOOP-NEXT: [[TMP32]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] -; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL-INLOOP: for.end: -; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP21]], [[MIDDLE_BLOCK1]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] ; ; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add_pred( @@ -950,8 +1164,10 @@ for.end: ; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; IF-EVL-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META7]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; IF-EVL-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. ; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -959,8 +1175,10 @@ for.end: ; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; IF-EVL-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]]} -; IF-EVL-INLOOP: [[META7]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; IF-EVL-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. 
 ; NO-VP-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; NO-VP-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
index a52da79ee3963..a28a0039d9a34 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -12,18 +12,61 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
 ; IF-EVL-LABEL: @gather_scatter(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP21:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP22]], 2
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP21]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul <vscale x 2 x i64> [[TMP9]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_PHI]]
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP12]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP14]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP16]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 [[TMP17]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP19]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 2 x i64> [[TMP18]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP8]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY1:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV1]]
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; IF-EVL-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]]
 ; IF-EVL-NEXT:    store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
-; IF-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; IF-EVL-NEXT:    [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IF-EVL:       for.end:
 ; IF-EVL-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-widen-iv-with-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-widen-iv-with-evl.ll
new file mode 100644
index 0000000000000..802606756a464
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-widen-iv-with-evl.ll
@@ -0,0 +1,147 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN:   -force-tail-folding-style=data-with-evl \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple=riscv64 -mattr=+v -disable-output < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   WIDEN-INTRINSIC vp<%3> = call llvm.stepvector()
+; CHECK-NEXT:   EMIT vp<%4> = mul vp<%3>, ir<1>
+; CHECK-NEXT:   EMIT vp<%induction> = add ir<0>, vp<%4>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%5> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<%6> = phi ir<0>, vp<%index.evl.next>
+; CHECK-NEXT:     WIDEN-PHI ir<%iv> = phi vp<%induction>, vp<%14>
+; CHECK-NEXT:     EMIT vp<%avl> = sub ir<%N>, vp<%6>
+; CHECK-NEXT:     EMIT vp<%7> = EXPLICIT-VECTOR-LENGTH vp<%avl>
+; CHECK-NEXT:     vp<%8> = SCALAR-STEPS vp<%6>, ir<1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%8>
+; CHECK-NEXT:     vp<%9> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN vp.store vp<%9>, ir<%iv>, vp<%7>
+; CHECK-NEXT:     SCALAR-CAST vp<%10> = zext vp<%7> to i64
+; CHECK-NEXT:     EMIT vp<%index.evl.next> = add vp<%10>, vp<%6>
+; CHECK-NEXT:     SCALAR-CAST vp<%11> = zext vp<%7> to i64
+; CHECK-NEXT:     EMIT vp<%12> = mul ir<1>, vp<%11>
+; CHECK-NEXT:     WIDEN-INTRINSIC vp<%13> = call llvm.experimental.vp.splat(vp<%12>, ir<true>, vp<%7>)
+; CHECK-NEXT:     WIDEN-INTRINSIC vp<%14> = call llvm.vp.add(ir<%iv>, vp<%13>, ir<true>, vp<%7>)
+; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%5>, vp<%0>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT branch-on-cond ir<true>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+; CHECK-NEXT:   IR store i64 %iv, ptr %arrayidx, align 8
+; CHECK-NEXT:   IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT:   IR %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+  store i64 %iv, ptr %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test_wide_fp_induction(ptr noalias %a, i64 %N) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   WIDEN-INTRINSIC vp<%3> = call llvm.stepvector()
+; CHECK-NEXT:   WIDEN-CAST vp<%4> = uitofp vp<%3> to float
+; CHECK-NEXT:   EMIT vp<%5> = fmul reassoc nnan ninf nsz arcp contract afn vp<%4>, ir<3.000000e+00>
+; CHECK-NEXT:   EMIT vp<%induction> = fsub reassoc nnan ninf nsz arcp contract afn ir<0.000000e+00>, vp<%5>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<%7> = phi ir<0>, vp<%index.evl.next>
+; CHECK-NEXT:     WIDEN-PHI ir<%f> = phi vp<%induction>, vp<%15>
+; CHECK-NEXT:     EMIT vp<%avl> = sub ir<%N>, vp<%7>
+; CHECK-NEXT:     EMIT vp<%8> = EXPLICIT-VECTOR-LENGTH vp<%avl>
+; CHECK-NEXT:     vp<%9> = SCALAR-STEPS vp<%7>, ir<1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%9>
+; CHECK-NEXT:     vp<%10> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN vp.store vp<%10>, ir<%f>, vp<%8>
+; CHECK-NEXT:     SCALAR-CAST vp<%11> = zext vp<%8> to i64
+; CHECK-NEXT:     EMIT vp<%index.evl.next> = add vp<%11>, vp<%7>
+; CHECK-NEXT:     SCALAR-CAST vp<%12> = uitofp vp<%8> to float
+; CHECK-NEXT:     EMIT vp<%13> = fmul reassoc nnan ninf nsz arcp contract afn ir<3.000000e+00>, vp<%12>
+; CHECK-NEXT:     WIDEN-INTRINSIC vp<%14> = call llvm.experimental.vp.splat(vp<%13>, ir<true>, vp<%8>)
+; CHECK-NEXT:     WIDEN-INTRINSIC vp<%15> = call llvm.vp.fsub(ir<%f>, vp<%14>, ir<true>, vp<%8>)
+; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%6>, vp<%0>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT branch-on-cond ir<true>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR %f = phi float [ %f.next, %for.body ], [ 0.000000e+00, %entry ]
+; CHECK-NEXT:   IR %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:   IR store float %f, ptr %arrayidx, align 4
+; CHECK-NEXT:   IR %f.next = fsub fast float %f, 3.000000e+00
+; CHECK-NEXT:   IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT:   IR %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %f = phi float [ %f.next, %for.body ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+  store float %f, ptr %arrayidx, align 4
+  %f.next = fsub fast float %f, 3.000000e+00
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
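
The new CHECK lines above all pin down one idiom: each vector iteration advances the widened IV by step * EVL, by zero-extending the EVL returned from get.vector.length, multiplying it by the step, broadcasting the result with llvm.experimental.vp.splat, and adding it to the vector phi with llvm.vp.add. The snippet below is only an illustrative sketch of that sequence, not part of the patch: the function name is made up, the step is fixed at 1, and it assumes a build where the experimental vp.splat intrinsic is available.

; Illustrative sketch: advance a widened i64 IV by (step * EVL), mirroring
; the [[TMP16]]..[[TMP19]] sequence checked in the gather_scatter test.
define <vscale x 2 x i64> @advance_wide_iv(<vscale x 2 x i64> %vec.iv, i32 %evl) {
  %evl.zext = zext i32 %evl to i64
  ; Scalar increment = step (1 here) * EVL.
  %inc = mul i64 1, %evl.zext
  ; Broadcast the scalar increment across the %evl active lanes.
  %inc.splat = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 %inc, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %evl)
  ; Lane-wise add under an all-true mask, limited to %evl lanes.
  %iv.next = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> %vec.iv, <vscale x 2 x i64> %inc.splat, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %evl)
  ret <vscale x 2 x i64> %iv.next
}

declare <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)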