diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 68a62638b9d58..71f3e3bef5e0e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1649,6 +1649,16 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
         MayWriteToMemory(CI.mayWriteToMemory()),
         MayHaveSideEffects(CI.mayHaveSideEffects()) {}
 
+  VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
+                         ArrayRef<VPValue *> CallArguments, Type *Ty,
+                         bool MayReadFromMemory, bool MayWriteToMemory,
+                         bool MayHaveSideEffects, DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+        VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
+        MayReadFromMemory(MayReadFromMemory),
+        MayWriteToMemory(MayWriteToMemory),
+        MayHaveSideEffects(MayHaveSideEffects) {}
+
   ~VPWidenIntrinsicRecipe() override = default;
 
   VPWidenIntrinsicRecipe *clone() override {
@@ -1686,6 +1696,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override;
 };
 
 /// A recipe for widening Call instructions using library calls.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5a5b3ac19c46a..3eb5f3f40f842 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case Instruction::ICmp:
   case VPInstruction::ActiveLaneMask:
     return inferScalarType(R->getOperand(1));
+  case VPInstruction::ExplicitVectorLength:
+    return Type::getIntNTy(Ctx, 32);
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::Not:
     return SetResultTyFromOp();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2948ecc580edc..dbccf419cd590 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -79,7 +79,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyReadsMemory();
-
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
@@ -1042,6 +1041,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
}
 
+bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+  // Vector predication intrinsics only demand the first lane of the last
+  // operand (the EVL operand).
+  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
+         Op == getOperand(getNumOperands() - 1);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
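
Note: the onlyFirstLaneUsed change above leans on the VP intrinsic convention
that the explicit vector length is always the trailing operand and is a scalar
i32, so only lane 0 of a widened EVL value can ever matter. A minimal IR sketch
of that shape, using hypothetical values %x, %y, %mask, and %evl (the intrinsic
signature itself is per the LLVM LangRef):

  ; %evl is scalar; lanes disabled by %mask or at positions >= %evl
  ; produce poison in the result.
  %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %x, <4 x i32> %y,
                                         <4 x i1> %mask, i32 %evl)
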
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 379bfc0a4394b..4443a7be4ad45 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1353,6 +1353,7 @@ void VPlanTransforms::addActiveLaneMask(
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
       auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
@@ -1384,6 +1385,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
             VPValue *NewMask = GetNewMask(Red->getCondOp());
             return new VPReductionEVLRecipe(*Red, EVL, NewMask);
           })
+          .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+            SmallVector<VPValue *> Ops(Sel->operands());
+            Ops.push_back(&EVL);
+            return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
+                                              TypeInfo.inferScalarType(Sel),
+                                              false, false, false);
+          })
+
           .Default([&](VPRecipeBase *R) { return nullptr; });
 
     if (!NewRecipe)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3c..7ea5ee341cc54 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -138,6 +138,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
   };
   for (const VPUser *U : EVL.users()) {
     if (!TypeSwitch<const VPUser *, bool>(U)
+             .Case<VPWidenIntrinsicRecipe>(
+                 [&](const VPWidenIntrinsicRecipe *S) {
+                   return VerifyEVLUse(*S, S->getNumOperands() - 1);
+                 })
              .Case<VPWidenStoreEVLRecipe>([&](const VPWidenStoreEVLRecipe *S) {
                return VerifyEVLUse(*S, 2);
              })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 41796e848632e..fc12dd54f88df 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; IF-EVL-INLOOP-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
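
Note: the cond-reduction diff above shows the net effect of the new
VPWidenSelectRecipe case in transformRecipestoEVLRecipes: a widened select has
no length operand of its own, so it is rewritten as llvm.vp.select with the
loop's EVL appended as the trailing argument. A minimal before/after sketch
with hypothetical values %c, %t, %f, and %evl (vp.select takes no separate
mask; its signature is per the LLVM LangRef):

  ; before EVL lowering: a plain widened select
  %r = select <4 x i1> %c, <4 x i32> %t, <4 x i32> %f
  ; after: the trailing scalar i32 EVL bounds the active lanes
  %r = call <4 x i32> @llvm.vp.select.v4i32(<4 x i1> %c, <4 x i32> %t,
                                            <4 x i32> %f, i32 %evl)
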
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
new file mode 100644
index 0000000000000..c26ab2017280a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN:   -force-tail-folding-style=data-with-evl \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %cmp4 = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp4, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  %arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %cond, ptr %arrayidx15, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
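
Note: the VPlanAnalysis change (inferring i32 for ExplicitVectorLength) and the
SCALAR-CAST zext line checked in the new test are two sides of the same fact:
the EVL is produced as a scalar i32 and must be widened before it can advance
the i64 induction variable. A minimal sketch of the IR this pairing lowers to,
with hypothetical values %avl and %iv (the get.vector.length signature is per
the LLVM LangRef):

  ; i32 EVL covering at most vscale x 4 lanes of the remaining trip count %avl
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  %evl.zext = zext i32 %evl to i64
  %iv.next = add i64 %iv, %evl.zext
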