llvm · LiqinWeng · Sep 24, 2024 · Sep 19, 2024 · Sep 25, 2024
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -889,6 +889,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenSC:
     case VPRecipeBase::VPWidenEVLSC:
     case VPRecipeBase::VPWidenSelectSC:
+    case VPRecipeBase::VPWidenSelectEVLSC:
     case VPRecipeBase::VPBlendSC:
     case VPRecipeBase::VPPredInstPHISC:
     case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1712,10 +1713,17 @@ class VPHistogramRecipe : public VPRecipeBase {
 
 /// A recipe for widening select instructions.
 struct VPWidenSelectRecipe : public VPSingleDefRecipe {
+
+protected:
+  template <typename IterT>
+  VPWidenSelectRecipe(unsigned VPDefOpcode, SelectInst &I,
+                      iterator_range<IterT> Operands)
+      : VPSingleDefRecipe(VPDefOpcode, Operands, &I, I.getDebugLoc()) {}
+
+public:
   template <typename IterT>
   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
-      : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
-                          I.getDebugLoc()) {}
+      : VPWidenSelectRecipe(VPDef::VPWidenSelectSC, I, Operands) {}
 
   ~VPWidenSelectRecipe() override = default;
 
@@ -1724,7 +1732,15 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
                                    operands());
   }
 
-  VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenSelectEVLSC;
+  }
+
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
 
   /// Produce a widened version of the select instruction.
   void execute(VPTransformState &State) override;
@@ -1744,6 +1760,52 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
   }
 };
 
+// A recipe for widening select instruction with vector-predication intrinsics
+// with explicit vector length (EVL).
+struct VPWidenSelectEVLRecipe : public VPWidenSelectRecipe {
+
+  template <typename IterT>
+  VPWidenSelectEVLRecipe(SelectInst &I, iterator_range<IterT> Operands,
+                         VPValue &EVL)
+      : VPWidenSelectRecipe(VPDef::VPWidenSelectEVLSC, I, Operands) {
+    addOperand(&EVL);
+  }
+
+  VPWidenSelectEVLRecipe(VPWidenSelectRecipe &W, VPValue &EVL)
+      : VPWidenSelectEVLRecipe(*cast<SelectInst>(W.getUnderlyingInstr()),
+                               W.operands(), EVL) {}
+
+  ~VPWidenSelectEVLRecipe() override = default;
+
+  VPWidenSelectEVLRecipe *clone() final {
+    llvm_unreachable("VPWidenSelectEVLRecipe cannot be cloned");
+    return nullptr;
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenSelectEVLSC)
+
+  VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
+  const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
+
+  /// Produce a vp-intrinsic version of the select instruction.
+  void execute(VPTransformState &State) final;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // EVL in that recipe is always the last operand, thus any use before means
+    // the VPValue should be vectorized.
+    return getEVL() == Op;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const final;
+#endif
+};
+
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
   bool isPointerLoopInvariant() const {

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -95,7 +95,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenPHISC:
   case VPWidenSC:
   case VPWidenEVLSC:
-  case VPWidenSelectSC: {
+  case VPWidenSelectSC:
+  case VPWidenSelectEVLSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
@@ -136,7 +137,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenPHISC:
   case VPWidenSC:
   case VPWidenEVLSC:
-  case VPWidenSelectSC: {
+  case VPWidenSelectSC:
+  case VPWidenSelectEVLSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
@@ -173,7 +175,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenEVLSC:
-  case VPWidenSelectSC: {
+  case VPWidenSelectSC:
+  case VPWidenSelectEVLSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
@@ -1127,6 +1130,21 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
   getOperand(2)->printAsOperand(O, SlotTracker);
   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
 }
+
+void VPWidenSelectEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                   VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN-SELECT ";
+  printAsOperand(O, SlotTracker);
+  O << " = vp.select ";
+  getOperand(0)->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getOperand(1)->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getOperand(2)->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getOperand(3)->printAsOperand(O, SlotTracker);
+  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
+}
 #endif
 
 void VPWidenSelectRecipe::execute(VPTransformState &State) {
@@ -1147,6 +1165,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
 }
 
+void VPWidenSelectEVLRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  Value *EVLArg = State.get(getEVL(), /*NeedsScalar=*/true);
+  IRBuilderBase &BuilderIR = State.Builder;
+  VectorBuilder Builder(BuilderIR);
+  Builder.setEVL(EVLArg);
+  // The condition can be loop invariant but still defined inside the
+  // loop. This means that we can't just use the original 'cond' value.
+  // We have to take the 'vectorized' value and pick the first lane.
+  // Instcombine will make this a no-op.
+  auto *InvarCond =
+      isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
+
+  Value *Cond = InvarCond ? InvarCond : State.get(getCond(), 0);
+  assert(isa<VectorType>(Cond->getType()) && "CondType must be vector Type.");
+
+  Value *Op0 = State.get(getOperand(1), 0);
+  Value *Op1 = State.get(getOperand(2), 0);
+  Value *VPInst = Builder.createVectorInstruction(
+      Instruction::Select, Op0->getType(), {Cond, Op0, Op1}, "vp.select");
+  State.set(this, VPInst, 0);
+  State.addMetadata(VPInst,
+                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+}
+
 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
     const FastMathFlags &FMF) {
   AllowReassoc = FMF.allowReassoc();

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1379,6 +1379,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                   return nullptr;
                 return new VPWidenEVLRecipe(*W, EVL);
               })
+              .Case<VPWidenSelectRecipe>(
+                  [&](VPWidenSelectRecipe *W) -> VPRecipeBase * {
+                    return new VPWidenSelectEVLRecipe(*W, EVL);
+                  })
               .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
                 VPValue *NewMask = GetNewMask(Red->getCondOp());
                 return new VPReductionEVLRecipe(*Red, EVL, NewMask);

diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -357,6 +357,7 @@ class VPDef {
     VPWidenSC,
     VPWidenEVLSC,
     VPWidenSelectSC,
+    VPWidenSelectEVLSC,
     VPBlendSC,
     VPHistogramSC,
     // START: Phi-like recipes. Need to be kept together.

diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -148,6 +148,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
                return VerifyEVLUse(
                    *W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2);
              })
+             .Case<VPWidenSelectEVLRecipe>(
+                 [&](const VPWidenSelectEVLRecipe *S) {
+                   return VerifyEVLUse(*S, 3);
+                 })
              .Case<VPReductionEVLRecipe>([&](const VPReductionEVLRecipe *R) {
                return VerifyEVLUse(*R, 2);
              })

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; IF-EVL-INLOOP-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP12]] to i64

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT:   vector.body:
+; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT:     CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN-SELECT ir<[[SELECT:%.+]]> = vp.select ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN ir<[[ADD:%.+]]> = vp.add ir<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
+; IF-EVL-NEXT:   No successors
+; IF-EVL-NEXT: }
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %cmp4 = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp4, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  %arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %cond, ptr %arrayidx15, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}