diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index db2fd300cb8f5..9b0751f996181 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7559,62 +7559,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, } } - // The legacy cost model has special logic to compute the cost of in-loop - // reductions, which may be smaller than the sum of all instructions involved - // in the reduction. - // TODO: Switch to costing based on VPlan once the logic has been ported. - for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { - if (ForceTargetInstructionCost.getNumOccurrences()) - continue; - - if (!CM.isInLoopReduction(RedPhi)) - continue; - - const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); - SetVector ChainOpsAndOperands(llvm::from_range, ChainOps); - auto IsZExtOrSExt = [](const unsigned Opcode) -> bool { - return Opcode == Instruction::ZExt || Opcode == Instruction::SExt; - }; - // Also include the operands of instructions in the chain, as the cost-model - // may mark extends as free. - // - // For ARM, some of the instruction can folded into the reducion - // instruction. So we need to mark all folded instructions free. - // For example: We can fold reduce(mul(ext(A), ext(B))) into one - // instruction. - for (auto *ChainOp : ChainOps) { - for (Value *Op : ChainOp->operands()) { - if (auto *I = dyn_cast(Op)) { - ChainOpsAndOperands.insert(I); - if (I->getOpcode() == Instruction::Mul) { - auto *Ext0 = dyn_cast(I->getOperand(0)); - auto *Ext1 = dyn_cast(I->getOperand(1)); - if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 && - Ext0->getOpcode() == Ext1->getOpcode()) { - ChainOpsAndOperands.insert(Ext0); - ChainOpsAndOperands.insert(Ext1); - } - } - } - } - } - - // Pre-compute the cost for I, if it has a reduction pattern cost. - for (Instruction *I : ChainOpsAndOperands) { - auto ReductionCost = - CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF)); - if (!ReductionCost) - continue; - - assert(!CostCtx.SkipCostComputation.contains(I) && - "reduction op visited multiple times"); - CostCtx.SkipCostComputation.insert(I); - LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF - << ":\n in-loop reduction " << *I << "\n"); - Cost += *ReductionCost; - } - } - // Pre-compute the costs for branches except for the backedge, as the number // of replicate regions in a VPlan may not directly match the number of // branches, which would lead to different decisions. @@ -9757,10 +9701,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - for (ElementCount VF : Range) - Plan->addVF(VF); - Plan->setName("Initial VPlan"); - // Update wide induction increments to use the same step as the corresponding // wide induction. This enables detecting induction increments directly in // VPlan and removes redundant splats. @@ -9796,6 +9736,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + // Transform recipes to abstract recipes if it is legal and beneficial and + // clamp the range for better cost estimation. + // TODO: Enable following transform when the EVL-version of extended-reduction + // and mulacc-reduction are implemented. 
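+  // For illustration only (simplified sketch, not exhaustive): the conversion turns +  //   reduce.<op>(ext(A))             into a VPExtendedReductionRecipe and +  //   reduce.add(mul(ext(A), ext(B))) into a VPMulAccumulateReductionRecipe, +  // so their costs can be queried via TTI::getExtendedReductionCost() and +  // TTI::getMulAccReductionCost(); both are expanded back to concrete recipes +  // in convertToConcreteRecipes() before execution.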
+ if (!CM.foldTailWithEVL()) { + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); + VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, + CostCtx, Range); + } + + for (ElementCount VF : Range) + Plan->addVF(VF); + Plan->setName("Initial VPlan"); + // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7084676af6d5b..adf4bc877ded6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -522,6 +522,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: + case VPRecipeBase::VPMulAccumulateReductionSC: + case VPRecipeBase::VPExtendedReductionSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -606,13 +608,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} }; + struct NonNegFlagsTy { + char NonNeg : 1; + NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {} + }; + private: struct ExactFlagsTy { char IsExact : 1; }; - struct NonNegFlagsTy { - char NonNeg : 1; - }; struct FastMathFlagsTy { char AllowReassoc : 1; char NoNaNs : 1; @@ -706,6 +710,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp), DisjointFlags(DisjointFlags) {} + template + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + NonNegFlagsTy NonNegFlags, DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp), + NonNegFlags(NonNegFlags) {} + protected: template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, @@ -725,7 +735,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || - R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; + R->getVPDefID() == VPRecipeBase::VPVectorPointerSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -811,6 +823,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { FastMathFlags getFastMathFlags() const; + /// Returns true if the recipe has non-negative flag. 
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; } + + bool isNonNeg() const { + assert(OpType == OperationType::NonNegOp && + "recipe doesn't have a NNEG flag"); + return NonNegFlags.NonNeg; + } + bool hasNoUnsignedWrap() const { assert(OpType == OperationType::OverflowingBinOp && "recipe doesn't have a NUW flag"); @@ -1203,11 +1224,22 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { iterator_range Operands) : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {} + template + VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, + iterator_range Operands, bool NUW, bool NSW, DebugLoc DL) + : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL), + Opcode(Opcode) {} + public: template VPWidenRecipe(Instruction &I, iterator_range Operands) : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} + template + VPWidenRecipe(unsigned Opcode, iterator_range Operands, bool NUW, + bool NSW, DebugLoc DL) + : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {} + ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { @@ -1252,10 +1284,17 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags { "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode), + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), Opcode(Opcode), ResultTy(ResultTy) {} + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + bool IsNonNeg, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg), + DL), + Opcode(Opcode), ResultTy(ResultTy) {} + ~VPWidenCastRecipe() override = default; VPWidenCastRecipe *clone() override { @@ -2341,6 +2380,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { setUnderlyingValue(I); } + /// For VPExtendedReductionRecipe. + /// Note that the debug location is from the extend. + VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + + /// For VPMulAccumulateReductionRecipe. + /// Note that the NUW/NSW flags and the debug location are from the Mul. 
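+  /// (Illustrative note: callers pass \p Operands as {ChainOp, VecOp0, VecOp1}; +  /// if \p CondOp is non-null it is appended as the trailing operand.)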
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, @@ -2349,6 +2410,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { ArrayRef({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} + VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, + bool IsOrdered, DebugLoc DL = {}) + : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, + ArrayRef({ChainOp, VecOp}), CondOp, + IsOrdered, DL) {} + ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { @@ -2359,7 +2427,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -2399,7 +2469,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { /// A recipe to represent inloop reduction operations with vector-predication /// intrinsics, performing a reduction on a vector operand with the explicit /// vector length (EVL) into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, EVL, [Condition]}. +/// The operands are {ChainOp, VecOp, EVL, [Condition]}. class VPReductionEVLRecipe : public VPReductionRecipe { public: VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp, @@ -2439,6 +2509,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe { } }; +/// A recipe to represent inloop extended reduction operations, performing a +/// reduction on a extended vector operand into a scalar value, and adding the +/// result to a chain. This recipe is abstract and needs to be lowered to +/// concrete recipes before codegen. The operands are {ChainOp, VecOp, +/// [Condition]}. +class VPExtendedReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend recipe will be lowered to. + Instruction::CastOps ExtOp; + + Type *ResultTy; + + /// For cloning VPExtendedReductionRecipe. + VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed) + : VPReductionRecipe( + VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(), + {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(), + ExtRed->isOrdered(), ExtRed->getDebugLoc()), + ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { + transferFlags(*ExtRed); + } + +public: + VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext) + : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(), + R->isOrdered(), Ext->getDebugLoc()), + ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { + // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from + // the original recipe to prevent setting wrong flags. 
+ transferFlags(*Ext); + } + + ~VPExtendedReductionRecipe() override = default; + + VPExtendedReductionRecipe *clone() override { + auto *Copy = new VPExtendedReductionRecipe(this); + Copy->transferFlags(*this); + return Copy; + } + + VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPExtendedReductionRecipe should be transform to " + "VPExtendedRecipe + VPReductionRecipe before execution."); + }; + + /// Return the cost of VPExtendedReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The scalar type after extending. + Type *getResultType() const { return ResultTy; } + + /// Is the extend ZExt? + bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } + + /// The opcode of extend recipe. + Instruction::CastOps getExtOpcode() const { return ExtOp; } +}; + +/// A recipe to represent inloop MulAccumulateReduction operations, performing a +/// reduction.add on the result of vector operands (might be extended) +/// multiplication into a scalar value, and adding the result to a chain. This +/// recipe is abstract and needs to be lowered to concrete recipes before +/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}. +class VPMulAccumulateReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend recipe. + Instruction::CastOps ExtOp; + + /// Non-neg flag of the extend recipe. + bool IsNonNeg = false; + + Type *ResultTy; + + /// For cloning VPMulAccumulateReductionRecipe. + VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(), + {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()}, + MulAcc->getCondOp(), MulAcc->isOrdered(), + WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), + MulAcc->getDebugLoc()), + ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), + ResultTy(MulAcc->getResultType()) {} + +public: + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, + VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, Type *ResultTy) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateteReductionRecipe must " + "be Add"); + // Only set the non-negative flag if the original recipe contains. 
+ if (Ext0->hasNonNegFlag()) + IsNonNeg = Ext0->isNonNeg(); + } + + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Instruction::CastOps::CastOpsEnd) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateReductionRecipe must be " + "Add"); + } + + ~VPMulAccumulateReductionRecipe() override = default; + + VPMulAccumulateReductionRecipe *clone() override { + auto *Copy = new VPMulAccumulateReductionRecipe(this); + Copy->transferFlags(*this); + return Copy; + } + + VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPMulAccumulateReductionRecipe should transform to " + "VPWidenCastRecipe + " + "VPWidenRecipe + VPReductionRecipe before execution"); + } + + /// Return the cost of VPMulAccumulateReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + Type *getResultType() const { + assert(isExtended() && "Only support getResultType when this recipe " + "contains implicit extend."); + return ResultTy; + } + + /// The VPValue of the vector value to be extended and reduced. + VPValue *getVecOp0() const { return getOperand(1); } + VPValue *getVecOp1() const { return getOperand(2); } + + /// Return if this MulAcc recipe contains extended operands. + bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } + + /// Return the opcode of the extends for the operands. + Instruction::CastOps getExtOpcode() const { return ExtOp; } + + /// Return if the operands are zero extended. + bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } + + /// Return the non negative flag of the ext recipe. + bool isNonNeg() const { return IsNonNeg; } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 375d4c9787994..b37cdfed97410 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -272,6 +272,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { // TODO: Use info from interleave group. 
return V->getUnderlyingValue()->getType(); }) + .Case( + [](const auto *R) { return R->getResultType(); }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2cc558f49ccce..903b2042b9dc2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -71,6 +71,8 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -118,6 +120,8 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -155,6 +159,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: @@ -782,19 +788,25 @@ Value *VPInstruction::generate(VPTransformState &State) { InstructionCost VPInstruction::computeCost(ElementCount VF, VPCostContext &Ctx) const { if (Instruction::isBinaryOp(getOpcode())) { + + Type *ResTy = Ctx.Types.inferScalarType(this); + if (!vputils::onlyFirstLaneUsed(this)) + ResTy = toVectorTy(ResTy, VF); + if (!getUnderlyingValue()) { - // TODO: Compute cost for VPInstructions without underlying values once - // the legacy cost model has been retired. - return 0; + switch (getOpcode()) { + case Instruction::FMul: + return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); + default: + // TODO: Compute cost for VPInstructions without underlying values once + // the legacy cost model has been retired. + return 0; + } } assert(!doesGeneratePerAllLanes() && "Should only generate a vector value or single scalar, not scalars " "for all lanes."); - Type *ResTy = Ctx.Types.inferScalarType(this); - if (!vputils::onlyFirstLaneUsed(this)) - ResTy = toVectorTy(ResTy, VF); - return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); } @@ -2478,28 +2490,47 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, auto *VectorTy = cast(toVectorTy(ElementTy, VF)); unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); FastMathFlags FMFs = getFastMathFlags(); + std::optional OptionalFMF = + ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; - // TODO: Support any-of and in-loop reductions. + // TODO: Support any-of reductions. assert( (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || ForceTargetInstructionCost.getNumOccurrences() > 0) && "Any-of reduction not implemented in VPlan-based cost model currently."); - assert( - (!cast(getOperand(0))->isInLoop() || - ForceTargetInstructionCost.getNumOccurrences() > 0) && - "In-loop reduction not implemented in VPlan-based cost model currently."); - // Cost = Reduction cost + BinOp cost - InstructionCost Cost = - Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); + // Note that TTI should model the cost of moving result to the scalar register + // and the BinOp cost in the getReductionCost(). 
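+  // For example (sketch), an in-loop integer add reduction now costs just +  //   TTI.getArithmeticReductionCost(Instruction::Add, VectorTy, std::nullopt, CostKind) +  // rather than that plus a separate scalar binop cost.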
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); - return Cost + - Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); + return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); } - return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs, - Ctx.CostKind); + return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, + Ctx.CostKind); +} + +InstructionCost +VPExtendedReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind()); + Type *RedTy = Ctx.Types.inferScalarType(this); + auto *SrcVecTy = + cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF)); + assert(RedTy->isIntegerTy() && + "ExtendedReduction only support integer type currently."); + return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy, + std::nullopt, Ctx.CostKind); +} + +InstructionCost +VPMulAccumulateReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *RedTy = Ctx.Types.inferScalarType(this); + auto *SrcVecTy = + cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF)); + return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, + Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2544,6 +2575,56 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, } O << ")"; } + +void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EXTENDED-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + O << " reduce." + << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + O << " extended to " << *getResultType(); + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + +void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "MULACC-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + "; + O << "reduce." + << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + O << "mul"; + printFlags(O); + if (isExtended()) + O << "("; + getVecOp0()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " extended to " << *getResultType() << "), ("; + else + O << ", "; + getVecOp1()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " extended to " << *getResultType() << ")"; + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} #endif bool VPReplicateRecipe::shouldPack() const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b80fe18d1bd66..b038782d652fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2388,6 +2388,82 @@ void VPlanTransforms::createInterleaveGroups( } } +// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. +static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { + VPWidenCastRecipe *Ext; + // Only ZExt contains non-neg flags. 
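+  // (In LLVM IR only zext can carry nneg, e.g. "%e = zext nneg i8 %a to i32"; +  // sext has no such flag, hence the separate constructor call below.)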
+ if (ExtRed->isZExt()) + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->isNonNeg(), + ExtRed->getDebugLoc()); + else + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->getDebugLoc()); + + auto *Red = new VPReductionRecipe( + ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext, + ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc()); + Ext->insertBefore(ExtRed); + Red->insertBefore(ExtRed); + ExtRed->replaceAllUsesWith(Red); + ExtRed->eraseFromParent(); +} + +// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) + +// VPReductionRecipe (reduce.add) +// + VPWidenCastRecipe (optional). +static void +expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { + // Generate inner VPWidenCastRecipes if necessary. + // Note that we will drop the extend after the mul, which transforms + // reduce.add(ext(mul(ext, ext))) into reduce.add(mul(ext, ext)). + VPValue *Op0, *Op1; + if (MulAcc->isExtended()) { + Type *RedTy = MulAcc->getResultType(); + if (MulAcc->isZExt()) + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->getDebugLoc()); + Op0->getDefiningRecipe()->insertBefore(MulAcc); + // Prevent reduce.add(mul(ext(A), ext(A))) from generating duplicate + // VPWidenCastRecipes. + if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) { + Op1 = Op0; + } else { + if (MulAcc->isZExt()) + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->getDebugLoc()); + Op1->getDefiningRecipe()->insertBefore(MulAcc); + } + } else { + // No extends in this MulAccRecipe. + Op0 = MulAcc->getVecOp0(); + Op1 = MulAcc->getVecOp1(); + } + + std::array<VPValue *, 2> MulOps = {Op0, Op1}; + auto *Mul = new VPWidenRecipe( + Instruction::Mul, make_range(MulOps.begin(), MulOps.end()), + MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(), + MulAcc->getDebugLoc()); + Mul->insertBefore(MulAcc); + + auto *Red = new VPReductionRecipe( + MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul, + MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc()); + Red->insertBefore(MulAcc); + + MulAcc->replaceAllUsesWith(Red); + MulAcc->eraseFromParent(); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; @@ -2450,6 +2526,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, VPI->replaceAllUsesWith(VectorStep); ToRemove.push_back(VPI); } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) + expandVPExtendedReduction(ExtRed); + if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R)) + expandVPMulAccumulateReduction(MulAcc); + } } for (VPRecipeBase *R : ToRemove) @@ -2548,6 +2630,171 @@ void VPlanTransforms::handleUncountableEarlyExit( LatchExitingBranch->eraseFromParent(); } +/// This function tries to convert an extended in-loop reduction to a +/// VPExtendedReductionRecipe and clamps the \p Range if that is beneficial and +/// valid. The created recipe must be lowered to concrete +/// recipes before execution. 
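+/// For example (illustrative): reduce.add(sext(A)) is replaced by a single +/// EXTENDED-REDUCE recipe when TTI::getExtendedReductionCost() is lower than +/// the combined cost of the separate extend and reduction.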
+static VPExtendedReductionRecipe * +tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, + VFRange &Range) { + using namespace VPlanPatternMatch; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + VPValue *VecOp = Red->getVecOp(); + + // Clamp the range if using extended-reduction is profitable. + auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, + Type *SrcTy) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), + CostKind); + InstructionCost ExtCost = + cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; + }, + Range); + }; + + VPValue *A; + // Match reduce(ext(A)). + if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && + IsExtendedRedValidAndClampRange( + RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), + cast<VPWidenCastRecipe>(VecOp)->getOpcode() == + Instruction::CastOps::ZExt, + Ctx.Types.inferScalarType(A))) + return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(VecOp)); + + return nullptr; +} + +/// This function tries to convert an in-loop reduction to a +/// VPMulAccumulateReductionRecipe and clamps the \p Range if that is beneficial +/// and valid. The created recipe must be lowered to concrete +/// recipes before execution. Patterns of MulAccumulateReduction: +/// reduce.add(mul(...)), +/// reduce.add(mul(ext(A), ext(B))), +/// reduce.add(ext(mul(ext(A), ext(B)))). +static VPMulAccumulateReductionRecipe * +tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, + VPCostContext &Ctx, VFRange &Range) { + using namespace VPlanPatternMatch; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + + // Clamp the range if using multiply-accumulate-reduction is profitable. + auto IsMulAccValidAndClampRange = + [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Type *SrcTy = + Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; + auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); + InstructionCost MulAccCost = + Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulCost = Mul->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + InstructionCost ExtCost = 0; + if (Ext0) + ExtCost += Ext0->computeCost(VF, Ctx); + if (Ext1) + ExtCost += Ext1->computeCost(VF, Ctx); + if (OuterExt) + ExtCost += OuterExt->computeCost(VF, Ctx); + + return MulAccCost.isValid() && + MulAccCost < ExtCost + MulCost + RedCost; + }, + Range); + }; + + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + if (Opcode != Instruction::Add) + return nullptr; + + VPValue *VecOp = Red->getVecOp(); + VPValue *A, *B; + // Try to match reduce.add(mul(...)) + if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { + auto *RecipeA = + dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); + auto *RecipeB = + dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); + auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); + + // Match reduce.add(mul(ext, ext)) + if (RecipeA && RecipeB && + (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && + match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + match(RecipeB, m_ZExtOrSExt(m_VPValue())) && + IsMulAccValidAndClampRange(RecipeA->getOpcode() == + Instruction::CastOps::ZExt, + Mul, RecipeA, RecipeB, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, + RecipeA->getResultType()); + // Match reduce.add(mul) + if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul); + } + // Match reduce.add(ext(mul(ext(A), ext(B)))) + // All extend recipes must have the same opcode, or A == B, + // which can be transformed to reduce.add(zext(mul(sext(A), sext(B)))). + if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), + m_ZExtOrSExt(m_VPValue()))))) { + auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe()); + auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe()); + auto *Ext0 = + cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe()); + auto *Ext1 = + cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe()); + if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + Ext0->getOpcode() == Ext1->getOpcode() && + IsMulAccValidAndClampRange(Ext0->getOpcode() == + Instruction::CastOps::ZExt, + Mul, Ext0, Ext1, Ext)) + return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1, + Ext->getResultType()); + } + return nullptr; +} + +/// This function tries to create an abstract recipe from the reduction recipe +/// for subsequent optimizations and cost estimation. +static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, + VPCostContext &Ctx, + VFRange &Range) { + VPReductionRecipe *AbstractR = nullptr; + + if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range)) + AbstractR = MulAcc; + else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range)) + AbstractR = ExtRed; + // Could not create an abstract in-loop reduction recipe. 
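+  // (i.e. when neither pattern matched or TTI reported no saving, the original +  // VPReductionRecipe is simply left in place.)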
+ if (!AbstractR) + return; + + AbstractR->insertBefore(Red); + Red->replaceAllUsesWith(AbstractR); +} + +void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getVectorLoopRegion()))) { + for (VPRecipeBase &R : *VPBB) { + if (auto *Red = dyn_cast(&R)) + tryToCreateAbstractReductionRecipe(Red, Ctx, Range); + } + } +} + void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { if (Plan.hasScalarVFOnly()) return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1200e8eaab0ba..582115182f43c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -28,7 +28,8 @@ class PredicatedScalarEvolution; class TargetLibraryInfo; class VPBuilder; class VPRecipeBuilder; -class VFRange; +struct VPCostContext; +struct VFRange; extern cl::opt VerifyEachVPlan; @@ -181,6 +182,13 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// This function converts initial recipes to the abstract recipes and clamps + /// \p Range based on cost model for following optimizations and cost + /// estimations. The converted abstract recipes will lower to concrete + /// recipes before codegen. + static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 638156eab7a84..64065edd315f9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -339,6 +339,8 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPMulAccumulateReductionSC, + VPExtendedReductionSC, VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll index 0efdf077dca66..4208773e94734 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -12,14 +12,14 @@ target triple="aarch64-unknown-linux-gnu" ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict32' ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2: -; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07 +; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) ; CHECK-VSCALE2: Cost of 8 for VF vscale x 4: -; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07 +; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict32' ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2: -; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07 +; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) ; CHECK-VSCALE1: Cost of 4 for VF vscale x 4: -; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07 +; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 { entry: @@ -42,10 +42,10 @@ 
for.end: ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict64' ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2: -; CHECK-VSCALE2: in-loop reduction %add = fadd double %0, %sum.07 +; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict64' ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2: -; CHECK-VSCALE1: in-loop reduction %add = fadd double %0, %sum.07 +; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>) define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 { entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index 2078a10d04ce7..ce3b2a9f216f2 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -23,11 +23,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) @@ -105,11 +105,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 
a11cc15a8a85b..759c296a3193d 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -646,12 +646,11 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -726,12 +725,11 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -802,11 +800,11 @@ define i32 @mla_i32_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], 
[[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Y1:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] @@ -855,10 +853,10 @@ define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -910,10 +908,10 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) @@ -963,11 +961,11 @@ define signext i16 @mla_i16_i16(ptr nocapture readonly %x, ptr nocapture readonl ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Y1:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP7]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]] @@ -1016,10 +1014,10 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]]) @@ -1069,11 +1067,11 @@ define zeroext i8 @mla_i8_i8(ptr nocapture readonly %x, ptr nocapture readonly % ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Y1:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> 
[[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i8 [[TMP4]], [[VEC_PHI]] @@ -1122,10 +1120,10 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -1183,11 +1181,11 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[B1:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP11]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[TMP4]] to <4 x i64> @@ -1206,10 +1204,10 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[I_011]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]] ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B1]], i32 [[I_011]] ; CHECK-NEXT: 
[[TMP10:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[TMP10]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] @@ -1268,12 +1266,12 @@ define i32 @red_mla_u8_s8_u32(ptr noalias nocapture readonly %A, ptr noalias noc ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP0]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B1:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP9]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD2]] to <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) @@ -1410,8 +1408,8 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] @@ -1459,9 +1457,8 @@ define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; 
CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -1528,12 +1525,12 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -1667,24 +1664,55 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3:%.*]], [[ADD:%.*]]
+; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]]
; CHECK-NEXT: ret i64 [[DIV]]
; CHECK: for.body:
-; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY1]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD3]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X:%.*]], i32 [[I_013]]
+; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64
-; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_014]], [[CONV]]
+; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]]
-; CHECK-NEXT: [[ADD3]] = add nuw nsw i64 [[MUL]], [[T_012]]
+; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]]
; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -1720,10 +1748,10 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
@@ -1731,28 +1759,26 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[TMP11]] to <4 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <8 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP12]])
; CHECK-NEXT: [[TMP16]] = add i64 [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
@@ -1787,7 +1813,7 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]]
; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]]
;
entry:
%cmp23 = icmp sgt i32 %n, 0
diff --git
a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index 05f26b8a0a273..d98d6a02a8124 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -424,62 +424,62 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B1:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP22]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK: pred.load.if5: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i64 2 ; CHECK-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP32]], i64 2
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP24]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP33]], [[PRED_LOAD_IF5]] ]
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
; CHECK: pred.load.if7:
; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i64 3
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]]
; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP31]]
+; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP48]], i64 3
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]]
; CHECK: pred.load.continue8:
-; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
+; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP49]], [[PRED_LOAD_IF7]] ]
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]])
; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP50]], [[TMP39]]
; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]])
; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index f6a1ebf8b0fe9..3df34cffebbf4 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -221,13 +221,13 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B1:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]]) ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -329,11 +329,11 @@ define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocap ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 120, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COEFF:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COEFF1:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1289,15 +1289,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31) ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3) ; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31) ; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index d0d276ab967fa..9630590cf83b7 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -268,3 +268,148 @@ loop: exit: ret i64 %cond } + +define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_extended_reduction' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]> +; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv + %load0 = load i32, ptr %arrayidx, align 4 + %conv0 = zext i32 %load0 to i64 + %rdx.next = add nsw i64 %rdx, %conv0 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>) +; CHECK-NEXT: EMIT 
vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %x, i32 %iv + %load0 = load i64, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %iv + %load1 = load i64, ptr %arrayidx1, align 4 + %mul = mul nsw i64 %load0, %load1 + %rdx.next = add nsw i64 %rdx, %mul + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc_extended' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv + %load0 = load i16, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv + %load1 = load i16, ptr %arrayidx1, align 4 + %conv0 = sext i16 %load0 to i32 + %conv1 = sext i16 %load1 to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv = sext i32 %mul to i64 + %rdx.next = add nsw i64 %rdx, %conv + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +}