[AArch64] Optimise test of the LSB of a paired whileCC instruction #81141
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms

Author: Momchil Velikov (momchil-velikov)

Changes: Try to directly use the flags set by a `whileCC` instruction.

Patch is 503.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81141.diff

25 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c..67e1b45cce29c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1228,6 +1228,8 @@ class TargetTransformInfo {
/// and the number of execution units in the CPU.
unsigned getMaxInterleaveFactor(ElementCount VF) const;
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
static OperandValueInfo getOperandInfo(const Value *V);
@@ -1981,6 +1983,9 @@ class TargetTransformInfo::Concept {
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
+
+ virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
+
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2601,6 +2606,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
+
+ ElementCount getMaxPredicateLength(ElementCount VF) const override {
+ return Impl.getMaxPredicateLength(VF);
+ }
+
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JTSize,
ProfileSummaryInfo *PSI,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3d5db96e86b80..b6d01e0764ab1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -528,6 +528,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index bb17298daba03..2b0d0f3ed6f70 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -881,6 +881,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620..daea8e48981ec 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -808,6 +808,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
+ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
+ return TTIImpl->getMaxPredicateLength(VF);
+}
+
TargetTransformInfo::OperandValueInfo
TargetTransformInfo::getOperandInfo(const Value *V) {
OperandValueKind OpInfo = OK_AnyValue;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8573939b04389..7d40721b24fcc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1813,8 +1813,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
EVT OpVT) const {
- // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
- if (!Subtarget->hasSVE())
+ // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
+ if (!Subtarget->hasSVEorSME())
return true;
// We can only support legal predicate result types. We can use the SVE
@@ -18032,22 +18032,49 @@ static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
AArch64CC::CondCode Cond);
-static bool isPredicateCCSettingOp(SDValue N) {
- if ((N.getOpcode() == ISD::SETCC) ||
- (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
- N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
- // get_active_lane_mask is lowered to a whilelo instruction.
- N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
- return true;
+static SDValue getPredicateCCSettingOp(SDValue N) {
+ if (N.getOpcode() == ISD::SETCC) {
+ EVT VT = N.getValueType();
+ return VT.isScalableVector() && VT.getVectorElementType() == MVT::i1
+ ? N
+ : SDValue();
+ }
- return false;
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isNullConstant(N.getOperand(1)))
+ N = N.getOperand(0);
+
+ if (N.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+ return SDValue();
+
+ switch (N.getConstantOperandVal(0)) {
+ default:
+ return SDValue();
+ case Intrinsic::aarch64_sve_whilege_x2:
+ case Intrinsic::aarch64_sve_whilegt_x2:
+ case Intrinsic::aarch64_sve_whilehi_x2:
+ case Intrinsic::aarch64_sve_whilehs_x2:
+ case Intrinsic::aarch64_sve_whilele_x2:
+ case Intrinsic::aarch64_sve_whilelo_x2:
+ case Intrinsic::aarch64_sve_whilels_x2:
+ case Intrinsic::aarch64_sve_whilelt_x2:
+ if (N.getResNo() != 0)
+ return SDValue();
+ [[fallthrough]];
+ case Intrinsic::aarch64_sve_whilege:
+ case Intrinsic::aarch64_sve_whilegt:
+ case Intrinsic::aarch64_sve_whilehi:
+ case Intrinsic::aarch64_sve_whilehs:
+ case Intrinsic::aarch64_sve_whilele:
+ case Intrinsic::aarch64_sve_whilelo:
+ case Intrinsic::aarch64_sve_whilels:
+ case Intrinsic::aarch64_sve_whilelt:
+ case Intrinsic::get_active_lane_mask:
+ assert(N.getValueType().isScalableVector() &&
+ N.getValueType().getVectorElementType() == MVT::i1 &&
+ "Intrinsic expected to yield scalable i1 vector");
+ return N;
+ }
}
// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
@@ -18061,21 +18088,17 @@ performFirstTrueTestVectorCombine(SDNode *N,
if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
return SDValue();
- SDValue N0 = N->getOperand(0);
- EVT VT = N0.getValueType();
-
- if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
- !isNullConstant(N->getOperand(1)))
- return SDValue();
-
- // Restricted the DAG combine to only cases where we're extracting from a
- // flag-setting operation.
- if (!isPredicateCCSettingOp(N0))
+ // Restrict the DAG combine to only cases where we're extracting the zero-th
+ // element from the result of a flag-setting operation.
+ SDValue N0;
+ if (!isNullConstant(N->getOperand(1)) ||
+ !(N0 = getPredicateCCSettingOp(N->getOperand(0))))
return SDValue();
// Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
SelectionDAG &DAG = DCI.DAG;
- SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
+ SDValue Pg =
+ getPTrue(DAG, SDLoc(N), N0.getValueType(), AArch64SVEPredPattern::all);
return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
}
@@ -20004,47 +20027,98 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
return SDValue();
}
-static SDValue performIntrinsicCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
+static SDValue tryCombineGetActiveLaneMask(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- unsigned IID = getIntrinsicID(N);
- switch (IID) {
- default:
- break;
- case Intrinsic::get_active_lane_mask: {
- SDValue Res = SDValue();
- EVT VT = N->getValueType(0);
- if (VT.isFixedLengthVector()) {
- // We can use the SVE whilelo instruction to lower this intrinsic by
- // creating the appropriate sequence of scalable vector operations and
- // then extracting a fixed-width subvector from the scalable vector.
+ EVT VT = N->getValueType(0);
+ if (VT.isFixedLengthVector()) {
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
- SDLoc DL(N);
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+ EVT WhileVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ ElementCount::getScalable(VT.getVectorNumElements()));
- EVT WhileVT = EVT::getVectorVT(
- *DAG.getContext(), MVT::i1,
- ElementCount::getScalable(VT.getVectorNumElements()));
+ // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+ EVT PromVT = getPromotedVTForPredicate(WhileVT);
- // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
- EVT PromVT = getPromotedVTForPredicate(WhileVT);
+ // Get the fixed-width equivalent of PromVT for extraction.
+ EVT ExtVT =
+ EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+ VT.getVectorElementCount());
- // Get the fixed-width equivalent of PromVT for extraction.
- EVT ExtVT =
- EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
- VT.getVectorElementCount());
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+ N->getOperand(1), N->getOperand(2));
+ Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
- N->getOperand(1), N->getOperand(2));
- Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
- DAG.getConstant(0, DL, MVT::i64));
- Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- }
return Res;
}
+
+ if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+ return SDValue();
+
+ if (!N->hasNUsesOfValue(2, 0))
+ return SDValue();
+
+ auto It = N->use_begin();
+ SDNode *Lo = *It++;
+ SDNode *Hi = *It;
+
+ const uint64_t HalfSize = VT.getVectorMinNumElements() / 2;
+ uint64_t OffLo, OffHi;
+ if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
+ (OffLo != 0 && OffLo != HalfSize) ||
+ Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Hi->getOperand(1).getNode(), OffHi) ||
+ (OffHi != 0 && OffHi != HalfSize))
+ return SDValue();
+
+ if (OffLo > OffHi) {
+ std::swap(Lo, Hi);
+ std::swap(OffLo, OffHi);
+ }
+
+ if (OffLo != 0 || OffHi != HalfSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+ SDValue Idx = N->getOperand(1);
+ SDValue TC = N->getOperand(2);
+ if (Idx.getValueType() != MVT::i64) {
+ Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+ TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+ }
+ auto R =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+ {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+ DCI.CombineTo(Lo, R.getValue(0));
+ DCI.CombineTo(Hi, R.getValue(1));
+
+ return SDValue(N, 0);
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::get_active_lane_mask:
+ return tryCombineGetActiveLaneMask(N, DCI, Subtarget);
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9add7d87017a7..e2068b2d88ec9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1358,11 +1358,22 @@ bool AArch64InstrInfo::optimizePTestInstr(
const MachineRegisterInfo *MRI) const {
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
auto *Pred = MRI->getUniqueVRegDef(PredReg);
- auto NewOp = Pred->getOpcode();
+ unsigned NewOp;
bool OpChanged = false;
unsigned MaskOpcode = Mask->getOpcode();
unsigned PredOpcode = Pred->getOpcode();
+
+ // Handle a COPY from the LSB of a paired WHILEcc instruction.
+ if ((PredOpcode == TargetOpcode::COPY &&
+ Pred->getOperand(1).getSubReg() == AArch64::psub0)) {
+ MachineInstr *MI = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (MI && isWhileOpcode(MI->getOpcode())) {
+ Pred = MI;
+ PredOpcode = MI->getOpcode();
+ }
+ }
+
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
@@ -1478,9 +1489,9 @@ bool AArch64InstrInfo::optimizePTestInstr(
// as they are prior to PTEST. Sometimes this requires the tested PTEST
// operand to be replaced with an equivalent instruction that also sets the
// flags.
- Pred->setDesc(get(NewOp));
PTest->eraseFromParent();
if (OpChanged) {
+ Pred->setDesc(get(NewOp));
bool succeeded = UpdateOperandRegClass(*Pred);
(void)succeeded;
assert(succeeded && "Operands have incompatible register classes!");
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c..73aca77305df1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3285,6 +3285,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}
+ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
+ // Do not create masks bigger than `<vscale x 16 x i1>`.
+ unsigned N = ST->hasSVE() ? 16 : 0;
+ // Do not create masks that are more than twice the VF.
+ N = std::min(N, 2 * VF.getKnownMinValue());
+ return VF.isScalable() ? ElementCount::getScalable(N)
+ : ElementCount::getFixed(N);
+}
+
// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e..6501cc4a85e8d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
unsigned getMaxInterleaveFactor(ElementCount VF);
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
bool prefersVectorizedAddressing() const;
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 789ec817d3d8b..718b245c6d829 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9754,7 +9754,7 @@ multiclass sve2p1_int_while_rr_pn<string mnemonic, bits<3> opc> {
// SVE integer compare scalar count and limit (predicate pair)
class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
- RegisterOperand ppr_ty>
+ RegisterOperand ppr_ty, ElementSizeEnum EltSz>
: I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
mnemonic, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
@@ -9772,16 +9772,18 @@ class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
let Inst{3-1} = Pd;
let Inst{0} = opc{0};
+ let ElementSize = EltSz;
let Defs = [NZCV];
let hasSideEffects = 0;
+ let isWhile = 1;
}
multiclass sve2p1_int_while_rr_pair<string mnemonic, bits<3> opc> {
- def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r>;
- def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r>;
- def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r>;
- def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r>;
+ def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r, ElementSizeB>;
+ def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r, ElementSizeH>;
+ def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r, ElementSizeS>;
+ def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r, ElementSizeD>;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78e54ceb..0e681c8080bfd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,14 @@ class VPBuilder {
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = {}, const Twine &Name = "");
+ VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
+ const Twine &Name = "") {
+ auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
+ if (BB)
+ BB->insert(ALM, InsertPt);
+ return ALM;
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1a7b301c35f2b..bac66e633a6f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -599,6 +599,10 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const {
+ return TTI->getMaxPredicateLength(VF);
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -7550,7 +7554,8 @@ LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
- VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+ VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
+ DT, ILV.Builder, &ILV, &BestVPlan,
OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader...
[truncated]
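Worth noting from the truncated diff above: the new `AArch64TTIImpl::getMaxPredicateLength` hook caps how wide a predicate mask the vectoriser may create. A hand-traced reading of it (a sketch, not compiler output; `TTI` is assumed to be an in-scope `TargetTransformInfo`):

    // Hand-traced from the implementation in the diff above:
    ElementCount VF = ElementCount::getScalable(8);        // vectorising at <vscale x 8 x ...>
    ElementCount MaxPred = TTI.getMaxPredicateLength(VF);  // with SVE: min(16, 2 * 8) = 16
    // MaxPred == vscale x 16, i.e. the planner may build one <vscale x 16 x i1>
    // mask covering two interleaved <vscale x 8 x ...> parts.
    // With a fixed VF of 4 and SVE it returns a fixed 8 (min(16, 2 * 4));
    // without SVE, N = 0 and no wide predicate is formed.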
✅ With the latest revision this PR passed the C/C++ code formatter.
This patch refactors `AArch64InstrInfo::optimizePTestInstr` to simplify the convoluted conditions and control flow and make it easier to add the optimisation in #81141
[AArch64] Optimise test of the LSB of a paired whileCC instruction
Change-Id: I5058e24c631ede0a04399b39e5096f898fa8f792
Change-Id: Iefc0eb7e4b90715ae08c154dde5bda1091f9de07
E:
  %wide.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i32 %i, i32 %n)
  %mask = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %wide.mask, i64 0)
  %elt = extractelement <vscale x 8 x i1> %mask, i64 0
If we reduce the whole `%wide.mask` into an `i1` and branch based on that, we already seem to fold away the `ptest`, see example. What is the use-case for extracting the first element as opposed to reducing the whole vector? (The case of folding away the ptest when reducing the partial vector is not yet handled.)
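For reference, the reduction this comment alludes to would look roughly like the following (a hedged sketch reusing the names from the quoted snippet, not actual vectoriser output):

    %any = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %wide.mask)
    br i1 %any, label %vector.body, label %exit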
The use case is a loop that tests the LSB of a lane mask to decide whether to go for the next iteration: #81140.
Isn't the use case for interleaving in the vectoriser when using tail-folding? Suppose your vectorisation factor is vscale x 8, but you want to interleave as well. Then you'd create a double-sized mask (vscale x 16) that still has efficient lowering, and the vectoriser extracts the low half of that and uses that for control flow?
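A rough sketch of that loop shape (hand-written; block and value names are illustrative, not actual vectoriser output):

    vector.body:
      %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
      ; One double-width mask serves both interleaved parts.
      %wide = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index, i64 %n)
      %m0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %wide, i64 0)
      %m1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %wide, i64 8)
      ; ... masked operations predicated on %m0 and %m1 ...
      ; Control flow tests only lane 0 of the low half (in practice the
      ; mask computed for the next iteration).
      %cont = extractelement <vscale x 8 x i1> %m0, i64 0
      br i1 %cont, label %vector.body, label %exit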
> The use case is a loop that tests the LSB of a lane mask to decide whether to go for the next iteration

Is there a reason that it requires testing the first element for this, as opposed to testing the whole vector?
Yes. The reason is that this is what the vectoriser emits.

Even if the vectoriser emitted a reduction, it does not seem the `ptest` is folded: https://gcc.godbolt.org/z/sxGhjjnfh

Also, the reduction might be incorrect if the mask was produced by something other than `get_active_lane_mask`.
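To illustrate that last point (a hand-written example): for a mask not produced by `get_active_lane_mask`, "any lane active" and "lane 0 active" genuinely differ, so an or-reduction is not a substitute for the lane-0 test.

    %m   = icmp eq <vscale x 4 x i32> %a, %b   ; could be, say, <0, 1, 0, 0, ...>
    %any = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> %m)   ; true
    %lsb = extractelement <vscale x 4 x i1> %m, i64 0                    ; false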
Try to directly use the flags set by a `whileCC` instruction.
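As a rough illustration of the overall effect (a hand-written sketch, not taken from the patch's tests; the paired-`whilelo` intrinsic shape here is my assumption of the relevant form), IR such as:

    %masks = call { <vscale x 16 x i1>, <vscale x 16 x i1> }
        @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 %i, i64 %n)
    %lo  = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %masks, 0
    %bit = extractelement <vscale x 16 x i1> %lo, i64 0
    br i1 %bit, label %loop, label %exit

previously needed an extra PTRUE/PTEST to materialise the branch condition; with this patch the branch can consume the NZCV flags the WHILELO pair already sets:

    // Before (sketch):
    whilelo { p0.b, p1.b }, x0, x1    // already sets NZCV
    ptrue   p2.b
    ptest   p2, p0.b                  // redundant re-test of the flags
    b.first .Lloop                    // b.first is an alias of b.mi

    // After (sketch):
    whilelo { p0.b, p1.b }, x0, x1
    b.first .Lloop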