Skip to content

Commit 6c608fe

Browse files
[AArch64] Optimise test of the LSB of a paired whileCC instruction
1 parent b65dbdf commit 6c608fe

File tree

4 files changed

+89
-69
lines changed

4 files changed

+89
-69
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+49-26
Original file line numberDiff line numberDiff line change
@@ -18082,22 +18082,49 @@ static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
1808218082
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
1808318083
AArch64CC::CondCode Cond);
1808418084

18085-
static bool isPredicateCCSettingOp(SDValue N) {
18086-
if ((N.getOpcode() == ISD::SETCC) ||
18087-
(N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18088-
(N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18089-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18090-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18091-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18092-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18093-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18094-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18095-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18096-
// get_active_lane_mask is lowered to a whilelo instruction.
18097-
N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18098-
return true;
18085+
static SDValue getPredicateCCSettingOp(SDValue N) {
18086+
if (N.getOpcode() == ISD::SETCC) {
18087+
EVT VT = N.getValueType();
18088+
return VT.isScalableVector() && VT.getVectorElementType() == MVT::i1
18089+
? N
18090+
: SDValue();
18091+
}
1809918092

18100-
return false;
18093+
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18094+
isNullConstant(N.getOperand(1)))
18095+
N = N.getOperand(0);
18096+
18097+
if (N.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18098+
return SDValue();
18099+
18100+
switch (N.getConstantOperandVal(0)) {
18101+
default:
18102+
return SDValue();
18103+
case Intrinsic::aarch64_sve_whilege_x2:
18104+
case Intrinsic::aarch64_sve_whilegt_x2:
18105+
case Intrinsic::aarch64_sve_whilehi_x2:
18106+
case Intrinsic::aarch64_sve_whilehs_x2:
18107+
case Intrinsic::aarch64_sve_whilele_x2:
18108+
case Intrinsic::aarch64_sve_whilelo_x2:
18109+
case Intrinsic::aarch64_sve_whilels_x2:
18110+
case Intrinsic::aarch64_sve_whilelt_x2:
18111+
if (N.getResNo() != 0)
18112+
return SDValue();
18113+
[[fallthrough]];
18114+
case Intrinsic::aarch64_sve_whilege:
18115+
case Intrinsic::aarch64_sve_whilegt:
18116+
case Intrinsic::aarch64_sve_whilehi:
18117+
case Intrinsic::aarch64_sve_whilehs:
18118+
case Intrinsic::aarch64_sve_whilele:
18119+
case Intrinsic::aarch64_sve_whilelo:
18120+
case Intrinsic::aarch64_sve_whilels:
18121+
case Intrinsic::aarch64_sve_whilelt:
18122+
case Intrinsic::get_active_lane_mask:
18123+
assert(N.getValueType().isScalableVector() &&
18124+
N.getValueType().getVectorElementType() == MVT::i1 &&
18125+
"Intrinsic expected to yield scalable i1 vector");
18126+
return N;
18127+
}
1810118128
}
1810218129

1810318130
// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
@@ -18111,21 +18138,17 @@ performFirstTrueTestVectorCombine(SDNode *N,
1811118138
if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
1811218139
return SDValue();
1811318140

18114-
SDValue N0 = N->getOperand(0);
18115-
EVT VT = N0.getValueType();
18116-
18117-
if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18118-
!isNullConstant(N->getOperand(1)))
18119-
return SDValue();
18120-
18121-
// Restricted the DAG combine to only cases where we're extracting from a
18122-
// flag-setting operation.
18123-
if (!isPredicateCCSettingOp(N0))
18141+
// Restrict the DAG combine to only cases where we're extracting the zero-th
18142+
// element from the result of a flag-setting operation.
18143+
SDValue N0;
18144+
if (!isNullConstant(N->getOperand(1)) ||
18145+
!(N0 = getPredicateCCSettingOp(N->getOperand(0))))
1812418146
return SDValue();
1812518147

1812618148
// Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
1812718149
SelectionDAG &DAG = DCI.DAG;
18128-
SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18150+
SDValue Pg =
18151+
getPTrue(DAG, SDLoc(N), N0.getValueType(), AArch64SVEPredPattern::all);
1812918152
return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
1813018153
}
1813118154

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

+13-2
Original file line numberDiff line numberDiff line change
@@ -1358,11 +1358,22 @@ bool AArch64InstrInfo::optimizePTestInstr(
13581358
const MachineRegisterInfo *MRI) const {
13591359
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
13601360
auto *Pred = MRI->getUniqueVRegDef(PredReg);
1361-
auto NewOp = Pred->getOpcode();
1361+
unsigned NewOp;
13621362
bool OpChanged = false;
13631363

13641364
unsigned MaskOpcode = Mask->getOpcode();
13651365
unsigned PredOpcode = Pred->getOpcode();
1366+
1367+
// Handle a COPY of the low predicate (psub0) of a paired WHILEcc instruction.
1368+
if ((PredOpcode == TargetOpcode::COPY &&
1369+
Pred->getOperand(1).getSubReg() == AArch64::psub0)) {
1370+
MachineInstr *MI = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1371+
if (MI && isWhileOpcode(MI->getOpcode())) {
1372+
Pred = MI;
1373+
PredOpcode = MI->getOpcode();
1374+
}
1375+
}
1376+
13661377
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
13671378
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
13681379

@@ -1478,9 +1489,9 @@ bool AArch64InstrInfo::optimizePTestInstr(
14781489
// as they are prior to PTEST. Sometimes this requires the tested PTEST
14791490
// operand to be replaced with an equivalent instruction that also sets the
14801491
// flags.
1481-
Pred->setDesc(get(NewOp));
14821492
PTest->eraseFromParent();
14831493
if (OpChanged) {
1494+
Pred->setDesc(get(NewOp));
14841495
bool succeeded = UpdateOperandRegClass(*Pred);
14851496
(void)succeeded;
14861497
assert(succeeded && "Operands have incompatible register classes!");

llvm/lib/Target/AArch64/SVEInstrFormats.td

+7-5
Original file line numberDiff line numberDiff line change
@@ -9764,7 +9764,7 @@ multiclass sve2p1_int_while_rr_pn<string mnemonic, bits<3> opc> {
97649764

97659765
// SVE integer compare scalar count and limit (predicate pair)
97669766
class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
9767-
RegisterOperand ppr_ty>
9767+
RegisterOperand ppr_ty, ElementSizeEnum EltSz>
97689768
: I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
97699769
mnemonic, "\t$Pd, $Rn, $Rm",
97709770
"", []>, Sched<[]> {
@@ -9782,16 +9782,18 @@ class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
97829782
let Inst{3-1} = Pd;
97839783
let Inst{0} = opc{0};
97849784

9785+
let ElementSize = EltSz;
97859786
let Defs = [NZCV];
97869787
let hasSideEffects = 0;
9788+
let isWhile = 1;
97879789
}
97889790

97899791

97909792
multiclass sve2p1_int_while_rr_pair<string mnemonic, bits<3> opc> {
9791-
def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r>;
9792-
def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r>;
9793-
def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r>;
9794-
def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r>;
9793+
def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r, ElementSizeB>;
9794+
def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r, ElementSizeH>;
9795+
def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r, ElementSizeS>;
9796+
def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r, ElementSizeD>;
97959797
}
97969798

97979799

llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll

+20-36
Original file line numberDiff line numberDiff line change
@@ -351,9 +351,7 @@ define void @f_h_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
351351
; CHECK-SVE-NEXT: whilelo p1.b, x8, x9
352352
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
353353
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
354-
; CHECK-SVE-NEXT: mov z0.h, p1/z, #1 // =0x1
355-
; CHECK-SVE-NEXT: fmov w13, s0
356-
; CHECK-SVE-NEXT: tbnz w13, #0, .LBB4_2
354+
; CHECK-SVE-NEXT: b.mi .LBB4_2
357355
; CHECK-SVE-NEXT: .LBB4_3: // %for.cond.cleanup
358356
; CHECK-SVE-NEXT: ret
359357
;
@@ -378,9 +376,7 @@ define void @f_h_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
378376
; CHECK-SVE2p1-NEXT: st1h { z1.h }, p1, [x10, x8, lsl #1]
379377
; CHECK-SVE2p1-NEXT: addvl x8, x8, #1
380378
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x8, x9
381-
; CHECK-SVE2p1-NEXT: mov z0.h, p0/z, #1 // =0x1
382-
; CHECK-SVE2p1-NEXT: fmov w12, s0
383-
; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB4_2
379+
; CHECK-SVE2p1-NEXT: b.mi .LBB4_2
384380
; CHECK-SVE2p1-NEXT: .LBB4_3: // %for.cond.cleanup
385381
; CHECK-SVE2p1-NEXT: ret
386382
entry:
@@ -456,9 +452,7 @@ define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
456452
; CHECK-SVE-NEXT: whilelo p1.s, x8, x9
457453
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
458454
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
459-
; CHECK-SVE-NEXT: mov z1.d, p1/z, #1 // =0x1
460-
; CHECK-SVE-NEXT: fmov x13, d1
461-
; CHECK-SVE-NEXT: tbnz w13, #0, .LBB5_2
455+
; CHECK-SVE-NEXT: b.mi .LBB5_2
462456
; CHECK-SVE-NEXT: .LBB5_3: // %for.cond.cleanup
463457
; CHECK-SVE-NEXT: ret
464458
;
@@ -484,9 +478,7 @@ define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
484478
; CHECK-SVE2p1-NEXT: st1d { z2.d }, p1, [x10, x8, lsl #3]
485479
; CHECK-SVE2p1-NEXT: incw x8
486480
; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x8, x9
487-
; CHECK-SVE2p1-NEXT: mov z1.d, p0/z, #1 // =0x1
488-
; CHECK-SVE2p1-NEXT: fmov x12, d1
489-
; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB5_2
481+
; CHECK-SVE2p1-NEXT: b.mi .LBB5_2
490482
; CHECK-SVE2p1-NEXT: .LBB5_3: // %for.cond.cleanup
491483
; CHECK-SVE2p1-NEXT: ret
492484
entry:
@@ -752,15 +744,13 @@ define void @f_h_4(ptr noalias %dst, ptr %src, i32 %n) #0 {
752744
; CHECK-SVE-NEXT: st1h { z2.h }, p1, [x13, x10, lsl #1]
753745
; CHECK-SVE-NEXT: st1h { z3.h }, p0, [x12, x10, lsl #1]
754746
; CHECK-SVE-NEXT: add x10, x9, x10
755-
; CHECK-SVE-NEXT: whilelo p1.b, x18, x8
747+
; CHECK-SVE-NEXT: whilelo p2.b, x18, x8
756748
; CHECK-SVE-NEXT: whilelo p3.b, x10, x8
757-
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
758-
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
749+
; CHECK-SVE-NEXT: punpkhi p0.h, p2.b
750+
; CHECK-SVE-NEXT: punpklo p1.h, p2.b
759751
; CHECK-SVE-NEXT: punpkhi p2.h, p3.b
760752
; CHECK-SVE-NEXT: punpklo p3.h, p3.b
761-
; CHECK-SVE-NEXT: mov z0.h, p3/z, #1 // =0x1
762-
; CHECK-SVE-NEXT: fmov w18, s0
763-
; CHECK-SVE-NEXT: tbnz w18, #0, .LBB7_2
753+
; CHECK-SVE-NEXT: b.mi .LBB7_2
764754
; CHECK-SVE-NEXT: .LBB7_3: // %for.cond.cleanup
765755
; CHECK-SVE-NEXT: ret
766756
;
@@ -799,9 +789,7 @@ define void @f_h_4(ptr noalias %dst, ptr %src, i32 %n) #0 {
799789
; CHECK-SVE2p1-NEXT: addvl x9, x9, #2
800790
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x16, x8
801791
; CHECK-SVE2p1-NEXT: whilelo { p2.h, p3.h }, x9, x8
802-
; CHECK-SVE2p1-NEXT: mov z0.h, p2/z, #1 // =0x1
803-
; CHECK-SVE2p1-NEXT: fmov w16, s0
804-
; CHECK-SVE2p1-NEXT: tbnz w16, #0, .LBB7_2
792+
; CHECK-SVE2p1-NEXT: b.mi .LBB7_2
805793
; CHECK-SVE2p1-NEXT: .LBB7_3: // %for.cond.cleanup
806794
; CHECK-SVE2p1-NEXT: ret
807795
entry:
@@ -918,21 +906,19 @@ define void @f_d_4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
918906
; CHECK-SVE-NEXT: ld1d { z4.d }, p0/z, [x13, x9, lsl #3]
919907
; CHECK-SVE-NEXT: fmul z2.d, z2.d, z0.d
920908
; CHECK-SVE-NEXT: fmul z3.d, z3.d, z0.d
921-
; CHECK-SVE-NEXT: fmul z4.d, z4.d, z0.d
922909
; CHECK-SVE-NEXT: st1d { z1.d }, p3, [x0, x9, lsl #3]
910+
; CHECK-SVE-NEXT: fmul z1.d, z4.d, z0.d
923911
; CHECK-SVE-NEXT: st1d { z2.d }, p2, [x12, x9, lsl #3]
924912
; CHECK-SVE-NEXT: st1d { z3.d }, p1, [x11, x9, lsl #3]
925-
; CHECK-SVE-NEXT: st1d { z4.d }, p0, [x10, x9, lsl #3]
913+
; CHECK-SVE-NEXT: whilelo p3.s, x18, x8
914+
; CHECK-SVE-NEXT: st1d { z1.d }, p0, [x10, x9, lsl #3]
926915
; CHECK-SVE-NEXT: add x9, x16, x9
927-
; CHECK-SVE-NEXT: whilelo p1.s, x18, x8
928-
; CHECK-SVE-NEXT: whilelo p3.s, x9, x8
929-
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
930-
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
931-
; CHECK-SVE-NEXT: punpkhi p2.h, p3.b
932-
; CHECK-SVE-NEXT: punpklo p3.h, p3.b
933-
; CHECK-SVE-NEXT: mov z1.d, p3/z, #1 // =0x1
934-
; CHECK-SVE-NEXT: fmov x18, d1
935-
; CHECK-SVE-NEXT: tbnz w18, #0, .LBB8_2
916+
; CHECK-SVE-NEXT: punpkhi p0.h, p3.b
917+
; CHECK-SVE-NEXT: punpklo p1.h, p3.b
918+
; CHECK-SVE-NEXT: whilelo p4.s, x9, x8
919+
; CHECK-SVE-NEXT: punpkhi p2.h, p4.b
920+
; CHECK-SVE-NEXT: punpklo p3.h, p4.b
921+
; CHECK-SVE-NEXT: b.mi .LBB8_2
936922
; CHECK-SVE-NEXT: .LBB8_3: // %for.cond.cleanup
937923
; CHECK-SVE-NEXT: ret
938924
;
@@ -968,15 +954,13 @@ define void @f_d_4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
968954
; CHECK-SVE2p1-NEXT: fmul z4.d, z4.d, z0.d
969955
; CHECK-SVE2p1-NEXT: st1d { z1.d }, p2, [x0, x9, lsl #3]
970956
; CHECK-SVE2p1-NEXT: st1d { z2.d }, p3, [x12, x9, lsl #3]
971-
; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, x15, x8
972-
; CHECK-SVE2p1-NEXT: mov z1.d, p2/z, #1 // =0x1
973957
; CHECK-SVE2p1-NEXT: st1d { z3.d }, p0, [x11, x9, lsl #3]
974958
; CHECK-SVE2p1-NEXT: st1d { z4.d }, p1, [x10, x9, lsl #3]
975959
; CHECK-SVE2p1-NEXT: incw x9, all, mul #3
976960
; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x9, x8
961+
; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, x15, x8
977962
; CHECK-SVE2p1-NEXT: mov x9, x15
978-
; CHECK-SVE2p1-NEXT: fmov x17, d1
979-
; CHECK-SVE2p1-NEXT: tbnz w17, #0, .LBB8_2
963+
; CHECK-SVE2p1-NEXT: b.mi .LBB8_2
980964
; CHECK-SVE2p1-NEXT: .LBB8_3: // %for.cond.cleanup
981965
; CHECK-SVE2p1-NEXT: ret
982966
entry:

0 commit comments

Comments
 (0)