Skip to content

Commit 7b9549f

Browse files
[AArch64] Optimise test of the LSB of a paired whileCC instruction
1 parent 39f1b89 commit 7b9549f

File tree

4 files changed

+89
-69
lines changed

4 files changed

+89
-69
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18032,22 +18032,49 @@ static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
1803218032
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
1803318033
AArch64CC::CondCode Cond);
1803418034

18035-
static bool isPredicateCCSettingOp(SDValue N) {
18036-
if ((N.getOpcode() == ISD::SETCC) ||
18037-
(N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18038-
(N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18039-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18040-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18041-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18042-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18043-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18044-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18045-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18046-
// get_active_lane_mask is lowered to a whilelo instruction.
18047-
N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18048-
return true;
18035+
static SDValue getPredicateCCSettingOp(SDValue N) {
18036+
if (N.getOpcode() == ISD::SETCC) {
18037+
EVT VT = N.getValueType();
18038+
return VT.isScalableVector() && VT.getVectorElementType() == MVT::i1
18039+
? N
18040+
: SDValue();
18041+
}
1804918042

18050-
return false;
18043+
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18044+
isNullConstant(N.getOperand(1)))
18045+
N = N.getOperand(0);
18046+
18047+
if (N.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18048+
return SDValue();
18049+
18050+
switch (N.getConstantOperandVal(0)) {
18051+
default:
18052+
return SDValue();
18053+
case Intrinsic::aarch64_sve_whilege_x2:
18054+
case Intrinsic::aarch64_sve_whilegt_x2:
18055+
case Intrinsic::aarch64_sve_whilehi_x2:
18056+
case Intrinsic::aarch64_sve_whilehs_x2:
18057+
case Intrinsic::aarch64_sve_whilele_x2:
18058+
case Intrinsic::aarch64_sve_whilelo_x2:
18059+
case Intrinsic::aarch64_sve_whilels_x2:
18060+
case Intrinsic::aarch64_sve_whilelt_x2:
18061+
if (N.getResNo() != 0)
18062+
return SDValue();
18063+
[[fallthrough]];
18064+
case Intrinsic::aarch64_sve_whilege:
18065+
case Intrinsic::aarch64_sve_whilegt:
18066+
case Intrinsic::aarch64_sve_whilehi:
18067+
case Intrinsic::aarch64_sve_whilehs:
18068+
case Intrinsic::aarch64_sve_whilele:
18069+
case Intrinsic::aarch64_sve_whilelo:
18070+
case Intrinsic::aarch64_sve_whilels:
18071+
case Intrinsic::aarch64_sve_whilelt:
18072+
case Intrinsic::get_active_lane_mask:
18073+
assert(N.getValueType().isScalableVector() &&
18074+
N.getValueType().getVectorElementType() == MVT::i1 &&
18075+
"Intrinsic expected to yield scalable i1 vector");
18076+
return N;
18077+
}
1805118078
}
1805218079

1805318080
// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
@@ -18061,21 +18088,17 @@ performFirstTrueTestVectorCombine(SDNode *N,
1806118088
if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
1806218089
return SDValue();
1806318090

18064-
SDValue N0 = N->getOperand(0);
18065-
EVT VT = N0.getValueType();
18066-
18067-
if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18068-
!isNullConstant(N->getOperand(1)))
18069-
return SDValue();
18070-
18071-
// Restricted the DAG combine to only cases where we're extracting from a
18072-
// flag-setting operation.
18073-
if (!isPredicateCCSettingOp(N0))
18091+
// Restrict the DAG combine to only cases where we're extracting the zero-th
18092+
// element from the result of a flag-setting operation.
18093+
SDValue N0;
18094+
if (!isNullConstant(N->getOperand(1)) ||
18095+
!(N0 = getPredicateCCSettingOp(N->getOperand(0))))
1807418096
return SDValue();
1807518097

1807618098
// Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
1807718099
SelectionDAG &DAG = DCI.DAG;
18078-
SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18100+
SDValue Pg =
18101+
getPTrue(DAG, SDLoc(N), N0.getValueType(), AArch64SVEPredPattern::all);
1807918102
return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
1808018103
}
1808118104

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,11 +1358,22 @@ bool AArch64InstrInfo::optimizePTestInstr(
13581358
const MachineRegisterInfo *MRI) const {
13591359
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
13601360
auto *Pred = MRI->getUniqueVRegDef(PredReg);
1361-
auto NewOp = Pred->getOpcode();
1361+
unsigned NewOp;
13621362
bool OpChanged = false;
13631363

13641364
unsigned MaskOpcode = Mask->getOpcode();
13651365
unsigned PredOpcode = Pred->getOpcode();
1366+
1367+
// Handle a COPY from the LSB of a paired WHILEcc instruction.
1368+
if ((PredOpcode == TargetOpcode::COPY &&
1369+
Pred->getOperand(1).getSubReg() == AArch64::psub0)) {
1370+
MachineInstr *MI = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1371+
if (MI && isWhileOpcode(MI->getOpcode())) {
1372+
Pred = MI;
1373+
PredOpcode = MI->getOpcode();
1374+
}
1375+
}
1376+
13661377
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
13671378
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
13681379

@@ -1478,9 +1489,9 @@ bool AArch64InstrInfo::optimizePTestInstr(
14781489
// as they are prior to PTEST. Sometimes this requires the tested PTEST
14791490
// operand to be replaced with an equivalent instruction that also sets the
14801491
// flags.
1481-
Pred->setDesc(get(NewOp));
14821492
PTest->eraseFromParent();
14831493
if (OpChanged) {
1494+
Pred->setDesc(get(NewOp));
14841495
bool succeeded = UpdateOperandRegClass(*Pred);
14851496
(void)succeeded;
14861497
assert(succeeded && "Operands have incompatible register classes!");

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9754,7 +9754,7 @@ multiclass sve2p1_int_while_rr_pn<string mnemonic, bits<3> opc> {
97549754

97559755
// SVE integer compare scalar count and limit (predicate pair)
97569756
class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
9757-
RegisterOperand ppr_ty>
9757+
RegisterOperand ppr_ty, ElementSizeEnum EltSz>
97589758
: I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
97599759
mnemonic, "\t$Pd, $Rn, $Rm",
97609760
"", []>, Sched<[]> {
@@ -9772,16 +9772,18 @@ class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
97729772
let Inst{3-1} = Pd;
97739773
let Inst{0} = opc{0};
97749774

9775+
let ElementSize = EltSz;
97759776
let Defs = [NZCV];
97769777
let hasSideEffects = 0;
9778+
let isWhile = 1;
97779779
}
97789780

97799781

97809782
multiclass sve2p1_int_while_rr_pair<string mnemonic, bits<3> opc> {
9781-
def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r>;
9782-
def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r>;
9783-
def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r>;
9784-
def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r>;
9783+
def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r, ElementSizeB>;
9784+
def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r, ElementSizeH>;
9785+
def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r, ElementSizeS>;
9786+
def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r, ElementSizeD>;
97859787
}
97869788

97879789

llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -351,9 +351,7 @@ define void @f_h_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
351351
; CHECK-SVE-NEXT: whilelo p1.b, x8, x9
352352
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
353353
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
354-
; CHECK-SVE-NEXT: mov z0.h, p1/z, #1 // =0x1
355-
; CHECK-SVE-NEXT: fmov w13, s0
356-
; CHECK-SVE-NEXT: tbnz w13, #0, .LBB4_2
354+
; CHECK-SVE-NEXT: b.mi .LBB4_2
357355
; CHECK-SVE-NEXT: .LBB4_3: // %for.cond.cleanup
358356
; CHECK-SVE-NEXT: ret
359357
;
@@ -378,9 +376,7 @@ define void @f_h_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
378376
; CHECK-SVE2p1-NEXT: st1h { z1.h }, p1, [x10, x8, lsl #1]
379377
; CHECK-SVE2p1-NEXT: addvl x8, x8, #1
380378
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x8, x9
381-
; CHECK-SVE2p1-NEXT: mov z0.h, p0/z, #1 // =0x1
382-
; CHECK-SVE2p1-NEXT: fmov w12, s0
383-
; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB4_2
379+
; CHECK-SVE2p1-NEXT: b.mi .LBB4_2
384380
; CHECK-SVE2p1-NEXT: .LBB4_3: // %for.cond.cleanup
385381
; CHECK-SVE2p1-NEXT: ret
386382
entry:
@@ -456,9 +452,7 @@ define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
456452
; CHECK-SVE-NEXT: whilelo p1.s, x8, x9
457453
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
458454
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
459-
; CHECK-SVE-NEXT: mov z1.d, p1/z, #1 // =0x1
460-
; CHECK-SVE-NEXT: fmov x13, d1
461-
; CHECK-SVE-NEXT: tbnz w13, #0, .LBB5_2
455+
; CHECK-SVE-NEXT: b.mi .LBB5_2
462456
; CHECK-SVE-NEXT: .LBB5_3: // %for.cond.cleanup
463457
; CHECK-SVE-NEXT: ret
464458
;
@@ -484,9 +478,7 @@ define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
484478
; CHECK-SVE2p1-NEXT: st1d { z2.d }, p1, [x10, x8, lsl #3]
485479
; CHECK-SVE2p1-NEXT: incw x8
486480
; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x8, x9
487-
; CHECK-SVE2p1-NEXT: mov z1.d, p0/z, #1 // =0x1
488-
; CHECK-SVE2p1-NEXT: fmov x12, d1
489-
; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB5_2
481+
; CHECK-SVE2p1-NEXT: b.mi .LBB5_2
490482
; CHECK-SVE2p1-NEXT: .LBB5_3: // %for.cond.cleanup
491483
; CHECK-SVE2p1-NEXT: ret
492484
entry:
@@ -752,15 +744,13 @@ define void @f_h_4(ptr noalias %dst, ptr %src, i32 %n) #0 {
752744
; CHECK-SVE-NEXT: st1h { z2.h }, p1, [x13, x10, lsl #1]
753745
; CHECK-SVE-NEXT: st1h { z3.h }, p0, [x12, x10, lsl #1]
754746
; CHECK-SVE-NEXT: add x10, x9, x10
755-
; CHECK-SVE-NEXT: whilelo p1.b, x18, x8
747+
; CHECK-SVE-NEXT: whilelo p2.b, x18, x8
756748
; CHECK-SVE-NEXT: whilelo p3.b, x10, x8
757-
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
758-
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
749+
; CHECK-SVE-NEXT: punpkhi p0.h, p2.b
750+
; CHECK-SVE-NEXT: punpklo p1.h, p2.b
759751
; CHECK-SVE-NEXT: punpkhi p2.h, p3.b
760752
; CHECK-SVE-NEXT: punpklo p3.h, p3.b
761-
; CHECK-SVE-NEXT: mov z0.h, p3/z, #1 // =0x1
762-
; CHECK-SVE-NEXT: fmov w18, s0
763-
; CHECK-SVE-NEXT: tbnz w18, #0, .LBB7_2
753+
; CHECK-SVE-NEXT: b.mi .LBB7_2
764754
; CHECK-SVE-NEXT: .LBB7_3: // %for.cond.cleanup
765755
; CHECK-SVE-NEXT: ret
766756
;
@@ -799,9 +789,7 @@ define void @f_h_4(ptr noalias %dst, ptr %src, i32 %n) #0 {
799789
; CHECK-SVE2p1-NEXT: addvl x9, x9, #2
800790
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x16, x8
801791
; CHECK-SVE2p1-NEXT: whilelo { p2.h, p3.h }, x9, x8
802-
; CHECK-SVE2p1-NEXT: mov z0.h, p2/z, #1 // =0x1
803-
; CHECK-SVE2p1-NEXT: fmov w16, s0
804-
; CHECK-SVE2p1-NEXT: tbnz w16, #0, .LBB7_2
792+
; CHECK-SVE2p1-NEXT: b.mi .LBB7_2
805793
; CHECK-SVE2p1-NEXT: .LBB7_3: // %for.cond.cleanup
806794
; CHECK-SVE2p1-NEXT: ret
807795
entry:
@@ -918,21 +906,19 @@ define void @f_d_4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
918906
; CHECK-SVE-NEXT: ld1d { z4.d }, p0/z, [x13, x9, lsl #3]
919907
; CHECK-SVE-NEXT: fmul z2.d, z2.d, z0.d
920908
; CHECK-SVE-NEXT: fmul z3.d, z3.d, z0.d
921-
; CHECK-SVE-NEXT: fmul z4.d, z4.d, z0.d
922909
; CHECK-SVE-NEXT: st1d { z1.d }, p3, [x0, x9, lsl #3]
910+
; CHECK-SVE-NEXT: fmul z1.d, z4.d, z0.d
923911
; CHECK-SVE-NEXT: st1d { z2.d }, p2, [x12, x9, lsl #3]
924912
; CHECK-SVE-NEXT: st1d { z3.d }, p1, [x11, x9, lsl #3]
925-
; CHECK-SVE-NEXT: st1d { z4.d }, p0, [x10, x9, lsl #3]
913+
; CHECK-SVE-NEXT: whilelo p3.s, x18, x8
914+
; CHECK-SVE-NEXT: st1d { z1.d }, p0, [x10, x9, lsl #3]
926915
; CHECK-SVE-NEXT: add x9, x16, x9
927-
; CHECK-SVE-NEXT: whilelo p1.s, x18, x8
928-
; CHECK-SVE-NEXT: whilelo p3.s, x9, x8
929-
; CHECK-SVE-NEXT: punpkhi p0.h, p1.b
930-
; CHECK-SVE-NEXT: punpklo p1.h, p1.b
931-
; CHECK-SVE-NEXT: punpkhi p2.h, p3.b
932-
; CHECK-SVE-NEXT: punpklo p3.h, p3.b
933-
; CHECK-SVE-NEXT: mov z1.d, p3/z, #1 // =0x1
934-
; CHECK-SVE-NEXT: fmov x18, d1
935-
; CHECK-SVE-NEXT: tbnz w18, #0, .LBB8_2
916+
; CHECK-SVE-NEXT: punpkhi p0.h, p3.b
917+
; CHECK-SVE-NEXT: punpklo p1.h, p3.b
918+
; CHECK-SVE-NEXT: whilelo p4.s, x9, x8
919+
; CHECK-SVE-NEXT: punpkhi p2.h, p4.b
920+
; CHECK-SVE-NEXT: punpklo p3.h, p4.b
921+
; CHECK-SVE-NEXT: b.mi .LBB8_2
936922
; CHECK-SVE-NEXT: .LBB8_3: // %for.cond.cleanup
937923
; CHECK-SVE-NEXT: ret
938924
;
@@ -968,15 +954,13 @@ define void @f_d_4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
968954
; CHECK-SVE2p1-NEXT: fmul z4.d, z4.d, z0.d
969955
; CHECK-SVE2p1-NEXT: st1d { z1.d }, p2, [x0, x9, lsl #3]
970956
; CHECK-SVE2p1-NEXT: st1d { z2.d }, p3, [x12, x9, lsl #3]
971-
; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, x15, x8
972-
; CHECK-SVE2p1-NEXT: mov z1.d, p2/z, #1 // =0x1
973957
; CHECK-SVE2p1-NEXT: st1d { z3.d }, p0, [x11, x9, lsl #3]
974958
; CHECK-SVE2p1-NEXT: st1d { z4.d }, p1, [x10, x9, lsl #3]
975959
; CHECK-SVE2p1-NEXT: incw x9, all, mul #3
976960
; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x9, x8
961+
; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, x15, x8
977962
; CHECK-SVE2p1-NEXT: mov x9, x15
978-
; CHECK-SVE2p1-NEXT: fmov x17, d1
979-
; CHECK-SVE2p1-NEXT: tbnz w17, #0, .LBB8_2
963+
; CHECK-SVE2p1-NEXT: b.mi .LBB8_2
980964
; CHECK-SVE2p1-NEXT: .LBB8_3: // %for.cond.cleanup
981965
; CHECK-SVE2p1-NEXT: ret
982966
entry:

0 commit comments

Comments
 (0)