Skip to content

Commit ac09292

Browse files
authored
[SelectionDAG] Widen cttz to cttz_zero_undef (#92514)
Instead of widening, e.g., i8 cttz(x) to i16 cttz(x | 0x100), use the more optimizable form i16 cttz_zero_undef(x | 0x100), since the widened operand is guaranteed to be nonzero.
1 parent 1e7d047 commit ac09292

File tree

5 files changed

+24
-36
lines changed

5 files changed

+24
-36
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

+7-5
Original file line numberDiff line numberDiff line change
@@ -5032,35 +5032,37 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
50325032
case ISD::CTTZ_ZERO_UNDEF:
50335033
case ISD::CTLZ:
50345034
case ISD::CTLZ_ZERO_UNDEF:
5035-
case ISD::CTPOP:
5035+
case ISD::CTPOP: {
50365036
// Zero extend the argument unless its cttz, then use any_extend.
50375037
if (Node->getOpcode() == ISD::CTTZ ||
50385038
Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
50395039
Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
50405040
else
50415041
Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
50425042

5043-
if (Node->getOpcode() == ISD::CTTZ) {
5043+
unsigned NewOpc = Node->getOpcode();
5044+
if (NewOpc == ISD::CTTZ) {
50445045
// The count is the same in the promoted type except if the original
50455046
// value was zero. This can be handled by setting the bit just off
50465047
// the top of the original type.
50475048
auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
50485049
OVT.getSizeInBits());
50495050
Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
50505051
DAG.getConstant(TopBit, dl, NVT));
5052+
NewOpc = ISD::CTTZ_ZERO_UNDEF;
50515053
}
50525054
// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
50535055
// already the correct result.
5054-
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
5055-
if (Node->getOpcode() == ISD::CTLZ ||
5056-
Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
5056+
Tmp1 = DAG.getNode(NewOpc, dl, NVT, Tmp1);
5057+
if (NewOpc == ISD::CTLZ || NewOpc == ISD::CTLZ_ZERO_UNDEF) {
50575058
// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
50585059
Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
50595060
DAG.getConstant(NVT.getSizeInBits() -
50605061
OVT.getSizeInBits(), dl, NVT));
50615062
}
50625063
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
50635064
break;
5065+
}
50645066
case ISD::BITREVERSE:
50655067
case ISD::BSWAP: {
50665068
unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

+9-6
Original file line numberDiff line numberDiff line change
@@ -709,23 +709,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
709709
}
710710
}
711711

712-
if (N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::VP_CTTZ) {
712+
unsigned NewOpc = N->getOpcode();
713+
if (NewOpc == ISD::CTTZ || NewOpc == ISD::VP_CTTZ) {
713714
// The count is the same in the promoted type except if the original
714715
// value was zero. This can be handled by setting the bit just off
715716
// the top of the original type.
716717
auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(),
717718
OVT.getScalarSizeInBits());
718-
if (N->getOpcode() == ISD::CTTZ)
719+
if (NewOpc == ISD::CTTZ) {
719720
Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT));
720-
else
721+
NewOpc = ISD::CTTZ_ZERO_UNDEF;
722+
} else {
721723
Op =
722724
DAG.getNode(ISD::VP_OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT),
723725
N->getOperand(1), N->getOperand(2));
726+
NewOpc = ISD::VP_CTTZ_ZERO_UNDEF;
727+
}
724728
}
725729
if (!N->isVPOpcode())
726-
return DAG.getNode(N->getOpcode(), dl, NVT, Op);
727-
return DAG.getNode(N->getOpcode(), dl, NVT, Op, N->getOperand(1),
728-
N->getOperand(2));
730+
return DAG.getNode(NewOpc, dl, NVT, Op);
731+
return DAG.getNode(NewOpc, dl, NVT, Op, N->getOperand(1), N->getOperand(2));
729732
}
730733

731734
SDValue DAGTypeLegalizer::PromoteIntRes_VP_CttzElements(SDNode *N) {

llvm/test/CodeGen/AMDGPU/cttz.ll

-2
Original file line numberDiff line numberDiff line change
@@ -1408,7 +1408,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
14081408
; VI-NEXT: s_waitcnt vmcnt(0)
14091409
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
14101410
; VI-NEXT: v_ffbl_b32_e32 v2, v2
1411-
; VI-NEXT: v_min_u32_e32 v2, 32, v2
14121411
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
14131412
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
14141413
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -1451,7 +1450,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
14511450
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
14521451
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
14531452
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
1454-
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
14551453
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
14561454
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
14571455
; GFX10-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

-1
Original file line numberDiff line numberDiff line change
@@ -1561,7 +1561,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
15611561
; VI-NEXT: v_or_b32_e32 v0, v2, v0
15621562
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
15631563
; VI-NEXT: v_ffbl_b32_e32 v2, v2
1564-
; VI-NEXT: v_min_u32_e32 v2, 32, v2
15651564
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
15661565
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
15671566
; VI-NEXT: v_mov_b32_e32 v0, s0

llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll

+8-22
Original file line numberDiff line numberDiff line change
@@ -4145,29 +4145,15 @@ define <vscale x 1 x i9> @vp_cttz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
41454145
; CHECK-NEXT: li a1, 512
41464146
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
41474147
; CHECK-NEXT: vor.vx v8, v8, a1, v0.t
4148-
; CHECK-NEXT: li a0, 1
4149-
; CHECK-NEXT: vsub.vx v9, v8, a0, v0.t
4150-
; CHECK-NEXT: vnot.v v8, v8, v0.t
4148+
; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t
41514149
; CHECK-NEXT: vand.vv v8, v8, v9, v0.t
4152-
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
4153-
; CHECK-NEXT: lui a0, 5
4154-
; CHECK-NEXT: addi a0, a0, 1365
4155-
; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
4156-
; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
4157-
; CHECK-NEXT: lui a0, 3
4158-
; CHECK-NEXT: addi a0, a0, 819
4159-
; CHECK-NEXT: vand.vx v9, v8, a0, v0.t
4160-
; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t
4161-
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
4162-
; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
4163-
; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t
4164-
; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
4165-
; CHECK-NEXT: lui a0, 1
4166-
; CHECK-NEXT: addi a0, a0, -241
4167-
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
4168-
; CHECK-NEXT: li a0, 257
4169-
; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t
4170-
; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
4150+
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
4151+
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
4152+
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
4153+
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
4154+
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
4155+
; CHECK-NEXT: li a0, 127
4156+
; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
41714157
; CHECK-NEXT: ret
41724158
;
41734159
; CHECK-ZVBB-LABEL: vp_cttz_nxv1i9:

0 commit comments

Comments
 (0)