
Commit fb2c659

PeddleSpam (Leon Clark) and Leon Clark authored
[AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 (#88512)
Use LSH to lower ctlz_zero_undef instead of subtracting leading zeros for i8 and i16. Related to #77615.

Co-authored-by: Leon Clark <[email protected]>
1 parent 8786429 commit fb2c659
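In effect, the new lowering replaces "zero-extend to 32 bits, count, then subtract the 32 - N extra leading zeros" with "shift the N meaningful bits to the top of a 32-bit register and count directly", which drops the subtract. A minimal stand-alone sketch of the two recipes for the i16 case, assuming a host compiler that provides __builtin_clz; the helper names are made up for illustration and are not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Old recipe: zero-extend to 32 bits, count, then subtract the 16
    // leading zeros contributed by the extension.
    static unsigned Ctlz16_Sub(uint16_t X) {
      return __builtin_clz(static_cast<uint32_t>(X)) - 16u;
    }

    // New recipe: move the 16 meaningful bits to the top of the register
    // and count directly; no subtract is needed.
    static unsigned Ctlz16_Shift(uint16_t X) {
      return __builtin_clz(static_cast<uint32_t>(X) << 16u);
    }

    int main() {
      // ctlz_zero_undef leaves X == 0 undefined, so only nonzero inputs matter.
      for (uint32_t X = 1u; X <= 0xFFFFu; ++X)
        assert(Ctlz16_Sub(static_cast<uint16_t>(X)) ==
               Ctlz16_Shift(static_cast<uint16_t>(X)));
      return 0;
    }

For i8 the shift amount is 24 instead of 16; the structure is the same.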

5 files changed: +169 -178 lines


llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

+16 -6
@@ -3117,20 +3117,30 @@ static bool isCttzOpc(unsigned Opc) {
 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                                SelectionDAG &DAG) const {
   auto SL = SDLoc(Op);
+  auto Opc = Op.getOpcode();
   auto Arg = Op.getOperand(0u);
   auto ResultVT = Op.getValueType();
 
   if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
     return {};
 
-  assert(isCtlzOpc(Op.getOpcode()));
+  assert(isCtlzOpc(Opc));
   assert(ResultVT == Arg.getValueType());
 
-  auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
-  auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
-  auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
-  NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
-  NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
+  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
+  SDValue NewOp;
+
+  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+  } else {
+    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
+  }
+
   return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
 }
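A detail worth noting in the ISD::CTLZ_ZERO_UNDEF branch above: the operand only needs ISD::ANY_EXTEND rather than ISD::ZERO_EXTEND, because the left shift by 32 - N pushes whatever the extension left in the high bits out of the register before the count is taken. A small stand-alone sketch of that point in plain C++, not LLVM API; the function name is illustrative:

    #include <cstdint>

    // A 16-bit payload sits in the low half of a 32-bit register whose upper
    // half may hold arbitrary bits (the any-extend case). Shifting left by 16
    // discards those bits, so the leading-zero count only sees the payload.
    static unsigned Ctlz16_AnyExtended(uint32_t RegWithHighGarbage) {
      uint32_t Shifted = RegWithHighGarbage << 16u; // high garbage is gone
      return __builtin_clz(Shifted); // defined as long as the payload is nonzero
    }

The subtract-based form in the else branch still needs ISD::ZERO_EXTEND, since it counts zeros above the payload and garbage there would change the result.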

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

+37 -7
@@ -1270,13 +1270,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .custom();
 
   // The 64-bit versions produce 32-bit results, but only on the SALU.
-  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
-    .legalFor({{S32, S32}, {S32, S64}})
-    .clampScalar(0, S32, S32)
-    .clampScalar(1, S32, S64)
-    .scalarize(0)
-    .widenScalarToNextPow2(0, 32)
-    .widenScalarToNextPow2(1, 32);
+  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+    .legalFor({{S32, S32}, {S32, S64}})
+    .customIf(scalarNarrowerThan(1, 32))
+    .clampScalar(0, S32, S32)
+    .clampScalar(1, S32, S64)
+    .scalarize(0)
+    .widenScalarToNextPow2(0, 32)
+    .widenScalarToNextPow2(1, 32);
+
+  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+    .legalFor({{S32, S32}, {S32, S64}})
+    .clampScalar(0, S32, S32)
+    .clampScalar(1, S32, S64)
+    .scalarize(0)
+    .widenScalarToNextPow2(0, 32)
+    .widenScalarToNextPow2(1, 32);
 
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
@@ -2128,6 +2137,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
     return legalizeCTLZ_CTTZ(MI, MRI, B);
+  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
     return legalizeFPTruncRound(MI, B);
   case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4156,25 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
+                                                  MachineRegisterInfo &MRI,
+                                                  MachineIRBuilder &B) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  LLT SrcTy = MRI.getType(Src);
+  TypeSize NumBits = SrcTy.getSizeInBits();
+
+  assert(NumBits < 32u);
+
+  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
+  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
+  auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
+  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
+  B.buildTrunc(Dst, Ctlz);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Check that this is a G_XOR x, -1
 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::G_XOR)

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

+2
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
   bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const;
+  bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               MachineIRBuilder &B) const;
 
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       const ArgDescriptor *Arg,

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir

+20 -27
@@ -81,14 +81,12 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s16) = G_TRUNC %0
     %2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
-    ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -179,14 +174,12 @@ body: |
    ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s7) = G_TRUNC %0
     %2:_(s7) = G_CTLZ_ZERO_UNDEF %1
