diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 55d95154c7587..41d3ab1140a5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3116,9 +3116,13 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
     // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
     if (!ZeroUndef) {
-      const SDValue ConstVal = DAG.getConstant(
-          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
-      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
+      // umin can be omitted if the operand is known to be non-zero.
+      auto KB = DAG.computeKnownBits(Src);
+      if (!KB.isNonZero()) {
+        const SDValue ConstVal = DAG.getConstant(
+            Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+        NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
+      }
     }
     return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e9459..0d0724584e500 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2090,7 +2090,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeMul(Helper, MI);
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
-    return legalizeCTLZ_CTTZ(MI, MRI, B);
+    return legalizeCTLZ_CTTZ(Helper, MI, MRI, B);
   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
     return legalizeFPTruncRound(MI, B);
   case TargetOpcode::G_STACKSAVE:
@@ -4072,7 +4072,8 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
 // case with a single min instruction instead of a compare+select.
-bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
+bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(LegalizerHelper &Helper,
+                                            MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
   Register Dst = MI.getOperand(0).getReg();
@@ -4084,8 +4085,14 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                         ? AMDGPU::G_AMDGPU_FFBH_U32
                         : AMDGPU::G_AMDGPU_FFBL_B32;
   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
-  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
-
+  // The umin only fixes up the zero-input case. If Src is known non-zero,
+  // forward the ffbh/ffbl result directly so that Dst is still defined once
+  // MI is erased. KB is null when the helper has no known-bits analysis.
+  auto *KB = Helper.getKnownBits();
+  if (KB && KB->getKnownBits(Src).isNonZero())
+    B.buildCopy(Dst, Tmp);
+  else
+    B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 56aabd4f6ab71..44ef9024be3d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -106,8 +106,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
                      bool UsePartialMad64_32,
                      bool SeparateOddAlignedProducts) const;
   bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
-  bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
-                         MachineIRBuilder &B) const;
+  bool legalizeCTLZ_CTTZ(LegalizerHelper &Helper, MachineInstr &MI,
+                         MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
 
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 118d6c123046b..ee2894a66fbfc 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1408,7 +1408,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -1451,7 +1450,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 71f1cd54d705c..392a44318b0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1561,7 +1561,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0