Skip to content

Commit 1328a85

Browse files
authored
AMDGPU: Fix handling of -0 in round lowering (#65761)
1 parent b9a6b28 commit 1328a85

File tree

8 files changed

+1387
-1252
lines changed

8 files changed

+1387
-1252
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6586,23 +6586,25 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
65866586
// round(x) =>
65876587
// t = trunc(x);
65886588
// d = fabs(x - t);
6589-
// o = copysign(1.0f, x);
6590-
// return t + (d >= 0.5 ? o : 0.0);
6589+
// o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
6590+
// return t + o;
65916591

65926592
auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
65936593

65946594
auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
65956595
auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6596-
auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6597-
auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6596+
65986597
auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6599-
auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6598+
auto Cmp =
6599+
MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);
66006600

6601-
auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6602-
Flags);
6603-
auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6601+
// Could emit G_UITOFP instead
6602+
auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6603+
auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6604+
auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
6605+
auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);
66046606

6605-
MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6607+
MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);
66066608

66076609
MI.eraseFromParent();
66086610
return Legalized;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2429,18 +2429,16 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
24292429

24302430
const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
24312431
const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2432-
const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2433-
2434-
SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
24352432

24362433
EVT SetCCVT =
24372434
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
24382435

2436+
const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
24392437
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2438+
SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
24402439

2441-
SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2442-
2443-
return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2440+
SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2441+
return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
24442442
}
24452443

24462444
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir

Lines changed: 477 additions & 381 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 40 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -3794,85 +3794,84 @@ define half @v_fneg_round_f16(half %a) #0 {
37943794
; SI-SAFE: ; %bb.0:
37953795
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37963796
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
3797-
; SI-SAFE-NEXT: s_brev_b32 s4, -2
37983797
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
3799-
; SI-SAFE-NEXT: v_trunc_f32_e32 v2, v0
3800-
; SI-SAFE-NEXT: v_bfi_b32 v1, s4, 1.0, v0
3801-
; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v2
3802-
; SI-SAFE-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
3803-
; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3804-
; SI-SAFE-NEXT: v_add_f32_e32 v0, v2, v0
3798+
; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0
3799+
; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
3800+
; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3801+
; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3802+
; SI-SAFE-NEXT: s_brev_b32 s4, -2
3803+
; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3804+
; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
38053805
; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
38063806
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
38073807
;
38083808
; SI-NSZ-LABEL: v_fneg_round_f16:
38093809
; SI-NSZ: ; %bb.0:
38103810
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38113811
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
3812-
; SI-NSZ-NEXT: s_brev_b32 s4, -2
38133812
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
3814-
; SI-NSZ-NEXT: v_trunc_f32_e32 v2, v0
3815-
; SI-NSZ-NEXT: v_bfi_b32 v1, s4, 1.0, v0
3816-
; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v2
3817-
; SI-NSZ-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
3818-
; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3819-
; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v2, v0
3813+
; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0
3814+
; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
3815+
; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3816+
; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3817+
; SI-NSZ-NEXT: s_brev_b32 s4, -2
3818+
; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3819+
; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0
38203820
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
38213821
;
38223822
; VI-SAFE-LABEL: v_fneg_round_f16:
38233823
; VI-SAFE: ; %bb.0:
38243824
; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3825+
; VI-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3826+
; VI-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3827+
; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x3c00
3828+
; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3829+
; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
38253830
; VI-SAFE-NEXT: s_movk_i32 s4, 0x7fff
3826-
; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x3c00
3827-
; VI-SAFE-NEXT: v_trunc_f16_e32 v2, v0
3828-
; VI-SAFE-NEXT: v_bfi_b32 v1, s4, v1, v0
3829-
; VI-SAFE-NEXT: v_sub_f16_e32 v0, v0, v2
3830-
; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5
3831-
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3832-
; VI-SAFE-NEXT: v_add_f16_e32 v0, v2, v0
3831+
; VI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3832+
; VI-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
38333833
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
38343834
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
38353835
;
38363836
; VI-NSZ-LABEL: v_fneg_round_f16:
38373837
; VI-NSZ: ; %bb.0:
38383838
; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3839+
; VI-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3840+
; VI-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3841+
; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x3c00
3842+
; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3843+
; VI-NSZ-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
38393844
; VI-NSZ-NEXT: s_movk_i32 s4, 0x7fff
3840-
; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x3c00
3841-
; VI-NSZ-NEXT: v_trunc_f16_e32 v2, v0
3842-
; VI-NSZ-NEXT: v_bfi_b32 v1, s4, v1, v0
3843-
; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v2
3844-
; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5
3845-
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3846-
; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v2, v0
3845+
; VI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3846+
; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
38473847
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
38483848
;
38493849
; GFX11-SAFE-LABEL: v_fneg_round_f16:
38503850
; GFX11-SAFE: ; %bb.0:
38513851
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38523852
; GFX11-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3853-
; GFX11-SAFE-NEXT: s_movk_i32 s0, 0x3c00
3854-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3853+
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
38553854
; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3856-
; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
3857-
; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5
3858-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3859-
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
3855+
; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3856+
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3857+
; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3858+
; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3859+
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
38603860
; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
3861-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
38623861
; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
38633862
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
38643863
;
38653864
; GFX11-NSZ-LABEL: v_fneg_round_f16:
38663865
; GFX11-NSZ: ; %bb.0:
38673866
; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38683867
; GFX11-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3869-
; GFX11-NSZ-NEXT: s_movk_i32 s0, 0x3c00
3870-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3868+
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
38713869
; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3872-
; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
3873-
; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5
3874-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3875-
; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
3870+
; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3871+
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3872+
; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3873+
; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3874+
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
38763875
; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
38773876
; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
38783877
%round = call half @llvm.round.f16(half %a)

llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -2207,26 +2207,26 @@ define float @v_fneg_round_f32(float %a) #0 {
22072207
; GCN-SAFE-LABEL: v_fneg_round_f32:
22082208
; GCN-SAFE: ; %bb.0:
22092209
; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2210+
; GCN-SAFE-NEXT: v_trunc_f32_e32 v1, v0
2211+
; GCN-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
2212+
; GCN-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
2213+
; GCN-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
22102214
; GCN-SAFE-NEXT: s_brev_b32 s4, -2
2211-
; GCN-SAFE-NEXT: v_trunc_f32_e32 v2, v0
2212-
; GCN-SAFE-NEXT: v_bfi_b32 v1, s4, 1.0, v0
2213-
; GCN-SAFE-NEXT: v_sub_f32_e32 v0, v0, v2
2214-
; GCN-SAFE-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
2215-
; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2216-
; GCN-SAFE-NEXT: v_add_f32_e32 v0, v2, v0
2215+
; GCN-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
2216+
; GCN-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
22172217
; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
22182218
; GCN-SAFE-NEXT: s_setpc_b64 s[30:31]
22192219
;
22202220
; GCN-NSZ-LABEL: v_fneg_round_f32:
22212221
; GCN-NSZ: ; %bb.0:
22222222
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2223+
; GCN-NSZ-NEXT: v_trunc_f32_e32 v1, v0
2224+
; GCN-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
2225+
; GCN-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
2226+
; GCN-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
22232227
; GCN-NSZ-NEXT: s_brev_b32 s4, -2
2224-
; GCN-NSZ-NEXT: v_trunc_f32_e32 v2, v0
2225-
; GCN-NSZ-NEXT: v_bfi_b32 v1, s4, 1.0, v0
2226-
; GCN-NSZ-NEXT: v_sub_f32_e32 v0, v0, v2
2227-
; GCN-NSZ-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
2228-
; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2229-
; GCN-NSZ-NEXT: v_sub_f32_e64 v0, -v2, v0
2228+
; GCN-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
2229+
; GCN-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0
22302230
; GCN-NSZ-NEXT: s_setpc_b64 s[30:31]
22312231
%round = call float @llvm.round.f32(float %a)
22322232
%fneg = fneg float %round

llvm/test/CodeGen/AMDGPU/known-never-snan.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -455,13 +455,13 @@ define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 {
455455
; GCN-LABEL: v_test_known_not_snan_round_input_fmed3_r_i_i_f32:
456456
; GCN: ; %bb.0:
457457
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458+
; GCN-NEXT: v_trunc_f32_e32 v1, v0
459+
; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
460+
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
461+
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
458462
; GCN-NEXT: s_brev_b32 s4, -2
459-
; GCN-NEXT: v_trunc_f32_e32 v2, v0
460-
; GCN-NEXT: v_bfi_b32 v1, s4, 1.0, v0
461-
; GCN-NEXT: v_sub_f32_e32 v0, v0, v2
462-
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
463-
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
464-
; GCN-NEXT: v_add_f32_e32 v0, v2, v0
463+
; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
464+
; GCN-NEXT: v_add_f32_e32 v0, v1, v0
465465
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
466466
; GCN-NEXT: s_setpc_b64 s[30:31]
467467
%known.not.snan = call float @llvm.round.f32(float %a)

0 commit comments

Comments
 (0)