@@ -3794,85 +3794,84 @@ define half @v_fneg_round_f16(half %a) #0 {
3794
3794
; SI-SAFE: ; %bb.0:
3795
3795
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3796
3796
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
3797
- ; SI-SAFE-NEXT: s_brev_b32 s4, -2
3798
3797
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
3799
- ; SI-SAFE-NEXT: v_trunc_f32_e32 v2, v0
3800
- ; SI-SAFE-NEXT: v_bfi_b32 v1, s4, 1.0, v0
3801
- ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v2
3802
- ; SI-SAFE-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
3803
- ; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3804
- ; SI-SAFE-NEXT: v_add_f32_e32 v0, v2, v0
3798
+ ; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0
3799
+ ; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
3800
+ ; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3801
+ ; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3802
+ ; SI-SAFE-NEXT: s_brev_b32 s4, -2
3803
+ ; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3804
+ ; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
3805
3805
; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
3806
3806
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
3807
3807
;
3808
3808
; SI-NSZ-LABEL: v_fneg_round_f16:
3809
3809
; SI-NSZ: ; %bb.0:
3810
3810
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3811
3811
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
3812
- ; SI-NSZ-NEXT: s_brev_b32 s4, -2
3813
3812
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
3814
- ; SI-NSZ-NEXT: v_trunc_f32_e32 v2, v0
3815
- ; SI-NSZ-NEXT: v_bfi_b32 v1, s4, 1.0, v0
3816
- ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v2
3817
- ; SI-NSZ-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
3818
- ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3819
- ; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v2, v0
3813
+ ; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0
3814
+ ; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
3815
+ ; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3816
+ ; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3817
+ ; SI-NSZ-NEXT: s_brev_b32 s4, -2
3818
+ ; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3819
+ ; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0
3820
3820
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
3821
3821
;
3822
3822
; VI-SAFE-LABEL: v_fneg_round_f16:
3823
3823
; VI-SAFE: ; %bb.0:
3824
3824
; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3825
+ ; VI-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3826
+ ; VI-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3827
+ ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x3c00
3828
+ ; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3829
+ ; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
3825
3830
; VI-SAFE-NEXT: s_movk_i32 s4, 0x7fff
3826
- ; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x3c00
3827
- ; VI-SAFE-NEXT: v_trunc_f16_e32 v2, v0
3828
- ; VI-SAFE-NEXT: v_bfi_b32 v1, s4, v1, v0
3829
- ; VI-SAFE-NEXT: v_sub_f16_e32 v0, v0, v2
3830
- ; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5
3831
- ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3832
- ; VI-SAFE-NEXT: v_add_f16_e32 v0, v2, v0
3831
+ ; VI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3832
+ ; VI-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
3833
3833
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
3834
3834
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
3835
3835
;
3836
3836
; VI-NSZ-LABEL: v_fneg_round_f16:
3837
3837
; VI-NSZ: ; %bb.0:
3838
3838
; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3839
+ ; VI-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3840
+ ; VI-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3841
+ ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x3c00
3842
+ ; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3843
+ ; VI-NSZ-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
3839
3844
; VI-NSZ-NEXT: s_movk_i32 s4, 0x7fff
3840
- ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x3c00
3841
- ; VI-NSZ-NEXT: v_trunc_f16_e32 v2, v0
3842
- ; VI-NSZ-NEXT: v_bfi_b32 v1, s4, v1, v0
3843
- ; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v2
3844
- ; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5
3845
- ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
3846
- ; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v2, v0
3845
+ ; VI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3846
+ ; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
3847
3847
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
3848
3848
;
3849
3849
; GFX11-SAFE-LABEL: v_fneg_round_f16:
3850
3850
; GFX11-SAFE: ; %bb.0:
3851
3851
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3852
3852
; GFX11-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3853
- ; GFX11-SAFE-NEXT: s_movk_i32 s0, 0x3c00
3854
- ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3853
+ ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3855
3854
; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3856
- ; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
3857
- ; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5
3858
- ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3859
- ; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
3855
+ ; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3856
+ ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3857
+ ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3858
+ ; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3859
+ ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3860
3860
; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
3861
- ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
3862
3861
; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
3863
3862
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
3864
3863
;
3865
3864
; GFX11-NSZ-LABEL: v_fneg_round_f16:
3866
3865
; GFX11-NSZ: ; %bb.0:
3867
3866
; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3868
3867
; GFX11-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3869
- ; GFX11-NSZ-NEXT: s_movk_i32 s0, 0x3c00
3870
- ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3868
+ ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3871
3869
; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3872
- ; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
3873
- ; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5
3874
- ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3875
- ; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
3870
+ ; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3871
+ ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3872
+ ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3873
+ ; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3874
+ ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
3876
3875
; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
3877
3876
; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
3878
3877
%round = call half @llvm.round.f16 (half %a )
0 commit comments