Skip to content

Commit 5fa38fb

Browse files
committed
[AMDGPU] Extend virtual register use to redefined strict WQM registers
Remove the single-value constraint from virtual register merging for pre-allocated strict WQM registers. Reassignment (multiple values) will occur with DPP, and this is a common pattern for LDS parameter loads. Explicitly check that the value sequence matches this behaviour.

Change-Id: I4c37c0609fa25e43bc827b0031185056fea5c248
1 parent 3a57c86 commit 5fa38fb

File tree

2 files changed

+45
-28
lines changed

2 files changed

+45
-28
lines changed

llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,28 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
151151
for (Register Reg : Assignments[PhysReg]) {
152152
LiveInterval &LI = LIS->getInterval(Reg);
153153

154-
// Must be single value range with no subranges
155-
CanMerge = !LI.hasSubRanges() && LI.containsOneValue();
154+
// Must have no subranges
155+
CanMerge = !LI.hasSubRanges();
156156
if (!CanMerge)
157157
break;
158158

159+
// Out of an abundance of caution check that there are no PHI values,
160+
// and all values beyond the initial definition are tied operands.
161+
if (!LI.containsOneValue()) {
162+
for (unsigned Idx = 0; CanMerge && Idx < LI.getNumValNums(); ++Idx) {
163+
auto *VN = LI.getValNumInfo(Idx);
164+
MachineInstr *DefMI = LIS->getInstructionFromIndex(VN->def);
165+
CanMerge = !VN->isPHIDef() && DefMI;
166+
if (!CanMerge || Idx == 0)
167+
continue;
168+
MachineOperand &DefOp = DefMI->getOperand(0);
169+
CanMerge = DefOp.isReg() && DefOp.getReg() == Reg && DefOp.isTied() &&
170+
DefMI->isRegTiedToUseOperand(0);
171+
}
172+
if (!CanMerge)
173+
break;
174+
}
175+
159176
// Must be contained in a single basic block
160177
SlotIndex DefIdx = LI.beginIndex();
161178
MachineInstr *DefMI = LIS->getInstructionFromIndex(DefIdx);

llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,38 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
88
; GCN-NEXT: s_mov_b32 s1, exec_lo
99
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
1010
; GCN-NEXT: s_mov_b32 m0, s0
11-
; GCN-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
12-
; GCN-NEXT: lds_param_load v4, attr1.x wait_vdst:15
13-
; GCN-NEXT: lds_param_load v5, attr1.y wait_vdst:15
14-
; GCN-NEXT: lds_param_load v2, attr1.z wait_vdst:15
15-
; GCN-NEXT: lds_param_load v7, attr1.w wait_vdst:15
16-
; GCN-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3
17-
; GCN-NEXT: v_interp_p10_f32 v6, v5, v0, v5 wait_exp:2
18-
; GCN-NEXT: v_interp_p10_f32 v9, v2, v0, v2 wait_exp:1
19-
; GCN-NEXT: v_interp_p10_f32 v8, v7, v0, v7 wait_exp:0
20-
; GCN-NEXT: v_interp_p10_f32 v0, v4, v0, v4 wait_exp:7
11+
; GCN-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
12+
; GCN-NEXT: lds_param_load v2, attr1.x wait_vdst:15
13+
; GCN-NEXT: lds_param_load v3, attr1.y wait_vdst:15
14+
; GCN-NEXT: lds_param_load v4, attr1.z wait_vdst:15
15+
; GCN-NEXT: lds_param_load v5, attr1.w wait_vdst:15
16+
; GCN-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
17+
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
18+
; GCN-NEXT: v_interp_p10_f32 v9, v4, v0, v4 wait_exp:1
19+
; GCN-NEXT: v_interp_p10_f32 v8, v5, v0, v5 wait_exp:0
20+
; GCN-NEXT: v_interp_p10_f32 v0, v2, v0, v2 wait_exp:7
2121
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22-
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v6 wait_exp:7
23-
; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v9 wait_exp:7
22+
; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
23+
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v9 wait_exp:7
2424
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
25-
; GCN-NEXT: v_interp_p2_f32 v7, v7, v1, v8 wait_exp:7
26-
; GCN-NEXT: v_interp_p2_f32 v1, v4, v1, v0 wait_exp:7
27-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
25+
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v8 wait_exp:7
26+
; GCN-NEXT: v_interp_p2_f32 v1, v2, v1, v0 wait_exp:7
27+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
28+
; GCN-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,3,2,5,4,7,6]
2829
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
29-
; GCN-NEXT: v_and_b32_e32 v3, 1, v3
30-
; GCN-NEXT: v_mov_b32_dpp v7, v7 dpp8:[1,0,3,2,5,4,7,6]
31-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
32-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
33-
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc_lo
34-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
35-
; GCN-NEXT: v_dual_cndmask_b32 v4, v2, v5 :: v_dual_cndmask_b32 v5, v1, v7
36-
; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo
37-
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
30+
; GCN-NEXT: v_and_b32_e32 v6, 1, v6
31+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
32+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
33+
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
34+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
35+
; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v1, v5
36+
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
37+
; GCN-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,3,2,5,4,7,6]
3838
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
39-
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
39+
; GCN-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,3,2,5,4,7,6]
4040
; GCN-NEXT: s_mov_b32 exec_lo, s1
4141
; GCN-NEXT: exp dual_src_blend0 v0, v1, off, off
42-
; GCN-NEXT: exp dual_src_blend1 v4, v5, off, off done
42+
; GCN-NEXT: exp dual_src_blend1 v2, v3, off, off done
4343
; GCN-NEXT: s_endpgm
4444
.entry:
4545
%InterpCenter.i0 = extractelement <2 x float> %InterpCenter, i64 0

0 commit comments

Comments
 (0)