Skip to content

Commit a0eb6b8

Browse files
authored
[AMDGPU] Try to fix the block prologs broken by RA inserted instructions (llvm#69924)
The insertion point that RA determines when inserting spills and live-range splits at the beginning of a block is sometimes wrong: the newly inserted vector instructions are placed before the exec-mask restore instruction, which is incorrect. This occurs mainly because the insertion point depends on isBasicBlockPrologue, which doesn't account for instructions inserted early during RA (spills and splits), and so the block prolog gets broken. A better approach for deciding the insertion point should be worked out. For now, improve the helper function to consider all possible early insertions. This patch includes the spill instructions. The copies associated with live-range splits should also be included in the block prolog.
1 parent b0b8864 commit a0eb6b8

16 files changed

+632
-529
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8439,8 +8439,16 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
84398439
}
84408440

84418441
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
8442-
return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
8443-
MI.modifiesRegister(AMDGPU::EXEC, &RI);
8442+
// We need to handle instructions which may be inserted during register
8443+
// allocation to handle the prolog. The initial prolog instruction may have
8444+
// been separated from the start of the block by spills and copies inserted
8445+
// needed by the prolog.
8446+
uint16_t Opc = MI.getOpcode();
8447+
8448+
// FIXME: Copies inserted in the block prolog for live-range split should also
8449+
// be included.
8450+
return (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
8451+
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
84448452
}
84458453

84468454
MachineInstrBuilder

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
675675
return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
676676
}
677677

678+
bool isSpillOpcode(uint16_t Opcode) const {
679+
return get(Opcode).TSFlags &
680+
(SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill);
681+
}
682+
678683
static bool isWWMRegSpillOpcode(uint16_t Opcode) {
679684
return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
680685
Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
144144
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
145145
; CHECK-NEXT: s_mov_b32 exec_lo, s21
146146
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
147-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
148-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
149147
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
150148
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
151149
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -163,6 +161,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
163161
; CHECK-NEXT: v_readlane_b32 s17, v2, 1
164162
; CHECK-NEXT: v_readlane_b32 s18, v2, 2
165163
; CHECK-NEXT: v_readlane_b32 s19, v2, 3
164+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
165+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
166+
; CHECK-NEXT: s_waitcnt vmcnt(0)
166167
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
167168
; CHECK-NEXT: s_waitcnt vmcnt(0)
168169
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,1 -o - %s | FileCheck -check-prefix=REGALLOC %s
33

4-
; FIXME: There are two spill codes inserted wrongly in this test.
5-
; They are inserted during regalloc for the BBLiveIns - the spill restores for vgpr1 in the Flow block (bb.1) and for vgpr0 in the return block (bb.4).
4+
; Test to check if the bb prolog spills are inserted correctly during regalloc.
65
define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
76
; REGALLOC-LABEL: name: prolog_spill
87
; REGALLOC: bb.0.bb.0:
@@ -33,10 +32,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
3332
; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
3433
; REGALLOC-NEXT: {{ $}}
3534
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
36-
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
3735
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr4_sgpr5
3836
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
3937
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec
38+
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
4039
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
4140
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
4241
; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
@@ -66,10 +65,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
6665
; REGALLOC-NEXT: {{ $}}
6766
; REGALLOC-NEXT: bb.4.bb.3:
6867
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
69-
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
7068
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
7169
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3
7270
; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
71+
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
7372
; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 5
7473
; REGALLOC-NEXT: renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
7574
; REGALLOC-NEXT: KILL killed renamable $vgpr1

0 commit comments

Comments
 (0)