[AMDGPU] Add writelane and readlane pseudos for SGPR spilling #69923
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Christudasan Devadasan (cdevadas)

Changes

For a future patch, it is important to keep the lowered SGPR spills recognizable as spill instructions during regalloc. Directly lowering them into V_WRITELANE/V_READLANE would not let us attach the spill flag to those instructions. This patch introduces readlane/writelane pseudo instructions with the SGPRSpill flag set in their Desc; they get lowered to the hardware instructions later, during post-RA pseudo expansion.

Patch is 313.47 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/69923.diff

34 Files Affected:
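For context (not part of the patch): a minimal sketch of what the SGPRSpill flag buys, assuming the existing SIInstrInfo::isSGPRSpill helper, which tests SIInstrFlags::SGPRSpill in an instruction's TSFlags. The helper name isLoweredSGPRSpill below is hypothetical.

#include "SIInstrInfo.h" // LLVM AMDGPU backend context assumed

// Regalloc-time code can recognize lowered SGPR spills by their TSFlags
// instead of matching the raw V_WRITELANE_B32/V_READLANE_B32 opcodes, which
// are also produced when lowering the llvm.amdgcn.writelane/readlane
// intrinsics and so cannot serve as a spill marker on their own.
static bool isLoweredSGPRSpill(const llvm::MachineInstr &MI) {
  return llvm::SIInstrInfo::isSGPRSpill(MI); // reads SIInstrFlags::SGPRSpill
}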
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 5de993ec3cba1d8..1d855d9bd83157d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -274,7 +274,7 @@ class PrologEpilogSGPRSpillBuilder {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_WRITELANE_PSEUDO), Spill[I].VGPR)
.addReg(SubReg)
.addImm(Spill[I].Lane)
.addReg(Spill[I].VGPR, RegState::Undef);
@@ -319,7 +319,7 @@ class PrologEpilogSGPRSpillBuilder {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_READLANE_PSEUDO), SubReg)
.addReg(Spill[I].VGPR)
.addImm(Spill[I].Lane);
}
@@ -1554,12 +1554,10 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
// TODO: Handle this elsewhere at an early point. Walking through all MBBs
// here would be a bad heuristic. A better way should be by calling
// allocateWWMSpill during the regalloc pipeline whenever a physical
- // register is allocated for the intended virtual registers. That will
- // also help excluding the general use of WRITELANE/READLANE intrinsics
- // that won't really need any such special handling.
- if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
+ // register is allocated for the intended virtual registers.
+ if (MI.getOpcode() == AMDGPU::SI_WRITELANE_PSEUDO)
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
- else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
+ else if (MI.getOpcode() == AMDGPU::SI_READLANE_PSEUDO)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
NeedExecCopyReservedReg = true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4ff7b462f0f3295..981dd890fb13a41 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2090,6 +2090,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
break;
+ case AMDGPU::SI_WRITELANE_PSEUDO:
+ MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
+ break;
+
+ case AMDGPU::SI_READLANE_PSEUDO:
+ MI.setDesc(get(AMDGPU::V_READLANE_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -3907,7 +3915,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// However, executing them with EXEC = 0 causes them to operate on undefined
// data, which we avoid by returning true here.
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
- Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
+ Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
+ Opcode == AMDGPU::SI_READLANE_PSEUDO ||
+ Opcode == AMDGPU::SI_WRITELANE_PSEUDO)
return true;
return false;
@@ -4301,7 +4311,9 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
+ case AMDGPU::SI_READLANE_PSEUDO:
case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::SI_WRITELANE_PSEUDO:
return false;
}
@@ -8970,7 +8982,9 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
+ if (opcode == AMDGPU::V_READLANE_B32 ||
+ opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+ opcode == AMDGPU::SI_READLANE_PSEUDO)
return InstructionUniformity::AlwaysUniform;
if (isCopyInstr(MI)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c1808c..6929dc4f4ab91cc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -875,6 +875,25 @@ defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
+let SGPRSpill = 1 in {
+def SI_WRITELANE_PSEUDO : PseudoInstSI <(outs VGPR_32:$vdst),
+ (ins SReg_32:$src0, i32imm:$src1, VGPR_32:$vdst_in)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let VALU = 1;
+ let Constraints = "$vdst = $vdst_in";
+}
+
+def SI_READLANE_PSEUDO : PseudoInstSI <(outs SReg_32:$sdst),
+ (ins VGPR_32:$src0, i32imm:$src1)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let VALU = 1;
+}
+} // End SGPRSpill = 1
+
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d0a81673d6528c2..ed81fac6886adca 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1769,7 +1769,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
- SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+ SB.TII.get(AMDGPU::SI_WRITELANE_PSEUDO), Spill.VGPR)
.addReg(SubReg, getKillRegState(UseKill))
.addImm(Spill.Lane)
.addReg(Spill.VGPR);
@@ -1815,7 +1815,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::SI_WRITELANE_PSEUDO),
SB.TmpVGPR)
.addReg(SubReg, SubKillState)
.addImm(i % PVD.PerVGPR)
@@ -1877,8 +1877,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SpilledReg Spill = VGPRSpills[i];
- auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
- SubReg)
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::SI_READLANE_PSEUDO), SubReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
if (SB.NumSubRegs > 1 && i == 0)
@@ -1911,7 +1911,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
bool LastSubReg = (i + 1 == e);
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
- SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ SB.TII.get(AMDGPU::SI_READLANE_PSEUDO), SubReg)
.addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
.addImm(i);
if (SB.NumSubRegs > 1 && i == 0)
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index 7209d160e6c8a7a..3e56e49bf31d5cb 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -60,8 +60,8 @@ body: |
; GCN-NEXT: renamable $vgpr46 = COPY $vgpr1, implicit $exec
; GCN-NEXT: renamable $vgpr45 = COPY $vgpr0, implicit $exec
; GCN-NEXT: renamable $sgpr16_sgpr17 = IMPLICIT_DEF
- ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
- ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
+ ; GCN-NEXT: $vgpr40 = SI_WRITELANE_PSEUDO $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; GCN-NEXT: $vgpr40 = SI_WRITELANE_PSEUDO $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5)
@@ -124,8 +124,8 @@ body: |
ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
renamable $sgpr16_sgpr17 = IMPLICIT_DEF
- $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
- $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit killed $sgpr30_sgpr31
+ $vgpr40 = SI_WRITELANE_PSEUDO $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ $vgpr40 = SI_WRITELANE_PSEUDO killed $sgpr31, 1, $vgpr40, implicit killed $sgpr30_sgpr31
dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0
%8:vreg_64 = nofpexcept V_FMA_F64_e64 0, %7, 0, %6, 0, %5, 0, 0, implicit $mode, implicit $exec
ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
index aed642d1f0670e1..5226f7cfad80770 100644
--- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
@@ -19,10 +19,10 @@ body: |
; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, $vgpr0
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, $vgpr0
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, $vgpr0
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = SI_WRITELANE_PSEUDO $sgpr42, 0, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = SI_WRITELANE_PSEUDO $sgpr43, 1, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = SI_WRITELANE_PSEUDO $sgpr46, 2, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = SI_WRITELANE_PSEUDO $sgpr47, 3, $vgpr0
; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
index f802e1b1e18af5d..63e2f085bb1972c 100644
--- a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
@@ -26,9 +26,9 @@ body: |
; GCN: liveins: $sgpr4, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
- ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
+ ; GCN-NEXT: renamable $vgpr0 = SI_WRITELANE_PSEUDO $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
+ ; GCN-NEXT: $sgpr4 = SI_READLANE_PSEUDO $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: KILL killed renamable $vgpr0
@@ -77,9 +77,9 @@ body: |
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr6, 0, killed $vgpr0
+ ; GCN-NEXT: renamable $vgpr0 = SI_WRITELANE_PSEUDO $sgpr6, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 0
+ ; GCN-NEXT: $sgpr6 = SI_READLANE_PSEUDO $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
@@ -143,9 +143,9 @@ body: |
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
+ ; GCN-NEXT: renamable $vgpr0 = SI_WRITELANE_PSEUDO $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
+ ; GCN-NEXT: $sgpr4 = SI_READLANE_PSEUDO $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
@@ -245,9 +245,9 @@ body: |
; GCN-NEXT: bb.1:
; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
+ ; GCN-NEXT: renamable $vgpr0 = SI_WRITELANE_PSEUDO $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
+ ; GCN-NEXT: $sgpr4 = SI_READLANE_PSEUDO $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: KILL killed renamable $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
index 33e766ad3bf9e18..217b94b5351242c 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
@@ -36,7 +36,7 @@ body: |
; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
- ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2
+ ; CHECK-NEXT: $vgpr2 = SI_WRITELANE_PSEUDO $sgpr4, 0, undef $vgpr2
; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
@@ -50,7 +50,7 @@ body: |
; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
- ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0
+ ; CHECK-NEXT: $sgpr4 = SI_READLANE_PSEUDO $vgpr2, 0
; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
index 8cffede47704524..ec67719e86a7827 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
@@ -32,7 +32,7 @@ body: |
; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
- ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2
+ ; MUBUF-NEXT: $vgpr2 = SI_WRITELANE_PSEUDO $sgpr4, 0, undef $vgpr2
; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
@@ -40,7 +40,7 @@ body: |
; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec
; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
- ; MUBUF-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0
+ ; MUBUF-NEXT: $sgpr4 = SI_READLANE_PSEUDO $vgpr2, 0
; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
@@ -48,6 +48,7 @@ body: |
; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
; MUBUF-NEXT: $sgpr33 = COPY $sgpr4
; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc
+ ;
; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs
; FLATSCR: liveins: $vgpr1, $vgpr2
; FLATSCR-NEXT: {{ $}}
@@ -58,7 +59,7 @@ body: |
; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc
; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into...
[truncated]
What about naming them as SI_SPILL_S32_TO_VGPR and SI_RELOAD_S32_FROM_VGPR to better reflect what they are used for?
Done.
I think I need to see the context for why this is necessary. The comment spill flags are already produced? |
I think the change is necessary even without considering the context here. Basically we need to use a different opcode so that we can still differentiate the writelane for SGPR spill from the ones lowered from llvm.amdgcn.write.lane. See the test changes of the test: llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll |
Yes, I believe having a separate spill pseudo for SGPR spill to VGPRs would be appropriate. I don't think it is wise to depend on the comment flags for significant CodeGen decisions. |
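(For illustration only: a minimal sketch of the opcode-based differentiation described above, using the SI_SPILL_S32_TO_VGPR/SI_RELOAD_S32_FROM_VGPR names adopted in this review; the helper name is hypothetical.)

#include "SIInstrInfo.h" // LLVM AMDGPU backend context assumed

// Matches only the spill pseudos; a V_WRITELANE_B32/V_READLANE_B32 emitted
// for the llvm.amdgcn.writelane/readlane intrinsics keeps its original
// opcode and is deliberately not matched here.
static bool isSGPRSpillLaneOp(unsigned Opc) {
  return Opc == llvm::AMDGPU::SI_SPILL_S32_TO_VGPR ||
         Opc == llvm::AMDGPU::SI_RELOAD_S32_FROM_VGPR;
}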
LGTM, please wait some time before submitting in case others have concerns over this.
Change looks mostly fine to me, splitting these out to dedicated pseudos certainly makes sense; however, I have some doubts (in line) about whether we want to treat the pseudos exactly like v_readlane/v_writelane.
Also curious about the contents of the follow-up patch that depends on this?
Having spent a bit of time investigating bugs in SGPR-heavy shaders in the last few weeks, I have a few thoughts about optimizing away some of these v_readlane/v_writelane operations, but perhaps you are already working on that?
@@ -875,6 +875,25 @@ defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

let SGPRSpill = 1 in {
Do we need to be marking these as isConvergent to match existing behaviour?
I think the answer is probably no as they have much narrower scope than v_readlane/v_writelane, but are still cross-lane operations.
These should have identical properties to the underlying instructions
@@ -4301,7 +4311,9 @@ static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::SI_RELOAD_S32_FROM_VGPR:
Not sure we want this?
To avoid a MIR verifier error. Even though readlane/writelane are VALU operations, we don't add an implicit EXEC operand to them.
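(Condensed sketch of the resulting shouldReadExec shape, with names as in the hunk quoted above; the cases shown are from the patch, and the non-VALU handling is elided.)

#include "SIInstrInfo.h" // LLVM AMDGPU backend context assumed

// Lane moves (and now the spill pseudos) are built without an implicit
// $exec use, so they must opt out of the verifier's expectation that every
// VALU instruction reads EXEC.
static bool shouldReadExec(const llvm::MachineInstr &MI) {
  if (llvm::SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case llvm::AMDGPU::V_READLANE_B32:
    case llvm::AMDGPU::SI_RELOAD_S32_FROM_VGPR:
    case llvm::AMDGPU::V_WRITELANE_B32:
    case llvm::AMDGPU::SI_SPILL_S32_TO_VGPR:
      return false;
    }
    return true; // other VALU ops implicitly read EXEC
  }
  return true; // non-VALU cases elided in this sketch
}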
@@ -3907,7 +3915,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
  // However, executing them with EXEC = 0 causes them to operate on undefined
I notice this comment is only really accurate for V_READFIRSTLANE, which depends on EXEC.
readlane and writelane are well defined with EXEC=0; we just have other reasons for not running them.
If we are sure we want to include spill/reload here then they should probably be documented with a separate comment, because really we are avoiding these with EXEC=0 because it means a code branch should be run for any scalar operation.
You are right. The readlane & writelane instructions shouldn't be here.
Posted #70206.
  let Constraints = "$vdst = $vdst_in";
}

def SI_RELOAD_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
Nit: I think SI_RESTORE_S32_FROM_VGPR would be an even better name?
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
  let VALU = 1;
Might as well add 'IsNeverUniform = 1' same as V_WRITELANE for completeness. Though I doubt it will be of any use.
Force-pushed from ff3c457 to 9f460bf: Rebase after #70206 + suggestions incorporated.
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}
Should also verify these are computed with the correct code size
Done.
  (ins VGPR_32:$src0, i32imm:$src1)> {
  let Size = 4;
  let FixedSize = 1;
  let IsNeverUniform = 1;
This IsNeverUniform should be applied to the SPILL opcode (like WRITELANE), not the RESTORE opcode (like READLANE).
Ah. Thanks for spotting it. I was supposed to add it to the SI_SPILL_S32_TO_VGPR instruction. Will update it.
For a future patch, it is important to keep the lowered SGPR spills recognizable as spill instructions during regalloc. Directly lowering them into V_WRITELANE/V_READLANE would not let us attach the spill flag to those instructions. This patch introduces the pseudo instructions with the SGPRSpill flag set in their Desc; they get lowered to the equivalent instructions later, during post-RA pseudo expansion.
Force-pushed from ae4fdae to 0a1e394: Rebase + correctly placed IsNeverUniform flag.