[AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 #131859
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

Use proper subreg access when lowering SGPR patterns to VALU.

Full diff: https://github.com/llvm/llvm-project/pull/131859.diff

7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 4342e7a369c13..951b84ca01405 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -257,7 +257,13 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
return false;
}
// Change VGPR to SGPR destination.
- MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+ const auto *RC = MRI.getRegClass(DstReg);
+ const auto *TRC = TRI->getEquivalentSGPRClass(RC);
+ // 16-bit SGPRs are not legal operands in True16 instructions. Convert them to
+ // 32-bit SGPRs
+ if (RC == &AMDGPU::VGPR_16RegClass)
+ TRC = &AMDGPU::SGPR_32RegClass;
+ MRI.setRegClass(DstReg, TRC);
return true;
}
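Read in isolation, the SIFixSGPRCopies.cpp hunk above amounts to the following class selection. This is a minimal sketch, not part of the patch: the helper name is hypothetical, and it assumes the declarations already available inside SIFixSGPRCopies.cpp.

// Hypothetical helper: choose the SGPR class when converting a VGPR
// destination to SGPR. A 16-bit VGPR has no legal 16-bit SGPR counterpart
// as a True16 instruction operand, so it is widened to SGPR_32.
static const TargetRegisterClass *
getSGPRClassForVGPRDst(const MachineRegisterInfo &MRI,
                       const SIRegisterInfo *TRI, Register DstReg) {
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  if (RC == &AMDGPU::VGPR_16RegClass)
    return &AMDGPU::SGPR_32RegClass;      // widen 16-bit VGPR to 32-bit SGPR
  return TRI->getEquivalentSGPRClass(RC); // same-width SGPR class otherwise
}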
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fb791c8342282..5b085e10a5ba5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6297,6 +6297,12 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
continue;
}
+ // True16 Operands cannot contain VGPR_32 (typically occurs during
+ // SIFixSGPRCopies). True16 instructions are always selected as VOP3
+ if (ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
+ MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass)
+ legalizeOpWithMove(MI, Idx);
+
if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
continue; // VGPRs are legal
@@ -8632,7 +8638,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
break;
}
- if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
+ if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)) ||
+ AMDGPU::isTrue16Inst(UseMI.getOpcode())) {
Worklist.insert(&UseMI);
do {
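Both SIInstrInfo.cpp hunks key off the same question: is this a True16 instruction that still carries a 32-bit VGPR operand left behind by SIFixSGPRCopies? Below is a minimal sketch of that predicate, reusing only calls that appear in the diff; the helper name is hypothetical and it assumes the SIInstrInfo.cpp context.

// Hypothetical helper: a True16 instruction operand that still lives in
// VGPR_32 must be copied into a legal register via legalizeOpWithMove().
static bool needsTrue16OperandFixup(const GCNSubtarget &ST,
                                    const MachineRegisterInfo &MRI,
                                    unsigned Opc, const MachineOperand &MO) {
  return ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
         MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass;
}

The addUsersToMoveToVALUWorklist hunk applies the same isTrue16Inst() check so that True16 users are queued for legalization even when they already use vector registers.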
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 5ea39997938ad..b0991d3eb6034 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -698,9 +698,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 644c88457714b..8c5bc4a33a303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 27ec1cfadd9d2..de12f2b246f57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index e16540fec0229..1a426096da197 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index ae41f4381251d..0f709b044f63a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
// True16 Operands cannot contain VGPR_32 (typically occurs during
// SIFixSGPRCopies). True16 instructions are always selected as VOP3
if (ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
    MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass)
  legalizeOpWithMove(MI, Idx);
This shouldn't really happen in the first place. legalizeOperands is mostly a hack because the DAG doesn't really know whether an operand is going to be SGPR or VGPR, and we need to enforce the constant bus restriction. Getting the correct 32- or 16-bit register class doesn't fit that pattern; you should be able to get this correct from the start.
Use proper subreg access when lowering SGPR to True16 VALU