[AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 #131859
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

Use proper subreg access when lowering SGPR patterns to VALU.

Full diff: https://github.com/llvm/llvm-project/pull/131859.diff

7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 4342e7a369c13..951b84ca01405 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -257,7 +257,13 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
return false;
}
// Change VGPR to SGPR destination.
- MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+ const auto *RC = MRI.getRegClass(DstReg);
+ const auto *TRC = TRI->getEquivalentSGPRClass(RC);
+ // 16-bit SGPRs are not legal operands in True16 instructions. Convert them to
+ // 32-bit SGPRs
+ if (RC == &AMDGPU::VGPR_16RegClass)
+ TRC = &AMDGPU::SGPR_32RegClass;
+ MRI.setRegClass(DstReg, TRC);
return true;
}
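Read in isolation, the SIFixSGPRCopies.cpp hunk above amounts to the following class selection. This is a minimal sketch, not part of the patch: the helper name is hypothetical, and it assumes the declarations already available inside SIFixSGPRCopies.cpp.

// Hypothetical helper: choose the SGPR class when converting a VGPR
// destination to SGPR. A 16-bit VGPR has no legal 16-bit SGPR counterpart
// as a True16 instruction operand, so it is widened to SGPR_32.
static const TargetRegisterClass *
getSGPRClassForVGPRDst(const MachineRegisterInfo &MRI,
                       const SIRegisterInfo *TRI, Register DstReg) {
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  if (RC == &AMDGPU::VGPR_16RegClass)
    return &AMDGPU::SGPR_32RegClass;      // widen 16-bit VGPR to 32-bit SGPR
  return TRI->getEquivalentSGPRClass(RC); // same-width SGPR class otherwise
}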
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fb791c8342282..5b085e10a5ba5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6297,6 +6297,12 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
continue;
}
+ // True16 Operands cannot contain VGPR_32 (typically occurs during
+ // SIFixSGPRCopies). True16 instructions are always selected as VOP3
+ if (ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
+ MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass)
+ legalizeOpWithMove(MI, Idx);
+
if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
continue; // VGPRs are legal
@@ -8632,7 +8638,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
break;
}
- if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
+ if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)) ||
+ AMDGPU::isTrue16Inst(UseMI.getOpcode())) {
Worklist.insert(&UseMI);
do {
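Both SIInstrInfo.cpp hunks key off the same question: is this a True16 instruction that still carries a 32-bit VGPR operand left behind by SIFixSGPRCopies? Below is a minimal sketch of that predicate, reusing only calls that appear in the diff; the helper name is hypothetical and it assumes the SIInstrInfo.cpp context.

// Hypothetical helper: a True16 instruction operand that still lives in
// VGPR_32 must be copied into a legal register via legalizeOpWithMove().
static bool needsTrue16OperandFixup(const GCNSubtarget &ST,
                                    const MachineRegisterInfo &MRI,
                                    unsigned Opc, const MachineOperand &MO) {
  return ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
         MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass;
}

The addUsersToMoveToVALUWorklist hunk applies the same isTrue16Inst() check so that True16 users are queued for legalization even when they already use vector registers.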
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 5ea39997938ad..b0991d3eb6034 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -698,9 +698,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 644c88457714b..8c5bc4a33a303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 27ec1cfadd9d2..de12f2b246f57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index e16540fec0229..1a426096da197 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index ae41f4381251d..0f709b044f63a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
// True16 Operands cannot contain VGPR_32 (typically occurs during
// SIFixSGPRCopies). True16 instructions are always selected as VOP3
if (ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
    MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass)
  legalizeOpWithMove(MI, Idx);
This shouldn't really happen in the first place. legalizeOperands is mostly a hack because the DAG doesn't really know whether an operand is going to be SGPR or VGPR, and we need to enforce the constant bus restriction. Getting the correct 32- or 16-bit register class doesn't fit that pattern; you should be able to get this correct from the start.
Use proper subreg access when lowering SGPR to True16 VALU