Skip to content

Commit dd1d41f

Browse files
authored
[AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 (#132089)
There are V2S copies between vgpr16 and sgpr32 in true16 mode. This happens because both vgpr16 and sgpr32 are selectable for a 16-bit source in ISel. When a V2S copy and its useMI are lowered to VALU, this patch: 1. Checks whether the newly generated VALU instruction is used by a true16 instruction, and adds a lo16 subreg access if the operand sizes mismatch. 2. Legalizes the V2S copy by replacing it with SUBREG_TO_REG. An example MIR looks like: ``` %2:sgpr_32 = COPY %1:vgpr_16 %3:sgpr_32 = S_OR_B32 %2:sgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3:sgpr_32, ... ``` This is currently lowered to: ``` %2:vgpr_32 = COPY %1:vgpr_16 %3:vgpr_32 = V_OR_B32 %2:vgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3:vgpr_32, ... ``` After this patch it is lowered to: ``` %2:vgpr_32 = SUBREG_TO_REG 0, %1:vgpr_16, lo16 %3:vgpr_32 = V_OR_B32 %2:vgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3.lo16:vgpr_32, ... ```
1 parent 4b19db6 commit dd1d41f

10 files changed

+137
-110
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7744,6 +7744,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77447744
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
77457745
return;
77467746
}
7747+
7748+
// If this is a v2s copy src from vgpr16 to sgpr32,
7749+
// replace vgpr copy to subreg_to_reg
7750+
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
7751+
Inst.getOperand(1).getReg().isVirtual() &&
7752+
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7753+
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
7754+
if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
7755+
32 == RI.getRegSizeInBits(*NewDstRC)) {
7756+
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7757+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7758+
get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
7759+
.add(MachineOperand::CreateImm(0))
7760+
.add(Inst.getOperand(1))
7761+
.add(MachineOperand::CreateImm(AMDGPU::lo16));
7762+
Inst.eraseFromParent();
7763+
7764+
MRI.replaceRegWith(DstReg, NewDstReg);
7765+
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7766+
return;
7767+
}
7768+
}
7769+
77477770
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
77487771
MRI.replaceRegWith(DstReg, NewDstReg);
77497772
legalizeOperands(Inst, MDT);
@@ -7837,6 +7860,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78377860
assert(NewDstRC);
78387861
NewDstReg = MRI.createVirtualRegister(NewDstRC);
78397862
MRI.replaceRegWith(DstReg, NewDstReg);
7863+
7864+
// Check useMI of NewInstr. If used by a true16 instruction,
7865+
// add a lo16 subreg access if size mismatched
7866+
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
7867+
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
7868+
E = MRI.use_end();
7869+
I != E; ++I) {
7870+
MachineInstr &UseMI = *I->getParent();
7871+
unsigned UseMIOpcode = UseMI.getOpcode();
7872+
if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
7873+
(16 ==
7874+
RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
7875+
I->setSubReg(AMDGPU::lo16);
7876+
}
7877+
}
7878+
}
78407879
}
78417880
fixImplicitOperands(*NewInstr);
78427881
// Legalize the operands
Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,35 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
3-
# XFAIL: *
4-
# FIXME-TRUE16 reenable after fix-sgpr-copies is updated for true16 flow
53

64
---
7-
name: cmp_f16
5+
name: cvt_hi_f32_f16
86
body: |
9-
bb.0.entry:
10-
; GCN-LABEL: name: cmp_f16
7+
bb.0:
8+
; GCN-LABEL: name: cvt_hi_f32_f16
119
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
12-
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
1310
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
14-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
15-
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY killed [[COPY]]
16-
; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
17-
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
11+
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
12+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
13+
; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
1814
%0:vgpr_16 = IMPLICIT_DEF
19-
%1:sreg_32 = IMPLICIT_DEF
20-
%2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
21-
%3:sreg_32 = COPY %2:vgpr_16
22-
nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
23-
%4:sreg_32_xm0_xexec = COPY $scc
24-
%5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
15+
%1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
16+
%2:sreg_32 = COPY %1:vgpr_16
17+
%3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
2518
...
2619

2720
---
28-
name: cvt_hi_f32_f16
21+
name: s_or_b32
2922
body: |
3023
bb.0:
31-
; GCN-LABEL: name: cvt_hi_f32_f16
24+
; GCN-LABEL: name: s_or_b32
3225
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
3326
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
34-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
35-
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
36-
; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
27+
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
28+
; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
29+
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
3730
%0:vgpr_16 = IMPLICIT_DEF
3831
%1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
3932
%2:sreg_32 = COPY %1:vgpr_16
40-
%3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
33+
%3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
34+
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
4135
...

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,13 +1093,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
10931093
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
10941094
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10951095
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
1096-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1097-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
1096+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1097+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
10981098
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1099-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
1100-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1099+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
1100+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
11011101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1102-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1102+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
11031103
; GFX11-TRUE16-NEXT: ;;#ASMSTART
11041104
; GFX11-TRUE16-NEXT: ; use v0
11051105
; GFX11-TRUE16-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -906,13 +906,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
906906
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
907907
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
908908
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
909-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
910-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
909+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
910+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
911911
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
912-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
913-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
912+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
913+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
914914
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
915-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
915+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
916916
; GFX11-TRUE16-NEXT: ;;#ASMSTART
917917
; GFX11-TRUE16-NEXT: ; use v0
918918
; GFX11-TRUE16-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
259259
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
260260
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
261261
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
262-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
262+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
263263
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
264264
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
265-
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
265+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
266+
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
267+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
266268
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
267-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
268-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
269269
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
270270
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
271271
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
238238
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
239239
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
240240
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
241-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
241+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
242242
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
243243
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
244-
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
244+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
245+
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
246+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
245247
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
246-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
247-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
248248
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
249249
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
250250
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -736,43 +736,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
736736
; GFX12-TRUE16-LABEL: constant_load_v16i16_align2:
737737
; GFX12-TRUE16: ; %bb.0: ; %entry
738738
; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
739-
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, 0
739+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
740740
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
741741
; GFX12-TRUE16-NEXT: s_clause 0x7
742-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v9, s[0:1] offset:16
743-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:12
744-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v9, s[0:1] offset:8
745-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:4
746-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v9, s[0:1] offset:28
747-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:24
748-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v9, s[0:1] offset:20
749-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v8, v9, s[0:1]
750-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6
751-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
752-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5
753-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
742+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:28
743+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:24
744+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:20
745+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] offset:16
746+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:12
747+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:8
748+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:4
749+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1]
750+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
751+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
752+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
753+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
754+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
755+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
756+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
757+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
758+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
759+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
760+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
761+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
762+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
763+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
764+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
765+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
754766
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4
755-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
756-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x3
757-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
758-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2
759-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h
760-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
761-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
762-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l
763-
; GFX12-TRUE16-NEXT: s_clause 0x7
764-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v9, s[0:1] offset:30
765-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v9, s[0:1] offset:26
766-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v9, s[0:1] offset:22
767-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:18
768-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v9, s[0:1] offset:14
769-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v9, s[0:1] offset:10
770-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:6
771-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:2
767+
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
772768
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
773-
; GFX12-TRUE16-NEXT: s_clause 0x1
774769
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off
775-
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
776770
; GFX12-TRUE16-NEXT: s_endpgm
777771
;
778772
; GFX12-FAKE16-LABEL: constant_load_v16i16_align2:

0 commit comments

Comments
 (0)