Skip to content

Commit 5f08d52

Browse files
committed
fix moveToVALU in true16
1 parent 55d3a55 commit 5f08d52

File tree

10 files changed

+1874
-1028
lines changed

10 files changed

+1874
-1028
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7742,6 +7742,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77427742
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
77437743
return;
77447744
}
7745+
7746+
// In true16 mode, if this is a v2s copy from a vgpr16 source to an sgpr32,
7747+
// replace the VGPR copy with a SUBREG_TO_REG.
7748+
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
7749+
Inst.getOperand(1).getReg().isVirtual() &&
7750+
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7751+
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
7752+
if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
7753+
32 == RI.getRegSizeInBits(*NewDstRC)) {
7754+
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7755+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7756+
get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
7757+
.add(MachineOperand::CreateImm(0))
7758+
.add(Inst.getOperand(1))
7759+
.add(MachineOperand::CreateImm(AMDGPU::lo16));
7760+
Inst.eraseFromParent();
7761+
7762+
MRI.replaceRegWith(DstReg, NewDstReg);
7763+
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7764+
return;
7765+
}
7766+
}
7767+
77457768
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
77467769
MRI.replaceRegWith(DstReg, NewDstReg);
77477770
legalizeOperands(Inst, MDT);
@@ -7835,6 +7858,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78357858
assert(NewDstRC);
78367859
NewDstReg = MRI.createVirtualRegister(NewDstRC);
78377860
MRI.replaceRegWith(DstReg, NewDstReg);
7861+
7862+
// Check the instructions that use NewInstr's result. If a user is a true16
7863+
// instruction with a 16-bit operand, add a lo16 subreg access to fix the size mismatch.
7864+
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
7865+
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
7866+
E = MRI.use_end();
7867+
I != E; ++I) {
7868+
MachineInstr &UseMI = *I->getParent();
7869+
unsigned UseMIOpcode = UseMI.getOpcode();
7870+
if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
7871+
(16 ==
7872+
RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
7873+
I->setSubReg(AMDGPU::lo16);
7874+
}
7875+
}
7876+
}
78387877
}
78397878
fixImplicitOperands(*NewInstr);
78407879
// Legalize the operands

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 1753 additions & 940 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,13 +1101,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
11011101
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
11021102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
11031103
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
1104-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1105-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
1104+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1105+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
11061106
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1107-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
1108-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1107+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
1108+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
11091109
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1110-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1110+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
11111111
; GFX11-TRUE16-NEXT: ;;#ASMSTART
11121112
; GFX11-TRUE16-NEXT: ; use v0
11131113
; GFX11-TRUE16-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -914,13 +914,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
914914
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
915915
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
916916
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
917-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
918-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
917+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
918+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
919919
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
920-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
921-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
920+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
921+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
922922
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
923-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
923+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
924924
; GFX11-TRUE16-NEXT: ;;#ASMSTART
925925
; GFX11-TRUE16-NEXT: ; use v0
926926
; GFX11-TRUE16-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
259259
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
260260
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
261261
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
262-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
262+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
263263
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
264264
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
265-
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
265+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
266+
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
267+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
266268
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
267-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
268-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
269269
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
270270
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
271271
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
238238
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
239239
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
240240
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
241-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
241+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
242242
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
243243
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
244-
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
244+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
245+
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
246+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
245247
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
246-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
247-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
248248
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
249249
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
250250
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -736,43 +736,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
736736
; GFX12-TRUE16-LABEL: constant_load_v16i16_align2:
737737
; GFX12-TRUE16: ; %bb.0: ; %entry
738738
; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
739-
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, 0
739+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
740740
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
741741
; GFX12-TRUE16-NEXT: s_clause 0x7
742-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v9, s[0:1] offset:16
743-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:12
744-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v9, s[0:1] offset:8
745-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:4
746-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v9, s[0:1] offset:28
747-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:24
748-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v9, s[0:1] offset:20
749-
; GFX12-TRUE16-NEXT: global_load_d16_b16 v8, v9, s[0:1]
750-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6
751-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
752-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5
753-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
742+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:28
743+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:24
744+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:20
745+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] offset:16
746+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:12
747+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:8
748+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:4
749+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1]
750+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
751+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
752+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
753+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
754+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
755+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
756+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
757+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
758+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
759+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
760+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
761+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
762+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
763+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
764+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
765+
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
754766
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4
755-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
756-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x3
757-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
758-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2
759-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h
760-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
761-
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
762-
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l
763-
; GFX12-TRUE16-NEXT: s_clause 0x7
764-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v9, s[0:1] offset:30
765-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v9, s[0:1] offset:26
766-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v9, s[0:1] offset:22
767-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:18
768-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v9, s[0:1] offset:14
769-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v9, s[0:1] offset:10
770-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:6
771-
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:2
767+
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
772768
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
773-
; GFX12-TRUE16-NEXT: s_clause 0x1
774769
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off
775-
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
776770
; GFX12-TRUE16-NEXT: s_endpgm
777771
;
778772
; GFX12-FAKE16-LABEL: constant_load_v16i16_align2:

llvm/test/CodeGen/AMDGPU/select.f16.ll

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -880,17 +880,17 @@ define amdgpu_kernel void @select_v2f16(
880880
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
881881
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
882882
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
883-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
883+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
884884
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
885-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
885+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
886886
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
887-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, vcc_lo
887+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
888888
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
889-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v6.l, s0
890-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
889+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
890+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
891891
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
892892
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
893-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
893+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
894894
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
895895
; GFX11-TRUE16-NEXT: s_endpgm
896896
;
@@ -1066,17 +1066,17 @@ define amdgpu_kernel void @select_v2f16_imm_a(
10661066
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
10671067
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
10681068
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
1069-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1069+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
10701070
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1071-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1071+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
10721072
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
1073-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
1073+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
10741074
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1075-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0
1076-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1075+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
1076+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
10771077
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
10781078
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1079-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1079+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
10801080
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
10811081
; GFX11-TRUE16-NEXT: s_endpgm
10821082
;
@@ -1245,17 +1245,17 @@ define amdgpu_kernel void @select_v2f16_imm_b(
12451245
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
12461246
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
12471247
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
1248-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1248+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
12491249
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1250-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1250+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
12511251
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
1252-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
1252+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
12531253
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1254-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0
1255-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1254+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
1255+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
12561256
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
12571257
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1258-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1258+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
12591259
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
12601260
; GFX11-TRUE16-NEXT: s_endpgm
12611261
;
@@ -1428,15 +1428,15 @@ define amdgpu_kernel void @select_v2f16_imm_c(
14281428
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
14291429
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
14301430
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1431-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1431+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
14321432
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14331433
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
1434-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
1435-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0
1434+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
1435+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
14361436
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1437-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1437+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
14381438
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1439-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1439+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
14401440
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
14411441
; GFX11-TRUE16-NEXT: s_endpgm
14421442
;
@@ -1609,15 +1609,15 @@ define amdgpu_kernel void @select_v2f16_imm_d(
16091609
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
16101610
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
16111611
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1612-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1612+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
16131613
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
16141614
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
1615-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
1616-
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0
1615+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
1616+
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
16171617
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1618-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
1618+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
16191619
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1620-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1620+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
16211621
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
16221622
; GFX11-TRUE16-NEXT: s_endpgm
16231623
;

0 commit comments

Comments
 (0)